diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index e095bc5..997401b 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -10,12 +10,40 @@ Discription: Environment: ''' import torch.optim as optim -from A2C.model import ActorCritic +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Categorical + +class ActorCritic(nn.Module): + ''' A2C网络模型,包含一个Actor和Critic + ''' + def __init__(self, input_dim, output_dim, hidden_dim): + super(ActorCritic, self).__init__() + self.critic = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + + self.actor = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, output_dim), + nn.Softmax(dim=1), + ) + + def forward(self, x): + value = self.critic(x) + probs = self.actor(x) + dist = Categorical(probs) + return dist, value class A2C: - def __init__(self,n_states,n_actions,cfg) -> None: + ''' A2C算法 + ''' + def __init__(self,state_dim,action_dim,cfg) -> None: self.gamma = cfg.gamma self.device = cfg.device - self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device) + self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device) self.optimizer = optim.Adam(self.model.parameters()) def compute_returns(self,next_value, rewards, masks): diff --git a/codes/A2C/model.py b/codes/A2C/model.py deleted file mode 100644 index 473bcb2..0000000 --- a/codes/A2C/model.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-05-03 21:38:54 -LastEditor: JiangJi -LastEditTime: 2021-05-03 21:40:06 -Discription: -Environment: -''' -import torch.nn as nn -import torch.nn.functional as F -from torch.distributions import Categorical -class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim): - super(ActorCritic, self).__init__() - - self.critic = nn.Sequential( - nn.Linear(n_states, 
hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, 1) - ) - - self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), - nn.ReLU(), - nn.Linear(hidden_dim, n_actions), - nn.Softmax(dim=1), - ) - - def forward(self, x): - value = self.critic(x) - probs = self.actor(x) - dist = Categorical(probs) - return dist, value \ No newline at end of file diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy new file mode 100644 index 0000000..6537afd Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy differ diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy new file mode 100644 index 0000000..56f779b Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy differ diff --git a/codes/A2C/task0_train.ipynb b/codes/A2C/task0.ipynb similarity index 100% rename from codes/A2C/task0_train.ipynb rename to codes/A2C/task0.ipynb diff --git a/codes/A2C/task0_train.py b/codes/A2C/task0.py similarity index 86% rename from codes/A2C/task0_train.py rename to codes/A2C/task0.py index 5927048..fd54d87 100644 --- a/codes/A2C/task0_train.py +++ b/codes/A2C/task0.py @@ -1,7 +1,8 @@ -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径sys.path +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 import gym import numpy as np @@ -9,15 +10,18 @@ import torch import torch.optim as optim import datetime from common.multiprocessing_env import SubprocVecEnv -from A2C.model import ActorCritic +from A2C.agent import ActorCritic from common.utils 
import save_results, make_dir -from common.plot import plot_rewards +from common.utils import plot_rewards + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'A2C' # 算法名称 +env_name = 'CartPole-v0' # 环境名称 -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time class A2CConfig: def __init__(self) -> None: - self.algo='A2C' # 算法名称 - self.env_name= 'CartPole-v0' # 环境名称 + self.algo_name = algo_name# 算法名称 + self.env_name = env_name # 环境名称 self.n_envs = 8 # 异步的环境数目 self.gamma = 0.99 # 强化学习中的折扣因子 self.hidden_dim = 256 @@ -27,10 +31,9 @@ class A2CConfig: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") class PlotConfig: def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ '/'+curr_time+'/results/' # 保存结果的路径 self.model_path = curr_path+"/outputs/" + self.env_name + \ @@ -67,6 +70,8 @@ def compute_returns(next_value, rewards, masks, gamma=0.99): def train(cfg,envs): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') env = gym.make(cfg.env_name) # a single env env.seed(10) state_dim = envs.observation_space.shape[0] @@ -119,6 +124,7 @@ def train(cfg,envs): optimizer.zero_grad() loss.backward() optimizer.step() + print('完成训练!') return test_rewards, test_ma_rewards if __name__ == "__main__": cfg = A2CConfig() diff --git a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 528872e..6ec2eef 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -9,15 +9,68 @@ LastEditTime: 2021-09-16 00:55:30 @Discription: @Environment: python 3.7.7 ''' +import random import numpy as np import torch import torch.nn as nn import torch.optim as optim - -from common.model import Actor, Critic -from common.memory import 
ReplayBuffer - - +import torch.nn.functional as F +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) +class Actor(nn.Module): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): + super(Actor, self).__init__() + self.linear1 = nn.Linear(state_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, action_dim) + + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, x): + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = torch.tanh(self.linear3(x)) + return x +class Critic(nn.Module): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): + super(Critic, self).__init__() + + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) + # 随机初始化为较小的值 + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, state, action): + # 按维数1拼接 + x = torch.cat([state, action], 1) + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = self.linear3(x) + return x class DDPG: def __init__(self, state_dim, action_dim, cfg): self.device = cfg.device diff --git a/codes/DDPG/env.py 
b/codes/DDPG/env.py index 99da3c5..92fe482 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -16,12 +16,10 @@ class NormalizedActions(gym.ActionWrapper): ''' 将action范围重定在[0.1]之间 ''' def action(self, action): - low_bound = self.action_space.low upper_bound = self.action_space.high action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) action = np.clip(action, low_bound, upper_bound) - return action def reverse_action(self, action): diff --git a/codes/DDPG/task0.py b/codes/DDPG/task0.py new file mode 100644 index 0000000..81fa9a6 --- /dev/null +++ b/codes/DDPG/task0.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +@Author: John +@Email: johnjim0816@gmail.com +@Date: 2020-06-11 20:58:21 +@LastEditor: John +LastEditTime: 2021-09-16 01:31:33 +@Discription: +@Environment: python 3.7.7 +''' +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径sys.path + +import datetime +import gym +import torch + +from DDPG.env import NormalizedActions +from DDPG.agent import DDPG +from DDPG.train import train,test +from common.utils import save_results,make_dir +from common.utils import plot_rewards + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'DDPG' # 算法名称 +env_name = 'Pendulum-v1' # 环境名称,gym新版本(约0.21.0之后)中Pendulum-v0改为Pendulum-v1 + +class DDPGConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 300 # 训练的回合数 + self.test_eps = 50 # 测试的回合数 + self.gamma = 0.99 # 折扣因子 + self.critic_lr = 1e-3 # 评论家网络的学习率 + self.actor_lr = 1e-4 # 演员网络的学习率 + self.memory_capacity = 8000 # 经验回放的容量 + self.batch_size = 128 # mini-batch SGD中的批量大小 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层维度 + self.soft_tau = 1e-2 # 软更新参数 + +class PlotConfig: + 
def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.result_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/results/' # 保存结果的路径 + self.model_path = curr_path+"/outputs/" + self.env_name + \ + '/'+curr_time+'/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + +def env_agent_config(cfg,seed=1): + env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 + env.seed(seed) # 随机种子 + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = DDPG(state_dim,action_dim,cfg) + return env,agent + +cfg = DDPGConfig() +plot_cfg = PlotConfig() +# 训练 +env,agent = env_agent_config(cfg,seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) +agent.save(path=plot_cfg.model_path) +save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env,agent = env_agent_config(cfg,seed=10) +agent.load(path=plot_cfg.model_path) +rewards,ma_rewards = test(plot_cfg,env,agent) +save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path) +plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + diff --git a/codes/DDPG/task0_train.py b/codes/DDPG/task0_train.py deleted file mode 100644 index ea76661..0000000 --- a/codes/DDPG/task0_train.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 20:58:21 -@LastEditor: John -LastEditTime: 2021-09-16 01:31:33 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径sys.path - -import datetime -import gym -import torch - -from DDPG.env import NormalizedActions, OUNoise 
-from DDPG.agent import DDPG -from common.utils import save_results,make_dir -from common.plot import plot_rewards - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 - -class DDPGConfig: - def __init__(self): - self.algo = 'DDPG' # 算法名称 - self.env_name = 'Pendulum-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 300 # 训练的回合数 - self.eval_eps = 50 # 测试的回合数 - self.gamma = 0.99 # 折扣因子 - self.critic_lr = 1e-3 # 评论家网络的学习率 - self.actor_lr = 1e-4 # 演员网络的学习率 - self.memory_capacity = 8000 # 经验回放的容量 - self.batch_size = 128 # mini-batch SGD中的批量大小 - self.target_update = 2 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层维度 - self.soft_tau = 1e-2 # 软更新参数 - -class PlotConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - -def env_agent_config(cfg,seed=1): - env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 - env.seed(seed) # 随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states,n_actions,cfg) - return env,agent - -def train(cfg, env, agent): - print('开始训练!') - print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}') - ou_noise = OUNoise(env.action_space) # 动作噪声 - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.train_eps): - state = env.reset() - ou_noise.reset() - done = False - ep_reward = 0 - i_step = 0 - while not done: - i_step += 1 - action = agent.choose_action(state) - action = ou_noise.get_action(action, i_step) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, 
next_state, done) - agent.update() - state = next_state - if (i_ep+1)%10 == 0: - print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('完成训练!') - return rewards, ma_rewards - -def eval(cfg, env, agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.eval_eps): - state = env.reset() - done = False - ep_reward = 0 - i_step = 0 - while not done: - i_step += 1 - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - state = next_state - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('完成测试!') - return rewards, ma_rewards - - -if __name__ == "__main__": - cfg = DDPGConfig() - plot_cfg = PlotConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) - agent.save(path=plot_cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=plot_cfg.model_path) - rewards,ma_rewards = eval(plot_cfg,env,agent) - save_results(rewards,ma_rewards,tag = 'eval',path = cfg.result_path) - plot_rewards(rewards,ma_rewards,plot_cfg,tag = "eval") - diff --git a/codes/DDPG/train.py b/codes/DDPG/train.py new file mode 100644 index 0000000..4cdfa9d --- /dev/null +++ b/codes/DDPG/train.py @@ -0,0 +1,64 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 
+sys.path.append(parent_path) # 添加路径到系统路径 + +from DDPG.env import OUNoise + +def train(cfg, env, agent): + print('开始训练!') + print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}') + ou_noise = OUNoise(env.action_space) # 动作噪声 + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + ou_noise.reset() + done = False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + action = ou_noise.get_action(action, i_step) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) + agent.update() + state = next_state + if (i_ep+1)%10 == 0: + print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg, env, agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): + state = env.reset() + done = False + ep_reward = 0 + i_step = 0 + while not done: + i_step += 1 + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + state = next_state + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards, ma_rewards \ No newline at end of file diff --git a/codes/DQN/agent.py b/codes/DQN/dqn.py similarity index 63% rename from codes/DQN/agent.py rename to codes/DQN/dqn.py index 27845d2..4a4dfc4 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN/dqn.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com 
@Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-09-15 13:35:36 +LastEditTime: 2021-12-22 14:01:37 @Discription: @Environment: python 3.7.7 ''' @@ -14,16 +14,57 @@ LastEditTime: 2021-09-15 13:35:36 import torch import torch.nn as nn +import torch.nn.functional as F import torch.optim as optim import random import math import numpy as np -from common.memory import ReplayBuffer -from common.model import MLP -class DQN: - def __init__(self, n_states, n_actions, cfg): - self.n_actions = n_actions # 总的动作个数 +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): + """ 初始化q网络,为全连接网络 + state_dim: 输入的特征数即环境的状态维度 + action_dim: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 + + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) + +class DQN: + def __init__(self, state_dim, action_dim, cfg): + + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -32,8 +73,8 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. 
* frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 @@ -49,7 +90,7 @@ class DQN: q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 diff --git a/codes/DQN/dqn_cnn.py b/codes/DQN/dqn_cnn.py new file mode 100644 index 0000000..c14118f --- /dev/null +++ b/codes/DQN/dqn_cnn.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import torch.autograd as autograd +import random +import math +class CNN(nn.Module): + def __init__(self, input_dim, output_dim): + super(CNN, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + + self.features = nn.Sequential( + nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(32, 64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(64, 64, kernel_size=3, stride=1), + nn.ReLU() + ) + + self.fc = nn.Sequential( + nn.Linear(self.feature_size(), 512), + nn.ReLU(), + nn.Linear(512, self.output_dim) + ) + + def forward(self, x): + x = self.features(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + return x + + def feature_size(self): + return self.features(autograd.Variable(torch.zeros(1, *self.input_dim))).view(1, -1).size(1) + + + def act(self, state, epsilon): 
+ if random.random() > epsilon: + state = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True) + q_value = self.forward(state) + action = q_value.max(1)[1].data[0] + else: + action = random.randrange(env.action_space.n) + return action + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) + +class DQN: + def __init__(self, state_dim, action_dim, cfg): + + self.action_dim = action_dim # 总的动作个数 + self.device = cfg.device # 设备,cpu或gpu等 + self.gamma = cfg.gamma # 奖励的折扣因子 + # e-greedy策略相关参数 + self.frame_idx = 0 # 用于epsilon的衰减计数 + self.epsilon = lambda frame_idx: cfg.epsilon_end + \ + (cfg.epsilon_start - cfg.epsilon_end) * \ + math.exp(-1. 
* frame_idx / cfg.epsilon_decay) + self.batch_size = cfg.batch_size + self.policy_net = CNN(state_dim, action_dim).to(self.device) + self.target_net = CNN(state_dim, action_dim).to(self.device) + for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net + target_param.data.copy_(param.data) + self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 + self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放 + + def choose_action(self, state): + ''' 选择动作 + ''' + self.frame_idx += 1 + if random.random() > self.epsilon(self.frame_idx): + with torch.no_grad(): + state = torch.tensor([state], device=self.device, dtype=torch.float32) + q_values = self.policy_net(state) + action = q_values.max(1)[1].item() # 选择Q值最大的动作 + else: + action = random.randrange(self.action_dim) + return action + def update(self): + if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 + return + # 从经验回放中(replay memory)中随机采样一个批量的转移(transition) + state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( + self.batch_size) + # 转为张量 + state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float) + action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) + reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) + next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32(done_batch), device=self.device) + q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a) + next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值 + # 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward + expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch) + loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失 + # 优化更新模型 + self.optimizer.zero_grad() + 
loss.backward() + for param in self.policy_net.parameters(): # clip防止梯度爆炸 + param.grad.data.clamp_(-1, 1) + self.optimizer.step() + + def save(self, path): + torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth') + + def load(self, path): + self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth')) + for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): + param.data.copy_(target_param.data) \ No newline at end of file diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth deleted file mode 100644 index a0b6ef9..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png deleted file mode 100644 index a260f79..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy deleted file mode 100644 index 1e0ab6c..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png deleted file mode 100644 index 4c14b8d..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth new file mode 100644 index 0000000..7fcf736 Binary 
files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth differ diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png new file mode 100644 index 0000000..bc60080 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy new file mode 100644 index 0000000..d81acd2 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy similarity index 55% rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy index 88c137f..900914d 100644 Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy differ diff 
--git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png new file mode 100644 index 0000000..9df7664 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png differ diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py new file mode 100644 index 0000000..c7cd5da --- /dev/null +++ b/codes/DQN/task0.py @@ -0,0 +1,148 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +import numpy as np +from common.utils import save_results, make_dir +from common.utils import plot_rewards +from DQN.dqn import DQN + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + + +class Config: + '''超参数 + ''' + + def __init__(self): + ################################## 环境超参数 ################################### + self.algo_name = 'DQN' # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十 + self.seed = 10 # 随机种子,置0则不设置随机种子 + self.train_eps = 200 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + ################################################################################ + + ################################## 算法超参数 ################################### + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 + ################################################################################ + + ################################# 保存结果相关参数 
############################## + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + ################################################################################ + + +def env_agent_config(cfg): + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env_name) # 创建环境 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + if cfg.seed !=0: # 设置随机种子 + torch.manual_seed(cfg.seed) + env.seed(cfg.seed) + np.random.seed(cfg.seed) + return env, agent + + +def train(cfg, env, agent): + ''' 训练 + ''' + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + agent.memory.push(state, action, reward, + next_state, done) # 保存transition + state = next_state # 更新下一个状态 + agent.update() # 更新智能体 + ep_reward += reward # 累加奖励 + if done: + break + if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新 + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward) + else: + ma_rewards.append(ep_reward) + if (i_ep + 1) % 10 == 0: + print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward)) + print('完成训练!') + env.close() + return rewards, ma_rewards + + +def test(cfg, env, agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ############### + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # 
e-greedy策略中的终止epsilon + ################################################################################ + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + state = next_state # 更新下一个状态 + ep_reward += reward # 累加奖励 + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + env.close() + return rewards, ma_rewards + + +if __name__ == "__main__": + cfg = Config() + # 训练 + env, agent = env_agent_config(cfg) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg) + agent.load(path=cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task0_train.ipynb b/codes/DQN/task0_train.ipynb deleted file mode 100644 index 464e216..0000000 --- a/codes/DQN/task0_train.ipynb +++ /dev/null @@ -1,423 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute()) # 当前路径\n", - "parent_path = str(Path().absolute().parent) # 父路径\n", - "sys.path.append(parent_path) # 添加路径到系统路径\n", - "\n", - "import math,random\n", - "import gym\n", - "import torch\n", - "import torch.nn as nn\n", - 
"import torch.optim as optim\n", - "import torch.nn.functional as F\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "from IPython.display import clear_output # 清空单元格输出区域" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 网络模型" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "class MLP(nn.Module):\n", - " def __init__(self, n_states,n_actions,hidden_dim=128):\n", - " \"\"\" 初始化q网络,为全连接网络\n", - " n_states: 输入的特征数即环境的状态数\n", - " n_actions: 输出的动作维度\n", - " \"\"\"\n", - " super(MLP, self).__init__()\n", - " self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层\n", - " self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层\n", - " self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层\n", - " \n", - " def forward(self, x):\n", - " # 各层对应的激活函数\n", - " x = F.relu(self.fc1(x)) \n", - " x = F.relu(self.fc2(x))\n", - " return self.fc3(x)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 经验回放" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "class ReplayBuffer:\n", - " def __init__(self, capacity):\n", - " self.capacity = capacity # 经验回放的容量\n", - " self.buffer = [] # 缓冲区\n", - " self.position = 0 \n", - " \n", - " def push(self, state, action, reward, next_state, done):\n", - " ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)\n", - " '''\n", - " if len(self.buffer) < self.capacity:\n", - " self.buffer.append(None)\n", - " self.buffer[self.position] = (state, action, reward, next_state, done)\n", - " self.position = (self.position + 1) % self.capacity \n", - " \n", - " def sample(self, batch_size):\n", - " batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移\n", - " state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等\n", - " return state, action, reward, next_state, done\n", - " \n", - " def __len__(self):\n", - " ''' 返回当前存储的量\n", - " '''\n", - " 
return len(self.buffer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DQN" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "class DQN:\n", - " def __init__(self, n_states, n_actions, cfg):\n", - "\n", - " self.n_actions = n_actions # 总的动作个数\n", - " self.device = cfg.device # 设备,cpu或gpu等\n", - " self.gamma = cfg.gamma # 奖励的折扣因子\n", - " # e-greedy策略相关参数\n", - " self.frame_idx = 0 # 用于epsilon的衰减计数\n", - " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", - " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", - " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", - " self.batch_size = cfg.batch_size\n", - " self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net\n", - " target_param.data.copy_(param.data)\n", - " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", - " self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放\n", - "\n", - " def choose_action(self, state):\n", - " ''' 选择动作\n", - " '''\n", - " self.frame_idx += 1\n", - " if random.random() > self.epsilon(self.frame_idx):\n", - " with torch.no_grad():\n", - " state = torch.tensor([state], device=self.device, dtype=torch.float32)\n", - " q_values = self.policy_net(state)\n", - " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", - " else:\n", - " action = random.randrange(self.n_actions)\n", - " return action\n", - " def update(self):\n", - " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略\n", - " return\n", - " # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)\n", - " state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(\n", - " self.batch_size)\n", - " # 转为张量\n", - " state_batch = 
torch.tensor(state_batch, device=self.device, dtype=torch.float)\n", - " action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1) \n", - " reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float) \n", - " next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)\n", - " done_batch = torch.tensor(np.float32(done_batch), device=self.device)\n", - " q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)\n", - " next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值\n", - " # 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward\n", - " expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)\n", - " loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失\n", - " # 优化更新模型\n", - " self.optimizer.zero_grad() \n", - " loss.backward()\n", - " for param in self.policy_net.parameters(): # clip防止梯度爆炸\n", - " param.grad.data.clamp_(-1, 1)\n", - " self.optimizer.step()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DQN参数" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "class DQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"DQN\" # 算法名称\n", - " self.env = 'CartPole-v0' # 环境名称\n", - " self.train_eps = 200 # 训练的回合数\n", - " self.eval_eps = 20 # 测试的回合数\n", - " self.gamma = 0.95 # 强化学习中的折扣因子\n", - " self.epsilon_start = 0.90 # e-greedy策略中初始epsilon\n", - " self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n", - " self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率\n", - " self.lr = 0.0001 # 学习率\n", - " self.memory_capacity = 100000 # 经验回放的容量\n", - " self.batch_size = 64 # mini-batch SGD中的批量大小\n", - " self.target_update = 4 # 目标网络的更新频率\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测GPU\n", - " self.hidden_dim = 256 # 网络隐藏层" - ] - }, - { 
- "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 创建环境" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def env_agent_config(cfg,seed=1):\n", - " ''' 创建环境和智能体\n", - " '''\n", - " env = gym.make(cfg.env) # 创建环境\n", - " env.seed(seed) # 设置随机种子\n", - " n_states = env.observation_space.shape[0] # 状态数\n", - " n_actions = env.action_space.n # 动作数\n", - " agent = DQN(n_states,n_actions,cfg) # 创建智能体\n", - " return env,agent" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 训练" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "开始训练!\n", - "环境:CartPole-v0, 算法:DQN, 设备:cuda\n", - "回合:10/200, 奖励:12.0\n", - "回合:20/200, 奖励:16.0\n", - "回合:30/200, 奖励:15.0\n", - "回合:40/200, 奖励:14.0\n", - "回合:50/200, 奖励:13.0\n", - "回合:60/200, 奖励:27.0\n", - "回合:70/200, 奖励:36.0\n", - "回合:80/200, 奖励:33.0\n", - "回合:90/200, 奖励:200.0\n", - "回合:100/200, 奖励:200.0\n", - "回合:110/200, 奖励:200.0\n", - "回合:120/200, 奖励:200.0\n", - "回合:130/200, 奖励:200.0\n", - "回合:140/200, 奖励:200.0\n", - "回合:150/200, 奖励:200.0\n", - "回合:160/200, 奖励:200.0\n", - "回合:170/200, 奖励:200.0\n", - "回合:180/200, 奖励:200.0\n", - "回合:190/200, 奖励:200.0\n", - "回合:200/200, 奖励:200.0\n", - "完成训练!\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def train(cfg, env, agent):\n", - " ''' 训练\n", - " '''\n", - " print('开始训练!')\n", - " print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n", - " rewards = [] # 记录所有回合的奖励\n", - " ma_rewards = [] # 记录所有回合的滑动平均奖励\n", - " for i_ep in range(cfg.train_eps):\n", - " ep_reward = 0 # 记录一回合内的奖励\n", - " state = env.reset() # 重置环境,返回初始状态\n", - " while True:\n", - " action = agent.choose_action(state) # 选择动作\n", - " next_state, reward, done, _ = env.step(action) # 更新环境,返回transition\n", - " agent.memory.push(state, action, reward, next_state, done) # 保存transition\n", - " state = next_state # 更新下一个状态\n", - " agent.update() # 更新智能体\n", - " ep_reward += reward # 累加奖励\n", - " if done:\n", - " break\n", - " if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新\n", - " agent.target_net.load_state_dict(agent.policy_net.state_dict())\n", - " if (i_ep+1)%10 == 0: \n", - " print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print('完成训练!')\n", - " return rewards, ma_rewards\n", - "\n", - "def plot_rewards(rewards,ma_rewards,plot_cfg):\n", - " # clear_output(True) # 清空单元格输出区域,因为多次打印,每次需要清楚前面打印的图片\n", - " sns.set() \n", - " plt.figure() # 创建一个图形实例,方便同时多画几个图\n", - " plt.title(\"learning curve on {} of {} for {}\".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env))\n", - " plt.xlabel('epsiodes')\n", - " plt.plot(rewards,label='rewards')\n", - " plt.plot(ma_rewards,label='ma rewards')\n", - " plt.legend()\n", - " plt.show()\n", - "\n", - "class PlotConfig:\n", - " def __init__(self) -> None:\n", - " self.algo = \"DQN\" # 算法名称\n", - " self.env = 'CartPole-v0' # 环境名称\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测GPU\n", - "\n", - "cfg = DQNConfig()\n", - "plot_cfg = 
PlotConfig()\n", - "env,agent = env_agent_config(cfg,seed=1)\n", - "rewards, ma_rewards = train(cfg, env, agent)\n", - "plot_rewards(rewards, ma_rewards, plot_cfg) # 画出结果" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "开始测试!\n", - "环境:CartPole-v0, 算法:DQN, 设备:cuda\n", - "回合:3/20, 奖励:200.0\n", - "回合:6/20, 奖励:200.0\n", - "回合:9/20, 奖励:200.0\n", - "回合:12/20, 奖励:200.0\n", - "回合:15/20, 奖励:200.0\n", - "回合:18/20, 奖励:200.0\n", - "完成测试!\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def eval(cfg,env,agent):\n", - " print('开始测试!')\n", - " print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')\n", - " # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0\n", - " cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon\n", - " cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n", - " rewards = [] # 记录所有回合的奖励\n", - " ma_rewards = [] # 记录所有回合的滑动平均奖励\n", - " for i_ep in range(cfg.eval_eps):\n", - " ep_reward = 0 # 记录一回合内的奖励\n", - " state = env.reset() # 重置环境,返回初始状态\n", - " while True:\n", - " action = agent.choose_action(state) # 选择动作\n", - " next_state, reward, done, _ = env.step(action) # 更新环境,返回transition\n", - " state = next_state # 更新下一个状态\n", - " ep_reward += reward # 累加奖励\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " if (i_ep+1)%3 == 0: \n", - " print(f\"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}\")\n", - " print('完成测试!')\n", - " return rewards,ma_rewards\n", - "\n", - "rewards,ma_rewards = eval(cfg,env,agent)\n", - "plot_rewards(rewards,ma_rewards, plot_cfg) # 画出结果\n" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "fe38df673a99c62a9fea33a7aceda74c9b65b12ee9d076c5851d98b692a4989a" - }, - "kernelspec": { - "display_name": "Python 3.7.10 64-bit ('py37': conda)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - }, - "metadata": { - "interpreter": { - "hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232" - } - }, - "orig_nbformat": 2 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/codes/DQN/task0_train.py b/codes/DQN/task0_train.py 
deleted file mode 100644 index 5fd0ccd..0000000 --- a/codes/DQN/task0_train.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-12 00:48:57 -@LastEditor: John -LastEditTime: 2021-09-15 15:34:13 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime - -from common.utils import save_results, make_dir -from common.plot import plot_rewards -from DQN.agent import DQN - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -class DQNConfig: - def __init__(self): - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 200 # 训练的回合数 - self.eval_eps = 30 # 测试的回合数 - # 超参数 - self.gamma = 0.95 # 强化学习中的折扣因子 - self.epsilon_start = 0.90 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 - self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # 经验回放的容量 - self.batch_size = 64 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层 -class PlotConfig: - def __init__(self) -> None: - self.algo = "DQN" # 算法名称 - self.env_name = 'CartPole-v0' # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - -def env_agent_config(cfg,seed=1): - ''' 创建环境和智能体 - ''' - env = gym.make(cfg.env_name) # 创建环境 - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态数 - n_actions = 
env.action_space.n # 动作数 - agent = DQN(n_states,n_actions,cfg) # 创建智能体 - return env,agent - -def train(cfg, env, agent): - ''' 训练 - ''' - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - agent.memory.push(state, action, reward, next_state, done) # 保存transition - state = next_state # 更新下一个状态 - agent.update() # 更新智能体 - ep_reward += reward # 累加奖励 - if done: - break - if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - if (i_ep+1)%10 == 0: - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('完成训练!') - return rewards, ma_rewards - -def eval(cfg,env,agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') - # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 - cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon - cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.eval_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - state = next_state # 更新下一个状态 - ep_reward += reward # 累加奖励 - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}") - print('完成测试!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DQNConfig() - plot_cfg = 
PlotConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=plot_cfg.model_path) # 保存模型 - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=plot_cfg.model_path) # 导入模型 - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards,ma_rewards, plot_cfg, tag="eval") # 画出结果 diff --git a/codes/DQN/task1.py b/codes/DQN/task1.py new file mode 100644 index 0000000..078aa4c --- /dev/null +++ b/codes/DQN/task1.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-12-22 11:14:17 +LastEditor: JiangJi +LastEditTime: 2021-12-22 11:40:44 +Discription: 使用 Nature DQN 训练 CartPole-v1 +''' +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.utils import save_results, make_dir +from common.utils import plot_rewards, plot_rewards_cn +from DQN.dqn import DQN + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = "DQN" # 算法名称 +env_name = 'CartPole-v1' # 环境名称 +class DQNConfig: + ''' 算法相关参数设置 + ''' + + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + # 超参数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # 
e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + +def env_agent_config(cfg, seed=1): + ''' 创建环境和智能体 + ''' + env = gym.make(cfg.env_name) # 创建环境 + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + return env, agent + +def train(cfg, env, agent): + ''' 训练 + ''' + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + agent.memory.push(state, action, reward, next_state, done) # 保存transition + state = next_state # 更新下一个状态 + agent.update() # 更新智能体 + ep_reward += reward # 累加奖励 + if done: + break + if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + if (i_ep+1)%10 == 0: + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg,env,agent): + print('开始测试!') 
+ print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + state = next_state # 更新下一个状态 + ep_reward += reward # 累加奖励 + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards,ma_rewards +if __name__ == "__main__": + cfg = DQNConfig() + plot_cfg = PlotConfig() + # 训练 + env, agent = env_agent_config(cfg, seed=1) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 + plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg, seed=10) + agent.load(path=plot_cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', + path=plot_cfg.result_path) # 保存结果 + plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task2.py b/codes/DQN/task2.py new file mode 100644 index 0000000..16571b2 --- /dev/null +++ b/codes/DQN/task2.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-12-22 11:14:17 +LastEditor: JiangJi +LastEditTime: 2021-12-22 15:27:48 +Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4 +''' +import sys +import os +curr_path = 
os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime +from common.utils import save_results, make_dir +from common.utils import plot_rewards, plot_rewards_cn +from common.atari_wrappers import make_atari, wrap_deepmind +from DQN.dqn import DQN + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'DQN-cnn' # 算法名称 +env_name = 'PongNoFrameskip-v4' # 环境名称 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU +class DQNConfig: + ''' 算法相关参数设置 + ''' + + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = device # 检测GPU + self.train_eps = 500 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + # 超参数 + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.90 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 4 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = device # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + + +def env_agent_config(cfg, seed=1): + ''' 创建环境和智能体 + ''' + env = make_atari(cfg.env_name) # 创建环境 + # env = wrap_deepmind(env) + # env = wrap_pytorch(env) + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + return env, agent + +def train(cfg, env, agent): + 
''' 训练 + ''' + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + agent.memory.push(state, action, reward, next_state, done) # 保存transition + state = next_state # 更新下一个状态 + agent.update() # 更新智能体 + ep_reward += reward # 累加奖励 + if done: + break + if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 + agent.target_net.load_state_dict(agent.policy_net.state_dict()) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + if (i_ep+1)%10 == 0: + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg,env,agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + state = next_state # 更新下一个状态 + ep_reward += reward # 累加奖励 + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards,ma_rewards +if __name__ == "__main__": + cfg = DQNConfig() + plot_cfg = PlotConfig() + # 训练 + env, agent = env_agent_config(cfg, seed=1) + rewards, ma_rewards = train(cfg, env, agent) + 
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 + plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg, seed=10) + agent.load(path=plot_cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', + path=plot_cfg.result_path) # 保存结果 + plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/Docs/使用DDPG解决倒立摆问题.md b/codes/Docs/使用DDPG解决倒立摆问题.md index da815dc..fd625f5 100644 --- a/codes/Docs/使用DDPG解决倒立摆问题.md +++ b/codes/Docs/使用DDPG解决倒立摆问题.md @@ -6,7 +6,7 @@ image-20210915161550713 -该环境的状态数有三个,设摆针竖直方向上的顺时针旋转角为$\theta$,$\theta$设在$[-\pi,\pi]$之间,则相应的状态为$[cos\theta,sin\theta,\dot{\theta}]$,即表示角度和角速度,我们的动作则是一个-2到2之间的力矩,它是一个连续量,因而该环境不能用离散动作的算法比如 DQN 来解决。关于奖励是根据相关的物理原理而计算出的等式,如下: +该环境的状态维度有三个,设摆针竖直方向上的顺时针旋转角为$\theta$,$\theta$设在$[-\pi,\pi]$之间,则相应的状态为$[cos\theta,sin\theta,\dot{\theta}]$,即表示角度和角速度,我们的动作则是一个-2到2之间的力矩,它是一个连续量,因而该环境不能用离散动作的算法比如 DQN 来解决。关于奖励是根据相关的物理原理而计算出的等式,如下: $$ -\left(\theta^{2}+0.1 * \hat{\theta}^{2}+0.001 * \text { action }^{2}\right) $$ diff --git a/codes/Docs/使用DQN解决推车杆问题.md b/codes/Docs/使用DQN解决推车杆问题.md index 5889165..393c52d 100644 --- a/codes/Docs/使用DQN解决推车杆问题.md +++ b/codes/Docs/使用DQN解决推车杆问题.md @@ -14,21 +14,21 @@ CartPole-v0是一个经典的入门环境,如下图,它通过向左(动作=0 import gym env = gym.make('CartPole-v0') # 建立环境 env.seed(1) # 随机种子 -n_states = env.observation_space.shape[0] # 状态数 -n_actions = env.action_space.n # 动作数 +state_dim = env.observation_space.shape[0] # 状态维度 +action_dim = env.action_space.n # 动作维度 state = env.reset() # 初始化环境 -print(f"状态数:{n_states},动作数:{n_actions}") +print(f"状态维度:{state_dim},动作维度:{action_dim}") print(f"初始状态:{state}") ``` 可以得到结果: ```bash -状态数:4,动作数:2 +状态维度:4,动作维度:2 初始状态:[ 0.03073904 0.00145001 -0.03088818 -0.03131252] ``` 
-该环境状态数是四个,分别为车的位置、车的速度、杆的角度以及杆顶部的速度,动作数为两个,并且是离散的向左或者向右。理论上达到最优化算法的情况下,推车杆是一直能保持平衡的,也就是每回合的步数是无限,但是这不方便训练,所以环境内部设置了每回合的最大步数为200,也就是说理想情况下,只需要我们每回合的奖励达到200就算训练完成。 +该环境状态维度是四个,分别为车的位置、车的速度、杆的角度以及杆顶部的速度,动作维度为两个,并且是离散的向左或者向右。理论上达到最优化算法的情况下,推车杆是一直能保持平衡的,也就是每回合的步数是无限,但是这不方便训练,所以环境内部设置了每回合的最大步数为200,也就是说理想情况下,只需要我们每回合的奖励达到200就算训练完成。 ## DQN基本接口 @@ -125,7 +125,7 @@ class ReplayBuffer: class MLP(nn.Module): def __init__(self, input_dim,output_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - input_dim: 输入的特征数即环境的状态数 + input_dim: 输入的特征数即环境的状态维度 output_dim: 输出的动作维度 """ super(MLP, self).__init__() diff --git a/codes/Docs/使用Q-learning解决悬崖寻路问题.md b/codes/Docs/使用Q-learning解决悬崖寻路问题.md index 244d85b..44e5b6c 100644 --- a/codes/Docs/使用Q-learning解决悬崖寻路问题.md +++ b/codes/Docs/使用Q-learning解决悬崖寻路问题.md @@ -27,21 +27,21 @@ env = gym.make('CliffWalking-v0') # 定义环境 env = CliffWalkingWapper(env) # 装饰环境 ``` -这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可,然后我们可以查看环境的状态和动作数目: +这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学具体看相关代码。由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可,然后我们可以查看环境的状态和动作维度: ```python -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -print(f"状态数:{n_states},动作数:{n_actions}") +state_dim = env.observation_space.n # 状态维度 +action_dim = env.action_space.n # 动作维度 +print(f"状态维度:{state_dim},动作维度:{action_dim}") ``` 打印出来的结果如下: ```bash -状态数:48,动作数:4 +状态维度:48,动作维度:4 ``` -我们的状态数是48个,这里我们设置的是智能体当前所在网格的编号,而动作数是4,这表示有0,1,2,3对应着上下左右四个动作。另外我们也可以初始化环境并打印当前所在的状态: +我们的状态维度是48个,这里我们设置的是智能体当前所在网格的编号,而动作维度是4,这表示有0,1,2,3对应着上下左右四个动作。另外我们也可以初始化环境并打印当前所在的状态: ```python state = env.reset() @@ -72,9 +72,9 @@ print(state) env = gym.make('CliffWalking-v0') # 定义环境 env = CliffWalkingWapper(env) # 装饰环境 env.seed(1) # 设置随机种子 -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数 +state_dim = env.observation_space.n # 状态维度 +action_dim = env.action_space.n # 动作维度 
+agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数 ep_reward = 0 # 记录每个回合的奖励 state = env.reset() # 重置环境 diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/double_dqn.py similarity index 76% rename from codes/DoubleDQN/agent.py rename to codes/DoubleDQN/double_dqn.py index 1ade5f8..e712edb 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/double_dqn.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-05-04 22:28:06 +LastEditTime: 2021-11-19 18:07:09 @Discription: @Environment: python 3.7.7 ''' @@ -16,14 +16,54 @@ LastEditTime: 2021-05-04 22:28:06 import torch import torch.nn as nn import torch.optim as optim +import torch.nn.functional as F import random import math import numpy as np -from common.memory import ReplayBuffer -from common.model import MLP + +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) + +class MLP(nn.Module): + def __init__(self, state_dim,action_dim,hidden_dim=128): + """ 初始化q网络,为全连接网络 + state_dim: 输入的特征数即环境的状态维度 + action_dim: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 + + def forward(self, 
x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + class DoubleDQN: def __init__(self, state_dim, action_dim, cfg): - self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma @@ -43,8 +83,15 @@ class DoubleDQN: self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) self.loss = 0 self.memory = ReplayBuffer(cfg.memory_capacity) - def predict(self,state): - with torch.no_grad(): + + def choose_action(self, state): + '''选择动作 + ''' + self.actions_count += 1 + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ + math.exp(-1. * self.actions_count / self.epsilon_decay) + if random.random() > self.epsilon: + with torch.no_grad(): # 先转为张量便于丢给神经网络,state元素数据原本为float64 # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 state = torch.tensor( @@ -55,15 +102,6 @@ class DoubleDQN: # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() - return action - def choose_action(self, state): - '''选择动作 - ''' - self.actions_count += 1 - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. 
* self.actions_count / self.epsilon_decay) - if random.random() > self.epsilon: - action = self.predict(state) else: action = random.randrange(self.action_dim) return action diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth deleted file mode 100644 index 8c4b561..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/models/checkpoint.pth and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy deleted file mode 100644 index 0f77696..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_ma_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy deleted file mode 100644 index 57f8759..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png deleted file mode 100644 index 038e031..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/eval_rewards_curve.png and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy deleted file mode 100644 index 63d10e7..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy 
b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy deleted file mode 100644 index d486ad9..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards.npy and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png deleted file mode 100644 index f91bc4d..0000000 Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20210504-150900/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth new file mode 100644 index 0000000..2ec6bfd Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy new file mode 100644 index 0000000..81e0bba Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy new file mode 100644 index 0000000..e7b6307 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png new file mode 100644 index 0000000..4fbd77c Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png differ diff --git 
a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy new file mode 100644 index 0000000..a73bbde Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy new file mode 100644 index 0000000..3e707c5 Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy differ diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png new file mode 100644 index 0000000..cb9dbeb Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png differ diff --git a/codes/DoubleDQN/task0.py b/codes/DoubleDQN/task0.py new file mode 100644 index 0000000..7657a88 --- /dev/null +++ b/codes/DoubleDQN/task0.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-11-07 18:10:37 +LastEditor: JiangJi +LastEditTime: 2021-12-29 15:02:30 +Discription: +''' + +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import gym +import torch +import datetime + +from common.utils import save_results, make_dir +from common.utils import plot_rewards +from DoubleDQN.double_dqn import DoubleDQN + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 + +class Config: + def __init__(self): + ################################## 环境超参数 ################################### + self.algo_name = 'DoubleDQN' # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 + 
self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 200 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + ################################################################################ + + ################################## 算法超参数 ################################### + self.gamma = 0.95 # 强化学习中的折扣因子 + self.epsilon_start = 0.95 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 + self.lr = 0.0001 # 学习率 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 + ################################################################################ + + ################################# 保存结果相关参数 ############################## + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + ################################################################################ + + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DoubleDQN(state_dim,action_dim,cfg) + return env,agent + +def train(cfg,env,agent): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 + while True: + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + agent.memory.push(state, action, reward, next_state, done) + state = next_state + agent.update() + if done: + break + if i_ep % cfg.target_update == 0: + 
agent.target_net.load_state_dict(agent.policy_net.state_dict()) + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward}') + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + env.close() + return rewards,ma_rewards + +def test(cfg,env,agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ############### + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + ################################################################################ + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + + for i_ep in range(cfg.test_eps): + state = env.reset() + ep_reward = 0 + while True: + action = agent.choose_action(state) + next_state, reward, done, _ = env.step(action) + state = next_state + ep_reward += reward + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') + env.close() + return rewards,ma_rewards + +if __name__ == "__main__": + cfg = Config() + # 训练 + env, agent = env_agent_config(cfg) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg) + agent.load(path=cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果 diff --git a/codes/DoubleDQN/task0_train.ipynb b/codes/DoubleDQN/task0_train.ipynb deleted 
file mode 100644 index ee2e5d4..0000000 --- a/codes/DoubleDQN/task0_train.ipynb +++ /dev/null @@ -1,194 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3710jvsc74a57bd0366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232", - "display_name": "Python 3.7.10 64-bit ('py37': conda)" - }, - "metadata": { - "interpreter": { - "hash": "366e1054dee9d4501b0eb8f87335afd3c67fc62db6ee611bbc7f8f5a1fefe232" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add current terminal path to sys.path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import torch\n", - "import datetime\n", - "from DoubleDQN.agent import DoubleDQN\n", - "from common.plot import plot_rewards\n", - "from common.utils import save_results, make_dir\n", - "\n", - "curr_time = datetime.datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\") # obtain current time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DoubleDQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"DoubleDQN\" # name of algo\n", - " self.env = 'CartPole-v0' # env name\n", - " self.result_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/results/' # path to save results\n", - " self.model_path = curr_path+\"/outputs/\" + self.env + \\\n", - " '/'+curr_time+'/models/' # path 
to save models\n", - " self.train_eps = 200 # max tranng episodes\n", - " self.eval_eps = 50 # max evaling episodes\n", - " self.gamma = 0.95\n", - " self.epsilon_start = 1 # start epsilon of e-greedy policy\n", - " self.epsilon_end = 0.01 \n", - " self.epsilon_decay = 500\n", - " self.lr = 0.001 # learning rate\n", - " self.memory_capacity = 100000 # capacity of Replay Memory\n", - " self.batch_size = 64\n", - " self.target_update = 2 # update frequency of target net\n", - " self.device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # check gpu\n", - " self.hidden_dim = 256 # hidden size of net" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def env_agent_config(cfg,seed=1):\n", - " env = gym.make(cfg.env) \n", - " env.seed(seed)\n", - " state_dim = env.observation_space.shape[0]\n", - " action_dim = env.action_space.n\n", - " agent = DoubleDQN(state_dim,action_dim,cfg)\n", - " return env,agent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def train(cfg,env,agent):\n", - " print('Start to train !')\n", - " rewards,ma_rewards = [],[]\n", - " for i_ep in range(cfg.train_eps):\n", - " state = env.reset() \n", - " ep_reward = 0\n", - " while True:\n", - " action = agent.choose_action(state) \n", - " next_state, reward, done, _ = env.step(action)\n", - " ep_reward += reward\n", - " agent.memory.push(state, action, reward, next_state, done) \n", - " state = next_state \n", - " agent.update() \n", - " if done:\n", - " break\n", - " if i_ep % cfg.target_update == 0:\n", - " agent.target_net.load_state_dict(agent.policy_net.state_dict())\n", - " if (i_ep+1)%10 == 0:\n", - " print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}')\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(\n", - " 0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward) \n", - " 
print('Complete training!')\n", - " return rewards,ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def eval(cfg,env,agent):\n", - " print('Start to eval !')\n", - " print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n", - " rewards = [] \n", - " ma_rewards = []\n", - " for i_ep in range(cfg.eval_eps):\n", - " state = env.reset() \n", - " ep_reward = 0 \n", - " while True:\n", - " action = agent.predict(state) \n", - " next_state, reward, done, _ = env.step(action) \n", - " state = next_state \n", - " ep_reward += reward\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print(f\"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}\")\n", - " print('Complete evaling!')\n", - " return rewards,ma_rewards " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if __name__ == \"__main__\":\n", - " cfg = DoubleDQNConfig()\n", - " # train\n", - " env,agent = env_agent_config(cfg,seed=1)\n", - " rewards, ma_rewards = train(cfg, env, agent)\n", - " make_dir(cfg.result_path, cfg.model_path)\n", - " agent.save(path=cfg.model_path)\n", - " save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)\n", - " plot_rewards(rewards, ma_rewards, tag=\"train\",\n", - " algo=cfg.algo, path=cfg.result_path)\n", - "\n", - " # eval\n", - " env,agent = env_agent_config(cfg,seed=10)\n", - " agent.load(path=cfg.model_path)\n", - " rewards,ma_rewards = eval(cfg,env,agent)\n", - " save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)\n", - " plot_rewards(rewards,ma_rewards,tag=\"eval\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)" - ] - } - ] -} \ No newline at end of file diff --git a/codes/DoubleDQN/task0_train.py b/codes/DoubleDQN/task0_train.py deleted 
file mode 100644 index 0148ea2..0000000 --- a/codes/DoubleDQN/task0_train.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-12 00:48:57 -@LastEditor: John -LastEditTime: 2021-09-10 15:26:05 -@Discription: -@Environment: python 3.7.7 -''' -import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import gym -import torch -import datetime -from DoubleDQN.agent import DoubleDQN -from common.plot import plot_rewards -from common.utils import save_results, make_dir - -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time - -class DoubleDQNConfig: - def __init__(self): - self.algo = "DoubleDQN" # name of algo - self.env = 'CartPole-v0' # env name - self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models - self.train_eps = 200 # max tranng episodes - self.eval_eps = 50 # max evaling episodes - self.gamma = 0.95 - self.epsilon_start = 1 # start epsilon of e-greedy policy - self.epsilon_end = 0.01 - self.epsilon_decay = 500 - self.lr = 0.001 # learning rate - self.memory_capacity = 100000 # capacity of Replay Memory - self.batch_size = 64 - self.target_update = 2 # update frequency of target net - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check gpu - self.hidden_dim = 256 # hidden size of net - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = DoubleDQN(state_dim,action_dim,cfg) - return env,agent - -def train(cfg,env,agent): - print('Start to train !') - rewards,ma_rewards = [],[] - for i_ep in range(cfg.train_eps): - state = env.reset() - 
ep_reward = 0 - while True: - action = agent.choose_action(state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - agent.memory.push(state, action, reward, next_state, done) - state = next_state - agent.update() - if done: - break - if i_ep % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward},Epsilon:{agent.epsilon:.2f}') - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards,ma_rewards - -def eval(cfg,env,agent): - print('Start to eval !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - rewards = [] - ma_rewards = [] - for i_ep in range(cfg.eval_eps): - state = env.reset() - ep_reward = 0 - while True: - action = agent.predict(state) - next_state, reward, done, _ = env.step(action) - state = next_state - ep_reward += reward - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}") - print('Complete evaling!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = DoubleDQNConfig() - # 训练 - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) - - # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) diff --git 
a/codes/DuelingDQN/task0_train.ipynb b/codes/DuelingDQN/task0_train.ipynb index c2cd1c3..7e38218 100644 --- a/codes/DuelingDQN/task0_train.ipynb +++ b/codes/DuelingDQN/task0_train.ipynb @@ -136,12 +136,12 @@ "outputs": [], "source": [ "class DuelingNet(nn.Module):\n", - " def __init__(self, n_states, n_actions,hidden_size=128):\n", + " def __init__(self, state_dim, action_dim,hidden_size=128):\n", " super(DuelingNet, self).__init__()\n", " \n", " # 隐藏层\n", " self.hidden = nn.Sequential(\n", - " nn.Linear(n_states, hidden_size),\n", + " nn.Linear(state_dim, hidden_size),\n", " nn.ReLU()\n", " )\n", " \n", @@ -149,7 +149,7 @@ " self.advantage = nn.Sequential(\n", " nn.Linear(hidden_size, hidden_size),\n", " nn.ReLU(),\n", - " nn.Linear(hidden_size, n_actions)\n", + " nn.Linear(hidden_size, action_dim)\n", " )\n", " \n", " # 价值函数\n", @@ -192,7 +192,7 @@ ], "source": [ "class DuelingDQN:\n", - " def __init__(self,n_states,n_actions,cfg) -> None:\n", + " def __init__(self,state_dim,action_dim,cfg) -> None:\n", " self.batch_size = cfg.batch_size\n", " self.device = cfg.device\n", " self.loss_history = [] # 记录loss的变化\n", @@ -200,8 +200,8 @@ " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " math.exp(-1. 
* frame_idx / cfg.epsilon_decay)\n", - " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", " target_param.data.copy_(param.data)\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", diff --git a/codes/HierarchicalDQN/agent.py b/codes/HierarchicalDQN/agent.py index 3760643..ce0cd1f 100644 --- a/codes/HierarchicalDQN/agent.py +++ b/codes/HierarchicalDQN/agent.py @@ -11,12 +11,51 @@ Environment: ''' import torch import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F import numpy as np import random,math -import torch.optim as optim -from common.model import MLP -from common.memory import ReplayBuffer +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) +class MLP(nn.Module): + def __init__(self, input_dim,output_dim,hidden_dim=128): + """ 初始化q网络,为全连接网络 + input_dim: 输入的特征数即环境的状态维度 + output_dim: 
输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层 + + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) + class HierarchicalDQN: def __init__(self,state_dim,action_dim,cfg): self.state_dim = state_dim @@ -24,7 +63,7 @@ class HierarchicalDQN: self.gamma = cfg.gamma self.device = cfg.device self.batch_size = cfg.batch_size - self.frame_idx = 0 + self.frame_idx = 0 # 用于epsilon的衰减计数 self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay) self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth new file mode 100644 index 0000000..02f3f7c Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/meta_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth new file mode 100644 index 0000000..9d906ea Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/models/policy_checkpoint.pth differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy new file mode 100644 index 0000000..14dd955 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_ma_rewards.npy differ diff --git 
a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy new file mode 100644 index 0000000..e815222 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png new file mode 100644 index 0000000..645b21a Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/test_rewards_curve.png differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy new file mode 100644 index 0000000..bf58391 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_ma_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy new file mode 100644 index 0000000..f4d20ff Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards.npy differ diff --git a/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png new file mode 100644 index 0000000..20ccbc5 Binary files /dev/null and b/codes/HierarchicalDQN/outputs/CartPole-v0/20211221-200119/results/train_rewards_curve.png differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy b/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy deleted file mode 100644 index daab87d..0000000 Binary files 
a/codes/HierarchicalDQN/results/20210331-134559/ma_rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png b/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png deleted file mode 100644 index 77555ad..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-134559/rewards_curve_train.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy b/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy deleted file mode 100644 index 5a1ad82..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png b/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png deleted file mode 100644 index 4f962ea..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/losses_curve.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy b/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy deleted file mode 100644 index 523bdb4..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/ma_rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png b/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png deleted file mode 100644 index 97443e5..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/rewards_curve_train.png and /dev/null differ diff --git a/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy b/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy deleted file mode 100644 index 99cf87a..0000000 Binary files a/codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth 
b/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth deleted file mode 100644 index 873b3ef..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-134559/meta_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth deleted file mode 100644 index be8ea8a..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-134559/policy_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth deleted file mode 100644 index e3f7c38..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-145852/meta_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth b/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth deleted file mode 100644 index 6be6ea3..0000000 Binary files a/codes/HierarchicalDQN/saved_model/20210331-145852/policy_checkpoint.pth and /dev/null differ diff --git a/codes/HierarchicalDQN/task0.py b/codes/HierarchicalDQN/task0.py new file mode 100644 index 0000000..3eceefd --- /dev/null +++ b/codes/HierarchicalDQN/task0.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-29 10:37:32 +LastEditor: John +LastEditTime: 2021-05-04 22:35:56 +Discription: +Environment: +''' +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import datetime +import numpy as np +import torch +import gym + +from common.utils import save_results,make_dir +from common.utils import plot_rewards +from HierarchicalDQN.agent import HierarchicalDQN +from HierarchicalDQN.train import train,test + 
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = "Hierarchical DQN" # 算法名称 +env_name = 'CartPole-v0' # 环境名称 +class HierarchicalDQNConfig: + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.train_eps = 300 # 训练的episode数目 + self.test_eps = 50 # 测试的episode数目 + self.gamma = 0.99 + self.epsilon_start = 1 # start epsilon of e-greedy policy + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.0001 # learning rate + self.memory_capacity = 10000 # Replay Memory capacity + self.batch_size = 32 + self.target_update = 2 # 目标网络的更新频率 + self.hidden_dim = 256 # 网络隐藏层 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + env = gym.make(cfg.env_name) + env.seed(seed) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = HierarchicalDQN(state_dim,action_dim,cfg) + return env,agent + +if __name__ == "__main__": + cfg = HierarchicalDQNConfig() + plot_cfg = PlotConfig() + # 训练 + env, agent = env_agent_config(cfg, seed=1) + rewards, ma_rewards = train(cfg, env, agent) + make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg, seed=10) + agent.load(path=plot_cfg.model_path) # 
导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + diff --git a/codes/HierarchicalDQN/task0_train.ipynb b/codes/HierarchicalDQN/task0_train.ipynb deleted file mode 100644 index c63e950..0000000 --- a/codes/HierarchicalDQN/task0_train.ipynb +++ /dev/null @@ -1,477 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.10 64-bit ('py37': conda)", - "metadata": { - "interpreter": { - "hash": "fbea1422c2cf61ed9c0cfc03f38f71cc9083cc288606edc4170b5309b352ce27" - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys,os\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add current terminal path to sys.path\n", - "\n", - "import gym\n", - "import torch\n", - "import numpy as np\n", - "import datetime\n", - "\n", - "from HierarchicalDQN.agent import HierarchicalDQN\n", - "from common.plot import plot_rewards\n", - "from common.utils import save_results" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "SEQUENCE = datetime.datetime.now().strftime(\n", - " \"%Y%m%d-%H%M%S\") # obtain current time\n", - "SAVED_MODEL_PATH = curr_path+\"/saved_model/\"+SEQUENCE+'/' # path to save model\n", - "if not os.path.exists(curr_path+\"/saved_model/\"):\n", - " os.mkdir(curr_path+\"/saved_model/\")\n", - "if not 
os.path.exists(SAVED_MODEL_PATH):\n", - " os.mkdir(SAVED_MODEL_PATH)\n", - "RESULT_PATH = curr_path+\"/results/\"+SEQUENCE+'/' # path to save rewards\n", - "if not os.path.exists(curr_path+\"/results/\"):\n", - " os.mkdir(curr_path+\"/results/\")\n", - "if not os.path.exists(RESULT_PATH):\n", - " os.mkdir(RESULT_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "class HierarchicalDQNConfig:\n", - " def __init__(self):\n", - " self.algo = \"H-DQN\" # name of algo\n", - " self.gamma = 0.95\n", - " self.epsilon_start = 1 # start epsilon of e-greedy policy\n", - " self.epsilon_end = 0.01\n", - " self.epsilon_decay = 500\n", - " self.lr = 0.0001 # learning rate\n", - " self.memory_capacity = 20000 # Replay Memory capacity\n", - " self.batch_size = 64\n", - " self.train_eps = 300 # 训练的episode数目\n", - " self.target_update = 2 # target net的更新频率\n", - " self.eval_eps = 20 # 测试的episode数目\n", - " self.device = torch.device(\n", - " \"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测gpu\n", - " self.hidden_dim = 256 # dimension of hidden layer" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def train(cfg, env, agent):\n", - " print('Start to train !')\n", - " rewards = []\n", - " ma_rewards = [] # moveing average reward\n", - " for i_episode in range(cfg.train_eps):\n", - " state = env.reset()\n", - " done = False\n", - " ep_reward = 0\n", - " while not done:\n", - " goal = agent.set_goal(state)\n", - " onehot_goal = agent.to_onehot(goal)\n", - " meta_state = state\n", - " extrinsic_reward = 0\n", - " while not done and goal != np.argmax(state):\n", - " goal_state = np.concatenate([state, onehot_goal])\n", - " action = agent.choose_action(goal_state)\n", - " next_state, reward, done, _ = env.step(action)\n", - " ep_reward += reward\n", - " extrinsic_reward += reward\n", - " intrinsic_reward = 1.0 if goal == np.argmax(\n", - " next_state) 
else 0.0\n", - " agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(\n", - " [next_state, onehot_goal]), done)\n", - " state = next_state\n", - " agent.update()\n", - " agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)\n", - " print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward))\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(\n", - " 0.9*ma_rewards[-1]+0.1*ep_reward)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " print('Complete training!')\n", - " return rewards, ma_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Start to train !\n", - "Episode:1/300, Reward:25.0\n", - "Episode:2/300, Reward:26.0\n", - "Episode:3/300, Reward:23.0\n", - "Episode:4/300, Reward:19.0\n", - "Episode:5/300, Reward:23.0\n", - "Episode:6/300, Reward:21.0\n", - "Episode:7/300, Reward:21.0\n", - "Episode:8/300, Reward:22.0\n", - "Episode:9/300, Reward:15.0\n", - "Episode:10/300, Reward:12.0\n", - "Episode:11/300, Reward:39.0\n", - "Episode:12/300, Reward:42.0\n", - "Episode:13/300, Reward:79.0\n", - "Episode:14/300, Reward:54.0\n", - "Episode:15/300, Reward:28.0\n", - "Episode:16/300, Reward:85.0\n", - "Episode:17/300, Reward:46.0\n", - "Episode:18/300, Reward:37.0\n", - "Episode:19/300, Reward:45.0\n", - "Episode:20/300, Reward:79.0\n", - "Episode:21/300, Reward:80.0\n", - "Episode:22/300, Reward:154.0\n", - "Episode:23/300, Reward:74.0\n", - "Episode:24/300, Reward:129.0\n", - "Episode:25/300, Reward:185.0\n", - "Episode:26/300, Reward:200.0\n", - "Episode:27/300, Reward:115.0\n", - "Episode:28/300, Reward:104.0\n", - "Episode:29/300, Reward:200.0\n", - "Episode:30/300, Reward:118.0\n", - "Episode:31/300, Reward:200.0\n", - "Episode:32/300, Reward:200.0\n", - "Episode:33/300, Reward:83.0\n", - "Episode:34/300, Reward:75.0\n", - "Episode:35/300, 
Reward:46.0\n", - "Episode:36/300, Reward:96.0\n", - "Episode:37/300, Reward:78.0\n", - "Episode:38/300, Reward:150.0\n", - "Episode:39/300, Reward:147.0\n", - "Episode:40/300, Reward:74.0\n", - "Episode:41/300, Reward:137.0\n", - "Episode:42/300, Reward:182.0\n", - "Episode:43/300, Reward:200.0\n", - "Episode:44/300, Reward:200.0\n", - "Episode:45/300, Reward:200.0\n", - "Episode:46/300, Reward:184.0\n", - "Episode:47/300, Reward:200.0\n", - "Episode:48/300, Reward:200.0\n", - "Episode:49/300, Reward:200.0\n", - "Episode:50/300, Reward:61.0\n", - "Episode:51/300, Reward:9.0\n", - "Episode:52/300, Reward:9.0\n", - "Episode:53/300, Reward:200.0\n", - "Episode:54/300, Reward:200.0\n", - "Episode:55/300, Reward:200.0\n", - "Episode:56/300, Reward:200.0\n", - "Episode:57/300, Reward:200.0\n", - "Episode:58/300, Reward:200.0\n", - "Episode:59/300, Reward:200.0\n", - "Episode:60/300, Reward:167.0\n", - "Episode:61/300, Reward:200.0\n", - "Episode:62/300, Reward:200.0\n", - "Episode:63/300, Reward:200.0\n", - "Episode:64/300, Reward:200.0\n", - "Episode:65/300, Reward:200.0\n", - "Episode:66/300, Reward:200.0\n", - "Episode:67/300, Reward:200.0\n", - "Episode:68/300, Reward:200.0\n", - "Episode:69/300, Reward:197.0\n", - "Episode:70/300, Reward:200.0\n", - "Episode:71/300, Reward:200.0\n", - "Episode:72/300, Reward:200.0\n", - "Episode:73/300, Reward:200.0\n", - "Episode:74/300, Reward:200.0\n", - "Episode:75/300, Reward:200.0\n", - "Episode:76/300, Reward:200.0\n", - "Episode:77/300, Reward:200.0\n", - "Episode:78/300, Reward:200.0\n", - "Episode:79/300, Reward:200.0\n", - "Episode:80/300, Reward:200.0\n", - "Episode:81/300, Reward:181.0\n", - "Episode:82/300, Reward:200.0\n", - "Episode:83/300, Reward:200.0\n", - "Episode:84/300, Reward:200.0\n", - "Episode:85/300, Reward:200.0\n", - "Episode:86/300, Reward:200.0\n", - "Episode:87/300, Reward:200.0\n", - "Episode:88/300, Reward:200.0\n", - "Episode:89/300, Reward:200.0\n", - "Episode:90/300, Reward:200.0\n", - 
"Episode:91/300, Reward:200.0\n", - "Episode:92/300, Reward:200.0\n", - "Episode:93/300, Reward:200.0\n", - "Episode:94/300, Reward:200.0\n", - "Episode:95/300, Reward:200.0\n", - "Episode:96/300, Reward:200.0\n", - "Episode:97/300, Reward:200.0\n", - "Episode:98/300, Reward:200.0\n", - "Episode:99/300, Reward:192.0\n", - "Episode:100/300, Reward:183.0\n", - "Episode:101/300, Reward:200.0\n", - "Episode:102/300, Reward:200.0\n", - "Episode:103/300, Reward:200.0\n", - "Episode:104/300, Reward:200.0\n", - "Episode:105/300, Reward:200.0\n", - "Episode:106/300, Reward:200.0\n", - "Episode:107/300, Reward:200.0\n", - "Episode:108/300, Reward:200.0\n", - "Episode:109/300, Reward:200.0\n", - "Episode:110/300, Reward:200.0\n", - "Episode:111/300, Reward:200.0\n", - "Episode:112/300, Reward:200.0\n", - "Episode:113/300, Reward:200.0\n", - "Episode:114/300, Reward:200.0\n", - "Episode:115/300, Reward:200.0\n", - "Episode:116/300, Reward:200.0\n", - "Episode:117/300, Reward:200.0\n", - "Episode:118/300, Reward:200.0\n", - "Episode:119/300, Reward:200.0\n", - "Episode:120/300, Reward:196.0\n", - "Episode:121/300, Reward:200.0\n", - "Episode:122/300, Reward:200.0\n", - "Episode:123/300, Reward:200.0\n", - "Episode:124/300, Reward:200.0\n", - "Episode:125/300, Reward:200.0\n", - "Episode:126/300, Reward:189.0\n", - "Episode:127/300, Reward:193.0\n", - "Episode:128/300, Reward:200.0\n", - "Episode:129/300, Reward:200.0\n", - "Episode:130/300, Reward:193.0\n", - "Episode:131/300, Reward:183.0\n", - "Episode:132/300, Reward:183.0\n", - "Episode:133/300, Reward:200.0\n", - "Episode:134/300, Reward:200.0\n", - "Episode:135/300, Reward:200.0\n", - "Episode:136/300, Reward:200.0\n", - "Episode:137/300, Reward:200.0\n", - "Episode:138/300, Reward:200.0\n", - "Episode:139/300, Reward:100.0\n", - "Episode:140/300, Reward:118.0\n", - "Episode:141/300, Reward:99.0\n", - "Episode:142/300, Reward:185.0\n", - "Episode:143/300, Reward:41.0\n", - "Episode:144/300, Reward:11.0\n", - 
"Episode:145/300, Reward:9.0\n", - "Episode:146/300, Reward:152.0\n", - "Episode:147/300, Reward:155.0\n", - "Episode:148/300, Reward:181.0\n", - "Episode:149/300, Reward:197.0\n", - "Episode:150/300, Reward:200.0\n", - "Episode:151/300, Reward:200.0\n", - "Episode:152/300, Reward:200.0\n", - "Episode:153/300, Reward:200.0\n", - "Episode:154/300, Reward:200.0\n", - "Episode:155/300, Reward:200.0\n", - "Episode:156/300, Reward:123.0\n", - "Episode:157/300, Reward:11.0\n", - "Episode:158/300, Reward:8.0\n", - "Episode:159/300, Reward:9.0\n", - "Episode:160/300, Reward:10.0\n", - "Episode:161/300, Reward:9.0\n", - "Episode:162/300, Reward:10.0\n", - "Episode:163/300, Reward:9.0\n", - "Episode:164/300, Reward:9.0\n", - "Episode:165/300, Reward:10.0\n", - "Episode:166/300, Reward:9.0\n", - "Episode:167/300, Reward:9.0\n", - "Episode:168/300, Reward:9.0\n", - "Episode:169/300, Reward:9.0\n", - "Episode:170/300, Reward:10.0\n", - "Episode:171/300, Reward:9.0\n", - "Episode:172/300, Reward:9.0\n", - "Episode:173/300, Reward:11.0\n", - "Episode:174/300, Reward:11.0\n", - "Episode:175/300, Reward:10.0\n", - "Episode:176/300, Reward:9.0\n", - "Episode:177/300, Reward:10.0\n", - "Episode:178/300, Reward:8.0\n", - "Episode:179/300, Reward:9.0\n", - "Episode:180/300, Reward:9.0\n", - "Episode:181/300, Reward:10.0\n", - "Episode:182/300, Reward:10.0\n", - "Episode:183/300, Reward:9.0\n", - "Episode:184/300, Reward:10.0\n", - "Episode:185/300, Reward:10.0\n", - "Episode:186/300, Reward:13.0\n", - "Episode:187/300, Reward:16.0\n", - "Episode:188/300, Reward:117.0\n", - "Episode:189/300, Reward:13.0\n", - "Episode:190/300, Reward:16.0\n", - "Episode:191/300, Reward:11.0\n", - "Episode:192/300, Reward:11.0\n", - "Episode:193/300, Reward:13.0\n", - "Episode:194/300, Reward:13.0\n", - "Episode:195/300, Reward:9.0\n", - "Episode:196/300, Reward:20.0\n", - "Episode:197/300, Reward:12.0\n", - "Episode:198/300, Reward:10.0\n", - "Episode:199/300, Reward:14.0\n", - "Episode:200/300, 
Reward:12.0\n", - "Episode:201/300, Reward:14.0\n", - "Episode:202/300, Reward:12.0\n", - "Episode:203/300, Reward:11.0\n", - "Episode:204/300, Reward:10.0\n", - "Episode:205/300, Reward:13.0\n", - "Episode:206/300, Reward:10.0\n", - "Episode:207/300, Reward:10.0\n", - "Episode:208/300, Reward:13.0\n", - "Episode:209/300, Reward:9.0\n", - "Episode:210/300, Reward:11.0\n", - "Episode:211/300, Reward:14.0\n", - "Episode:212/300, Reward:10.0\n", - "Episode:213/300, Reward:20.0\n", - "Episode:214/300, Reward:12.0\n", - "Episode:215/300, Reward:13.0\n", - "Episode:216/300, Reward:17.0\n", - "Episode:217/300, Reward:17.0\n", - "Episode:218/300, Reward:11.0\n", - "Episode:219/300, Reward:15.0\n", - "Episode:220/300, Reward:26.0\n", - "Episode:221/300, Reward:73.0\n", - "Episode:222/300, Reward:44.0\n", - "Episode:223/300, Reward:48.0\n", - "Episode:224/300, Reward:102.0\n", - "Episode:225/300, Reward:162.0\n", - "Episode:226/300, Reward:123.0\n", - "Episode:227/300, Reward:200.0\n", - "Episode:228/300, Reward:200.0\n", - "Episode:229/300, Reward:120.0\n", - "Episode:230/300, Reward:173.0\n", - "Episode:231/300, Reward:138.0\n", - "Episode:232/300, Reward:106.0\n", - "Episode:233/300, Reward:193.0\n", - "Episode:234/300, Reward:117.0\n", - "Episode:235/300, Reward:120.0\n", - "Episode:236/300, Reward:98.0\n", - "Episode:237/300, Reward:98.0\n", - "Episode:238/300, Reward:200.0\n", - "Episode:239/300, Reward:96.0\n", - "Episode:240/300, Reward:170.0\n", - "Episode:241/300, Reward:107.0\n", - "Episode:242/300, Reward:107.0\n", - "Episode:243/300, Reward:200.0\n", - "Episode:244/300, Reward:128.0\n", - "Episode:245/300, Reward:165.0\n", - "Episode:246/300, Reward:168.0\n", - "Episode:247/300, Reward:200.0\n", - "Episode:248/300, Reward:200.0\n", - "Episode:249/300, Reward:200.0\n", - "Episode:250/300, Reward:200.0\n", - "Episode:251/300, Reward:200.0\n", - "Episode:252/300, Reward:200.0\n", - "Episode:253/300, Reward:200.0\n", - "Episode:254/300, Reward:200.0\n", - 
"Episode:255/300, Reward:200.0\n", - "Episode:256/300, Reward:200.0\n", - "Episode:257/300, Reward:164.0\n", - "Episode:258/300, Reward:200.0\n", - "Episode:259/300, Reward:190.0\n", - "Episode:260/300, Reward:185.0\n", - "Episode:261/300, Reward:200.0\n", - "Episode:262/300, Reward:200.0\n", - "Episode:263/300, Reward:200.0\n", - "Episode:264/300, Reward:200.0\n", - "Episode:265/300, Reward:168.0\n", - "Episode:266/300, Reward:200.0\n", - "Episode:267/300, Reward:200.0\n", - "Episode:268/300, Reward:200.0\n", - "Episode:269/300, Reward:200.0\n", - "Episode:270/300, Reward:200.0\n", - "Episode:271/300, Reward:200.0\n", - "Episode:272/300, Reward:200.0\n", - "Episode:273/300, Reward:200.0\n", - "Episode:274/300, Reward:200.0\n", - "Episode:275/300, Reward:188.0\n", - "Episode:276/300, Reward:200.0\n", - "Episode:277/300, Reward:177.0\n", - "Episode:278/300, Reward:200.0\n", - "Episode:279/300, Reward:200.0\n", - "Episode:280/300, Reward:200.0\n", - "Episode:281/300, Reward:200.0\n", - "Episode:282/300, Reward:200.0\n", - "Episode:283/300, Reward:200.0\n", - "Episode:284/300, Reward:189.0\n", - "Episode:285/300, Reward:200.0\n", - "Episode:286/300, Reward:200.0\n", - "Episode:287/300, Reward:200.0\n", - "Episode:288/300, Reward:200.0\n", - "Episode:289/300, Reward:200.0\n", - "Episode:290/300, Reward:200.0\n", - "Episode:291/300, Reward:200.0\n", - "Episode:292/300, Reward:200.0\n", - "Episode:293/300, Reward:200.0\n", - "Episode:294/300, Reward:200.0\n", - "Episode:295/300, Reward:200.0\n", - "Episode:296/300, Reward:200.0\n", - "Episode:297/300, Reward:200.0\n", - "Episode:298/300, Reward:200.0\n", - "Episode:299/300, Reward:200.0\n", - "Episode:300/300, Reward:200.0\n", - "Complete training!\n", - "results saved!\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": "
", - "image/svg+xml": "\n\n\n\n \n \n \n \n 2021-03-31T14:01:15.395751\n image/svg+xml\n \n \n Matplotlib v3.3.4, https://matplotlib.org/\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n", - "image/png": "\n" - }, - "metadata": {} - } - ], - "source": [ - "env = gym.make('CartPole-v0')\n", - "env.seed(1)\n", - "cfg = HierarchicalDQNConfig()\n", - "state_dim = env.observation_space.shape[0]\n", - "action_dim = env.action_space.n\n", - "agent = HierarchicalDQN(state_dim, action_dim, cfg)\n", - "rewards, ma_rewards = train(cfg, env, agent)\n", - "agent.save(path=SAVED_MODEL_PATH)\n", - "save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)\n", - "plot_rewards(rewards, ma_rewards, tag=\"train\",\n", - " algo=cfg.algo, path=RESULT_PATH)" - ] - } - ] -} \ No newline at end of file diff --git a/codes/HierarchicalDQN/task0_train.py 
b/codes/HierarchicalDQN/task0_train.py deleted file mode 100644 index 2676094..0000000 --- a/codes/HierarchicalDQN/task0_train.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2021-03-29 10:37:32 -LastEditor: John -LastEditTime: 2021-05-04 22:35:56 -Discription: -Environment: -''' - - -import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path - -import datetime -import numpy as np -import torch -import gym - -from common.utils import save_results,make_dir -from common.plot import plot_rewards -from HierarchicalDQN.agent import HierarchicalDQN - -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time - -class HierarchicalDQNConfig: - def __init__(self): - self.algo = "H-DQN" # name of algo - self.env = 'CartPole-v0' - self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models - self.train_eps = 300 # 训练的episode数目 - self.eval_eps = 50 # 测试的episode数目 - self.gamma = 0.99 - self.epsilon_start = 1 # start epsilon of e-greedy policy - self.epsilon_end = 0.01 - self.epsilon_decay = 200 - self.lr = 0.0001 # learning rate - self.memory_capacity = 10000 # Replay Memory capacity - self.batch_size = 32 - self.target_update = 2 # target net的更新频率 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - self.hidden_dim = 256 # dimension of hidden layer - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env.seed(seed) - state_dim = env.observation_space.shape[0] - action_dim = env.action_space.n - agent = HierarchicalDQN(state_dim,action_dim,cfg) - return env,agent - -def train(cfg, env, agent): - print('Start to train !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, 
Device:{cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward - for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - goal = agent.set_goal(state) - onehot_goal = agent.to_onehot(goal) - meta_state = state - extrinsic_reward = 0 - while not done and goal != np.argmax(state): - goal_state = np.concatenate([state, onehot_goal]) - action = agent.choose_action(goal_state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - extrinsic_reward += reward - intrinsic_reward = 1.0 if goal == np.argmax( - next_state) else 0.0 - agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate( - [next_state, onehot_goal]), done) - state = next_state - agent.update() - agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) - print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy )) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards, ma_rewards - -def eval(cfg, env, agent): - print('Start to eval !') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward - for i_ep in range(cfg.train_eps): - state = env.reset() - done = False - ep_reward = 0 - while not done: - goal = agent.set_goal(state) - onehot_goal = agent.to_onehot(goal) - extrinsic_reward = 0 - while not done and goal != np.argmax(state): - goal_state = np.concatenate([state, onehot_goal]) - action = agent.choose_action(goal_state) - next_state, reward, done, _ = env.step(action) - ep_reward += reward - extrinsic_reward += reward - state = next_state - agent.update() - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward}, Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') - 
rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append( - 0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - print('Complete training!') - return rewards, ma_rewards - -if __name__ == "__main__": - cfg = HierarchicalDQNConfig() - - # train - env,agent = env_agent_config(cfg,seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) - agent.save(path=cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) - # eval - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) - diff --git a/codes/HierarchicalDQN/train.py b/codes/HierarchicalDQN/train.py new file mode 100644 index 0000000..3dc8aa3 --- /dev/null +++ b/codes/HierarchicalDQN/train.py @@ -0,0 +1,77 @@ +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import numpy as np + +def train(cfg, env, agent): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + done = False + ep_reward = 0 + while not done: + goal = agent.set_goal(state) + onehot_goal = agent.to_onehot(goal) + meta_state = state + extrinsic_reward = 0 + while not done and goal != np.argmax(state): + goal_state = np.concatenate([state, onehot_goal]) + action = agent.choose_action(goal_state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + extrinsic_reward += reward + intrinsic_reward = 1.0 if goal == np.argmax( + next_state) else 0.0 + 
agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate( + [next_state, onehot_goal]), done) + state = next_state + agent.update() + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward},Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') + agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done) + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards + +def test(cfg, env, agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.train_eps): + state = env.reset() + done = False + ep_reward = 0 + while not done: + goal = agent.set_goal(state) + onehot_goal = agent.to_onehot(goal) + extrinsic_reward = 0 + while not done and goal != np.argmax(state): + goal_state = np.concatenate([state, onehot_goal]) + action = agent.choose_action(goal_state) + next_state, reward, done, _ = env.step(action) + ep_reward += reward + extrinsic_reward += reward + state = next_state + agent.update() + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward},Loss:{agent.loss_numpy:.2f}, Meta_Loss:{agent.meta_loss_numpy:.2f}') + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) + else: + ma_rewards.append(ep_reward) + print('完成训练!') + return rewards, ma_rewards \ No newline at end of file diff --git a/codes/Logs.md b/codes/Logs.md new file mode 100644 index 0000000..7dc6497 --- /dev/null +++ b/codes/Logs.md @@ -0,0 +1,7 @@ +## 记录笔者更新的日志 + +**2021.12.28-1**:将```task.py```中的两个Config类合并为一个,并加以注释便于阅读,从DQN算法开始更新 + +**2021.12.22-3**:将```agent.py```更改为对应的算法名称,便于区分如```dqn```与```dqn_cnn```的情况 +**2021.12.22-2**:简化了代码结构,将原来的```train.py```和```task.py```等合并到```task.py```中 
+**2021.12.22-1**:简化了代码结构,将原来的```model.py```和```memory.py```等合并到```agent.py```中,```plot.py```的内容合并到```common.utils.py```中 \ No newline at end of file diff --git a/codes/NoisyDQN/noisy_dqn.py b/codes/NoisyDQN/noisy_dqn.py new file mode 100644 index 0000000..45cc5d2 --- /dev/null +++ b/codes/NoisyDQN/noisy_dqn.py @@ -0,0 +1,52 @@ +import torch +import torch.nn as nn + +class NoisyLinear(nn.Module): + def __init__(self, input_dim, output_dim, std_init=0.4): + super(NoisyLinear, self).__init__() + + self.input_dim = input_dim + self.output_dim = output_dim + self.std_init = std_init + + self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim)) + self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim)) + self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim)) + + self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim)) + self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim)) + self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim)) + + self.reset_parameters() + self.reset_noise() + + def forward(self, x): + if self.training: + weight = self.weight_mu + self.weight_sigma.mul( (self.weight_epsilon)) + bias = self.bias_mu + self.bias_sigma.mul(Variable(self.bias_epsilon)) + else: + weight = self.weight_mu + bias = self.bias_mu + + return F.linear(x, weight, bias) + + def reset_parameters(self): + mu_range = 1 / math.sqrt(self.weight_mu.size(1)) + + self.weight_mu.data.uniform_(-mu_range, mu_range) + self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1))) + + self.bias_mu.data.uniform_(-mu_range, mu_range) + self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) + + def reset_noise(self): + epsilon_in = self._scale_noise(self.input_dim) + epsilon_out = self._scale_noise(self.output_dim) + + self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) + self.bias_epsilon.copy_(self._scale_noise(self.output_dim)) + + def _scale_noise(self, size): + x = 
torch.randn(size) + x = x.sign().mul(x.abs().sqrt()) + return x \ No newline at end of file diff --git a/codes/PPO/task0.py b/codes/PPO/task0.py index cd55eda..8e0d92a 100644 --- a/codes/PPO/task0.py +++ b/codes/PPO/task0.py @@ -20,7 +20,7 @@ class PPOConfig: self.continuous = False # 环境是否为连续动作 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.train_eps = 200 # 训练的回合数 - self.eval_eps = 20 # 测试的回合数 + self.test_eps = 20 # 测试的回合数 self.batch_size = 5 self.gamma=0.99 self.n_epochs = 4 diff --git a/codes/PPO/task1.py b/codes/PPO/task1.py index 178efba..38d9152 100644 --- a/codes/PPO/task1.py +++ b/codes/PPO/task1.py @@ -20,7 +20,7 @@ class PPOConfig: self.continuous = True # 环境是否为连续动作 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.train_eps = 200 # 训练的回合数 - self.eval_eps = 20 # 测试的回合数 + self.test_eps = 20 # 测试的回合数 self.batch_size = 5 self.gamma=0.99 self.n_epochs = 4 diff --git a/codes/PPO/train.ipynb b/codes/PPO/train.ipynb index 9c74585..b2dc91a 100644 --- a/codes/PPO/train.ipynb +++ b/codes/PPO/train.ipynb @@ -68,7 +68,7 @@ " self.result_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/results/' # path to save results\n", " self.model_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/models/' # path to save models\n", " self.train_eps = 200 # max training episodes\n", - " self.eval_eps = 50\n", + " self.test_eps = 50\n", " self.batch_size = 5\n", " self.gamma=0.99\n", " self.n_epochs = 4\n", @@ -144,7 +144,7 @@ " print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n", " rewards= []\n", " ma_rewards = [] # moving average rewards\n", - " for i_ep in range(cfg.eval_eps):\n", + " for i_ep in range(cfg.test_eps):\n", " state = env.reset()\n", " done = False\n", " ep_reward = 0\n", diff --git a/codes/PPO/train.py b/codes/PPO/train.py index aff54bf..e642df0 100644 --- a/codes/PPO/train.py +++ b/codes/PPO/train.py @@ -32,7 +32,7 @@ def eval(cfg,env,agent): 
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.eval_eps): + for i_ep in range(cfg.test_eps): state = env.reset() done = False ep_reward = 0 @@ -47,7 +47,7 @@ def eval(cfg,env,agent): 0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.eval_eps, ep_reward)) + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward)) print('完成训练!') return rewards,ma_rewards @@ -74,7 +74,7 @@ if __name__ == '__main__': self.continuous = False # 环境是否为连续动作 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.train_eps = 200 # 训练的回合数 - self.eval_eps = 20 # 测试的回合数 + self.test_eps = 20 # 测试的回合数 self.batch_size = 5 self.gamma=0.99 self.n_epochs = 4 diff --git a/codes/PolicyGradient/model.py b/codes/PolicyGradient/model.py index 7f5b1a8..6d9bc64 100644 --- a/codes/PolicyGradient/model.py +++ b/codes/PolicyGradient/model.py @@ -5,21 +5,22 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-23 16:35:58 LastEditor: John -LastEditTime: 2021-03-23 16:36:20 +LastEditTime: 2021-12-21 23:21:26 Discription: Environment: ''' import torch.nn as nn import torch.nn.functional as F class MLP(nn.Module): + ''' 多层感知机 输入:state维度 输出:概率 ''' - def __init__(self,state_dim,hidden_dim = 36): + def __init__(self,input_dim,hidden_dim = 36): super(MLP, self).__init__() - # 24和36为hidden layer的层数,可根据state_dim, action_dim的情况来改变 - self.fc1 = nn.Linear(state_dim, hidden_dim) + # 24和36为hidden layer的层数,可根据input_dim, action_dim的情况来改变 + self.fc1 = nn.Linear(input_dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim,hidden_dim) self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left diff --git a/codes/PolicyGradient/task0_train.py b/codes/PolicyGradient/task0_train.py index a7fb0d2..b6866f0 100644 --- a/codes/PolicyGradient/task0_train.py +++ b/codes/PolicyGradient/task0_train.py @@ -34,7 +34,7 @@ class PGConfig: self.model_path = 
curr_path+"/outputs/" + self.env + \ '/'+curr_time+'/models/' # 保存模型的路径 self.train_eps = 300 # 训练的回合数 - self.eval_eps = 30 # 测试的回合数 + self.test_eps = 30 # 测试的回合数 self.batch_size = 8 self.lr = 0.01 # 学习率 self.gamma = 0.99 @@ -94,7 +94,7 @@ def eval(cfg,env,agent): print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') rewards = [] ma_rewards = [] - for i_ep in range(cfg.eval_eps): + for i_ep in range(cfg.test_eps): state = env.reset() ep_reward = 0 for _ in count(): diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py index 4587c86..b72de22 100644 --- a/codes/QLearning/agent.py +++ b/codes/QLearning/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-09-19 23:05:45 +LastEditTime: 2021-12-22 10:54:57 Discription: use defaultdict to define Q table Environment: ''' @@ -17,15 +17,15 @@ from collections import defaultdict class QLearning(object): def __init__(self,state_dim, action_dim,cfg): - self.action_dim = action_dim # dimension of acgtion - self.lr = cfg.lr # learning rate + self.action_dim = action_dim + self.lr = cfg.lr # 学习率 self.gamma = cfg.gamma self.epsilon = 0 self.sample_count = 0 self.epsilon_start = cfg.epsilon_start self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay - self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value) + self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表 def choose_action(self, state): self.sample_count += 1 self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl deleted file mode 100644 index 45dce51..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl 
and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy deleted file mode 100644 index 3a8bde0..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy deleted file mode 100644 index 36de6fc..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png deleted file mode 100644 index 3226b8a..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl deleted file mode 100644 index 5c46ec6..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy deleted file mode 100644 index 1d6b889..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy deleted file mode 100644 
index 6e6ccf0..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png deleted file mode 100644 index e1cd04e..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl deleted file mode 100644 index 6986805..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy deleted file mode 100644 index e6793df..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy deleted file mode 100644 index e6793df..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png deleted file mode 100644 index 9c98cc9..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png and /dev/null differ diff --git 
a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/models/Qleaning_model.pkl deleted file mode 100644 index 4d6ba95..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/models/Qleaning_model.pkl and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards_curve_cn.png deleted file mode 100644 index 91ca06c..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards_curve_cn.png and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_ma_rewards.npy deleted file mode 100644 index 7184c7b..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards.npy deleted file mode 100644 index f037a25..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards.npy and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards_curve_cn.png deleted file mode 100644 index 9c0943a..0000000 Binary files a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl new file mode 100644 index 0000000..dc89386 
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_ma_rewards.npy similarity index 100% rename from codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_ma_rewards.npy rename to codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_ma_rewards.npy diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards.npy similarity index 100% rename from codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards.npy rename to codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards.npy diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png new file mode 100644 index 0000000..d745634 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy new file mode 100644 index 0000000..23e7c95 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy new file mode 100644 index 0000000..0ceb153 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy differ diff --git 
a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png new file mode 100644 index 0000000..a15bd2a Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl new file mode 100644 index 0000000..c362dbd Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy new file mode 100644 index 0000000..9bee5e4 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy new file mode 100644 index 0000000..8aeb5dd Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png new file mode 100644 index 0000000..5f3ffb5 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy new file mode 100644 index 0000000..261a3d5 Binary files /dev/null and 
b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy new file mode 100644 index 0000000..b1a0f23 Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy differ diff --git a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png new file mode 100644 index 0000000..9a9d6ad Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png differ diff --git a/codes/QLearning/task0.ipynb b/codes/QLearning/task0.ipynb new file mode 100644 index 0000000..dc447ce --- /dev/null +++ b/codes/QLearning/task0.ipynb @@ -0,0 +1,386 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "curr_path = str(Path().absolute())\n", + "parent_path = str(Path().absolute().parent)\n", + "sys.path.append(parent_path) # 添加路径到系统路径\n", + "\n", + "import gym\n", + "import torch\n", + "import math\n", + "import datetime\n", + "import numpy as np\n", + "from collections import defaultdict\n", + "from envs.gridworld_env import CliffWalkingWapper\n", + "from QLearning.agent import QLearning\n", + "from common.utils import plot_rewards\n", + "from common.utils import save_results,make_dir\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## QLearning算法" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "class QLearning(object):\n", + " def __init__(self,state_dim,\n", + " action_dim,cfg):\n", + " self.action_dim = action_dim \n", + " self.lr = cfg.lr # 
学习率\n", + " self.gamma = cfg.gamma \n", + " self.epsilon = 0 \n", + " self.sample_count = 0 \n", + " self.epsilon_start = cfg.epsilon_start\n", + " self.epsilon_end = cfg.epsilon_end\n", + " self.epsilon_decay = cfg.epsilon_decay\n", + " self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表\n", + " def choose_action(self, state):\n", + " self.sample_count += 1\n", + " self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \\\n", + " math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的,这里选择指数递减\n", + " # e-greedy 策略\n", + " if np.random.uniform(0, 1) > self.epsilon:\n", + " action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作\n", + " else:\n", + " action = np.random.choice(self.action_dim) # 随机选择动作\n", + " return action\n", + " def predict(self,state):\n", + " action = np.argmax(self.Q_table[str(state)])\n", + " return action\n", + " def update(self, state, action, reward, next_state, done):\n", + " Q_predict = self.Q_table[str(state)][action] \n", + " if done: # 终止状态\n", + " Q_target = reward \n", + " else:\n", + " Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)]) \n", + " self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)\n", + " def save(self,path):\n", + " import dill\n", + " torch.save(\n", + " obj=self.Q_table,\n", + " f=path+\"Qleaning_model.pkl\",\n", + " pickle_module=dill\n", + " )\n", + " print(\"保存模型成功!\")\n", + " def load(self, path):\n", + " import dill\n", + " self.Q_table =torch.load(f=path+'Qleaning_model.pkl',pickle_module=dill)\n", + " print(\"加载模型成功!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "def train(cfg,env,agent):\n", + " print('开始训练!')\n", + " print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n", + " rewards = [] # 记录奖励\n", + " ma_rewards = [] # 
记录滑动平均奖励\n", + " for i_ep in range(cfg.train_eps):\n", + " ep_reward = 0 # 记录每个episode的reward\n", + " state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode)\n", + " while True:\n", + " action = agent.choose_action(state) # 根据算法选择一个动作\n", + " next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互\n", + " agent.update(state, action, reward, next_state, done) # Q-learning算法更新\n", + " state = next_state # 存储上一个观察值\n", + " ep_reward += reward\n", + " if done:\n", + " break\n", + " rewards.append(ep_reward)\n", + " if ma_rewards:\n", + " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", + " else:\n", + " ma_rewards.append(ep_reward)\n", + " if (i_ep+1)%20 == 0: \n", + " print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))\n", + " print('完成训练!')\n", + " return rewards,ma_rewards" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 测试" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "def test(cfg,env,agent):\n", + " # env = gym.make(\"FrozenLake-v0\", is_slippery=False) # 0 left, 1 down, 2 right, 3 up\n", + " # env = FrozenLakeWapper(env)\n", + " print('开始测试!')\n", + " print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n", + " # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0\n", + " cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon\n", + " cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n", + " rewards = [] # 记录所有回合的奖励\n", + " ma_rewards = [] # 记录所有回合的滑动平均奖励\n", + " rewards = [] # 记录所有episode的reward\n", + " ma_rewards = [] # 滑动平均的reward\n", + " for i_ep in range(cfg.test_eps):\n", + " ep_reward = 0 # 记录每个episode的reward\n", + " state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode)\n", + " while True:\n", + " action = agent.predict(state) # 根据算法选择一个动作\n", + " next_state, reward, done, _ = env.step(action) # 与环境进行一个交互\n", + " state = next_state # 存储上一个观察值\n", + " ep_reward += reward\n", + " if done:\n", + " break\n", + " rewards.append(ep_reward)\n", + " if ma_rewards:\n", 
+ " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", + " else:\n", + " ma_rewards.append(ep_reward)\n", + " print(f\"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}\")\n", + " print('完成测试!')\n", + " return rewards,ma_rewards" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 设置参数" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "curr_time = datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\") # 获取当前时间\n", + "algo_name = 'Q-learning' # 算法名称\n", + "env_name = 'CliffWalking-v0' # 环境名称\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\") # 检测GPU\n", + "class QlearningConfig:\n", + " '''训练相关参数'''\n", + " def __init__(self):\n", + " self.algo_name = algo_name # 算法名称\n", + " self.env_name = env_name # 环境名称\n", + " self.device = device # 检测GPU\n", + " self.train_eps = 400 # 训练的回合数\n", + " self.test_eps = 20 # 测试的回合数\n", + " self.gamma = 0.9 # reward的衰减率\n", + " self.epsilon_start = 0.95 # e-greedy策略中初始epsilon\n", + " self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n", + " self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率\n", + " self.lr = 0.1 # 学习率 \n", + "class PlotConfig:\n", + " ''' 绘图相关参数设置\n", + " '''\n", + "\n", + " def __init__(self) -> None:\n", + " self.algo_name = algo_name # 算法名称\n", + " self.env_name = env_name # 环境名称\n", + " self.device = device # 检测GPU\n", + " self.result_path = curr_path + \"/outputs/\" + self.env_name + \\\n", + " '/' + curr_time + '/results/' # 保存结果的路径\n", + " self.model_path = curr_path + \"/outputs/\" + self.env_name + \\\n", + " '/' + curr_time + '/models/' # 保存模型的路径\n", + " self.save = True # 是否保存图片" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 创建环境和智能体" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "def env_agent_config(cfg,seed=1):\n", + " '''创建环境和智能体\n", + " Args:\n", + " cfg ([type]): [description]\n", + " seed 
(int, optional): 随机种子. Defaults to 1.\n", + " Returns:\n", + " env [type]: 环境\n", + " agent : 智能体\n", + " ''' \n", + " env = gym.make(cfg.env_name) \n", + " env = CliffWalkingWapper(env)\n", + " env.seed(seed) # 设置随机种子\n", + " state_dim = env.observation_space.n # 状态维度\n", + " action_dim = env.action_space.n # 动作维度\n", + " agent = QLearning(state_dim,action_dim,cfg)\n", + " return env,agent" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 执行训练并输出结果" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "开始训练!\n", + "环境:CliffWalking-v0, 算法:Q-learning, 设备:cuda\n", + "回合:20/400, 奖励:-82\n", + "回合:40/400, 奖励:-51\n", + "回合:60/400, 奖励:-50\n", + "回合:80/400, 奖励:-53\n", + "回合:100/400, 奖励:-21\n", + "回合:120/400, 奖励:-35\n", + "回合:140/400, 奖励:-44\n", + "回合:160/400, 奖励:-28\n", + "回合:180/400, 奖励:-28\n", + "回合:200/400, 奖励:-17\n", + "回合:220/400, 奖励:-18\n", + "回合:240/400, 奖励:-22\n", + "回合:260/400, 奖励:-19\n", + "回合:280/400, 奖励:-15\n", + "回合:300/400, 奖励:-14\n", + "回合:320/400, 奖励:-13\n", + "回合:340/400, 奖励:-13\n", + "回合:360/400, 奖励:-13\n", + "回合:380/400, 奖励:-13\n", + "回合:400/400, 奖励:-13\n", + "完成训练!\n", + "保存模型成功!\n", + "结果保存完毕!\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "加载模型成功!\n", + "开始测试!\n", + "环境:CliffWalking-v0, 算法:Q-learning, 设备:cuda\n", + "回合:1/20,奖励:-13.0\n", + "回合:2/20,奖励:-13.0\n", + "回合:3/20,奖励:-13.0\n", + "回合:4/20,奖励:-13.0\n", + "回合:5/20,奖励:-13.0\n", + "回合:6/20,奖励:-13.0\n", + "回合:7/20,奖励:-13.0\n", + "回合:8/20,奖励:-13.0\n", + "回合:9/20,奖励:-13.0\n", + "回合:10/20,奖励:-13.0\n", + "回合:11/20,奖励:-13.0\n", + "回合:12/20,奖励:-13.0\n", + "回合:13/20,奖励:-13.0\n", + "回合:14/20,奖励:-13.0\n", + "回合:15/20,奖励:-13.0\n", + "回合:16/20,奖励:-13.0\n", + "回合:17/20,奖励:-13.0\n", + "回合:18/20,奖励:-13.0\n", + "回合:19/20,奖励:-13.0\n", + "回合:20/20,奖励:-13.0\n", + "完成测试!\n", + "结果保存完毕!\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cfg = QlearningConfig()\n", + "plot_cfg = PlotConfig()\n", + "# 训练\n", + "env, agent = env_agent_config(cfg, seed=1)\n", + "rewards, ma_rewards = train(cfg, env, agent)\n", + "make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹\n", + "agent.save(path=plot_cfg.model_path) # 保存模型\n", + "save_results(rewards, ma_rewards, tag='train',\n", + " path=plot_cfg.result_path) # 保存结果\n", + "plot_rewards(rewards, ma_rewards, plot_cfg, tag=\"train\") # 画出结果\n", + "# 测试\n", + "env, agent = env_agent_config(cfg, seed=10)\n", + "agent.load(path=plot_cfg.model_path) # 导入模型\n", + "rewards, ma_rewards = test(cfg, env, agent)\n", + "save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果\n", + "plot_rewards(rewards, ma_rewards, plot_cfg, tag=\"test\") # 画出结果" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "fbea1422c2cf61ed9c0cfc03f38f71cc9083cc288606edc4170b5309b352ce27" + }, + "kernelspec": { + "display_name": "Python 3.7.11 64-bit ('py37': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/codes/QLearning/task0.py b/codes/QLearning/task0.py new file mode 100644 index 0000000..59a1668 --- /dev/null +++ b/codes/QLearning/task0.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2020-09-11 23:03:00 +LastEditor: John +LastEditTime: 2021-12-22 11:13:23 +Discription: +Environment: +''' +import sys +import os +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加路径到系统路径 + +import 
gym +import torch +import datetime + +from envs.gridworld_env import CliffWalkingWapper +from QLearning.agent import QLearning +from QLearning.train import train,test +from common.utils import plot_rewards,plot_rewards_cn +from common.utils import save_results,make_dir + +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'Q-learning' # 算法名称 +env_name = 'CliffWalking-v0' # 环境名称 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU +class QlearningConfig: + '''训练相关参数''' + def __init__(self): + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = device # 检测GPU + self.train_eps = 400 # 训练的回合数 + self.test_eps = 30 # 测试的回合数 + self.gamma = 0.9 # reward的衰减率 + self.epsilon_start = 0.95 # e-greedy策略中初始epsilon + self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon + self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率 + self.lr = 0.1 # 学习率 +class PlotConfig: + ''' 绘图相关参数设置 + ''' + + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device = device # 检测GPU + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 + +def env_agent_config(cfg,seed=1): + '''创建环境和智能体 + Args: + cfg ([type]): [description] + seed (int, optional): 随机种子. Defaults to 1. 
+ Returns: + env [type]: 环境 + agent : 智能体 + ''' + env = gym.make(cfg.env_name) + env = CliffWalkingWapper(env) + env.seed(seed) # 设置随机种子 + state_dim = env.observation_space.n # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = QLearning(state_dim,action_dim,cfg) + return env,agent + +cfg = QlearningConfig() +plot_cfg = PlotConfig() +# 训练 +env, agent = env_agent_config(cfg, seed=1) +rewards, ma_rewards = train(cfg, env, agent) +make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 +agent.save(path=plot_cfg.model_path) # 保存模型 +save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 +# 测试 +env, agent = env_agent_config(cfg, seed=10) +agent.load(path=plot_cfg.model_path) # 导入模型 +rewards, ma_rewards = test(cfg, env, agent) +save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 +plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + + diff --git a/codes/QLearning/task0_train.ipynb b/codes/QLearning/task0_train.ipynb deleted file mode 100644 index 5715766..0000000 --- a/codes/QLearning/task0_train.ipynb +++ /dev/null @@ -1,216 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3.7.11 64-bit ('py37': conda)" - }, - "interpreter": { - "hash": "fbea1422c2cf61ed9c0cfc03f38f71cc9083cc288606edc4170b5309b352ce27" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "source": [ - "import sys\n", - "from pathlib import Path\n", - "curr_path = str(Path().absolute())\n", - "parent_path = str(Path().absolute().parent)\n", - "sys.path.append(parent_path) # add 
current terminal path to sys.path\n", - "\n", - "import gym\n", - "import datetime\n", - "\n", - "from envs.gridworld_env import CliffWalkingWapper\n", - "from QLearning.agent import QLearning\n", - "from common.plot import plot_rewards\n", - "from common.utils import save_results,make_dir\n", - "curr_time = datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\") # obtain current time" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 2, - "source": [ - "class QlearningConfig:\n", - " '''训练相关参数'''\n", - " def __init__(self):\n", - " self.algo = 'Qlearning'\n", - " self.env = 'CliffWalking-v0' # 0 up, 1 right, 2 down, 3 left\n", - " self.result_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/results/' # path to save results\n", - " self.model_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/models/' # path to save models\n", - " self.train_eps = 200 # 训练的episode数目\n", - " self.eval_eps = 30\n", - " self.gamma = 0.9 # reward的衰减率\n", - " self.epsilon_start = 0.95 # e-greedy策略中初始epsilon\n", - " self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n", - " self.epsilon_decay = 200 # e-greedy策略中epsilon的衰减率\n", - " self.lr = 0.1 # learning rate" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 3, - "source": [ - "def env_agent_config(cfg,seed=1):\n", - " env = gym.make(cfg.env) \n", - " env = CliffWalkingWapper(env)\n", - " env.seed(seed)\n", - " state_dim = env.observation_space.n\n", - " action_dim = env.action_space.n\n", - " agent = QLearning(state_dim,action_dim,cfg)\n", - " return env,agent" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "source": [ - "def train(cfg,env,agent):\n", - " rewards = [] \n", - " ma_rewards = [] # moving average reward\n", - " for i_ep in range(cfg.train_eps):\n", - " ep_reward = 0 # 记录每个episode的reward\n", - " state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode)\n", - " while True:\n", - " action = 
agent.choose_action(state) # 根据算法选择一个动作\n", - " next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互\n", - " agent.update(state, action, reward, next_state, done) # Q-learning算法更新\n", - " state = next_state # 存储上一个观察值\n", - " ep_reward += reward\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " if (i_ep+1)%10==0:\n", - " print(\"Episode:{}/{}: reward:{:.1f}\".format(i_ep+1, cfg.train_eps,ep_reward))\n", - " return rewards,ma_rewards" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "source": [ - "def eval(cfg,env,agent):\n", - " # env = gym.make(\"FrozenLake-v0\", is_slippery=False) # 0 left, 1 down, 2 right, 3 up\n", - " # env = FrozenLakeWapper(env)\n", - " rewards = [] # 记录所有episode的reward\n", - " ma_rewards = [] # 滑动平均的reward\n", - " for i_ep in range(cfg.eval_eps):\n", - " ep_reward = 0 # 记录每个episode的reward\n", - " state = env.reset() # 重置环境, 重新开一局(即开始新的一个episode)\n", - " while True:\n", - " action = agent.predict(state) # 根据算法选择一个动作\n", - " next_state, reward, done, _ = env.step(action) # 与环境进行一个交互\n", - " state = next_state # 存储上一个观察值\n", - " ep_reward += reward\n", - " if done:\n", - " break\n", - " rewards.append(ep_reward)\n", - " if ma_rewards:\n", - " ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)\n", - " else:\n", - " ma_rewards.append(ep_reward)\n", - " if (i_ep+1)%10==0:\n", - " print(f\"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}\")\n", - " return rewards,ma_rewards" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": 6, - "source": [ - "cfg = QlearningConfig()\n", - "env,agent = env_agent_config(cfg,seed=1)\n", - "rewards,ma_rewards = train(cfg,env,agent)\n", - "make_dir(cfg.result_path,cfg.model_path)\n", - "agent.save(path=cfg.model_path)\n", - 
"save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)\n", - "plot_rewards(rewards,ma_rewards,tag=\"train\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)\n", - "\n", - "env,agent = env_agent_config(cfg,seed=10)\n", - "agent.load(path=cfg.model_path)\n", - "rewards,ma_rewards = eval(cfg,env,agent)\n", - "save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)\n", - "plot_rewards(rewards,ma_rewards,tag=\"eval\",env=cfg.env,algo = cfg.algo,path=cfg.result_path)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Episode:10/200: reward:-287.0\n", - "Episode:20/200: reward:-142.0\n", - "Episode:30/200: reward:-67.0\n", - "Episode:40/200: reward:-61.0\n", - "Episode:50/200: reward:-74.0\n", - "Episode:60/200: reward:-41.0\n", - "Episode:70/200: reward:-55.0\n", - "Episode:80/200: reward:-66.0\n", - "Episode:90/200: reward:-31.0\n", - "Episode:100/200: reward:-31.0\n", - "Episode:110/200: reward:-58.0\n", - "Episode:120/200: reward:-25.0\n", - "Episode:130/200: reward:-18.0\n", - "Episode:140/200: reward:-27.0\n", - "Episode:150/200: reward:-28.0\n", - "Episode:160/200: reward:-25.0\n", - "Episode:170/200: reward:-35.0\n", - "Episode:180/200: reward:-13.0\n", - "Episode:190/200: reward:-22.0\n", - "Episode:200/200: reward:-26.0\n", - "保存模型成功!\n", - "结果保存完毕!\n" - ] - }, - { - "output_type": "display_data", - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {} - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "加载模型成功!\n" - ] - } - ], - "metadata": {} - } - ] -} \ No newline at end of file diff --git a/codes/QLearning/task0_train.py b/codes/QLearning/task0_train.py deleted file mode 100644 index 6e616ab..0000000 --- a/codes/QLearning/task0_train.py +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-09-11 23:03:00 -LastEditor: John -LastEditTime: 2021-09-23 12:22:58 -Discription: -Environment: -''' -import sys,os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前路径 -parent_path=os.path.dirname(curr_path) # 父路径,这里就是我们的项目路径 -sys.path.append(parent_path) # 由于需要引用项目路径下的其他模块比如envs,所以需要添加路径到sys.path - -import gym -import torch -import datetime - -from envs.gridworld_env import CliffWalkingWapper -from QLearning.agent import QLearning -from common.plot import plot_rewards,plot_rewards_cn -from common.utils import save_results,make_dir - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -class QlearningConfig: - '''训练相关参数''' - def __init__(self): - self.algo = 'Q-learning' # 算法名称 - self.env = 'CliffWalking-v0' # 环境名称 - self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # 保存模型的路径 - self.train_eps = 400 # 训练的回合数 - self.eval_eps = 30 # 测试的回合数 - self.gamma = 0.9 # reward的衰减率 - self.epsilon_start = 0.95 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率 - self.lr = 0.1 # 学习率 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - - -def env_agent_config(cfg,seed=1): - env = gym.make(cfg.env) - env = CliffWalkingWapper(env) - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.n # 状态维度 - n_actions = env.action_space.n # 动作维度 - agent = 
QLearning(n_states,n_actions,cfg) - return env,agent - -def train(cfg,env,agent): - print('开始训练!') - print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') - rewards = [] # 记录奖励 - ma_rewards = [] # 记录滑动平均奖励 - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录每个回合的奖励 - state = env.reset() # 重置环境,即开始新的回合 - while True: - action = agent.choose_action(state) # 根据算法选择一个动作 - next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互 - print(reward) - agent.update(state, action, reward, next_state, done) # Q学习算法更新 - state = next_state # 更新状态 - ep_reward += reward - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward)) - print('完成训练!') - return rewards,ma_rewards - -def eval(cfg,env,agent): - print('开始测试!') - print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') - for item in agent.Q_table.items(): - print(item) - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 滑动平均的奖励 - for i_ep in range(cfg.eval_eps): - ep_reward = 0 # 记录每个episode的reward - state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合) - while True: - action = agent.predict(state) # 根据算法选择一个动作 - next_state, reward, done, _ = env.step(action) # 与环境进行一个交互 - state = next_state # 更新状态 - ep_reward += reward - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"回合数:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}") - print('完成测试!') - return rewards,ma_rewards - -if __name__ == "__main__": - cfg = QlearningConfig() - - # 训练 - env,agent = env_agent_config(cfg,seed=0) - rewards,ma_rewards = train(cfg,env,agent) - make_dir(cfg.result_path,cfg.model_path) # 创建文件夹 - agent.save(path=cfg.model_path) # 保存模型 - for item in agent.Q_table.items(): - print(item) - save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) # 保存结果 - 
plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path) - - # # 测试 - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=cfg.model_path) # 加载模型 - rewards,ma_rewards = eval(cfg,env,agent) - - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) - - diff --git a/codes/QLearning/train.py b/codes/QLearning/train.py new file mode 100644 index 0000000..2c4aa09 --- /dev/null +++ b/codes/QLearning/train.py @@ -0,0 +1,50 @@ +def train(cfg,env,agent): + print('开始训练!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录奖励 + ma_rewards = [] # 记录滑动平均奖励 + for i_ep in range(cfg.train_eps): + ep_reward = 0 # 记录每个回合的奖励 + state = env.reset() # 重置环境,即开始新的回合 + while True: + action = agent.choose_action(state) # 根据算法选择一个动作 + next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互 + agent.update(state, action, reward, next_state, done) # Q学习算法更新 + state = next_state # 更新状态 + ep_reward += reward + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + ma_rewards.append(ep_reward) + print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward)) + print('完成训练!') + return rewards,ma_rewards + +def test(cfg,env,agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + for item in agent.Q_table.items(): + print(item) + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 滑动平均的奖励 + for i_ep in range(cfg.test_eps): + ep_reward = 0 # 记录每个episode的reward + state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合) + while True: + action = agent.predict(state) # 根据算法选择一个动作 + next_state, reward, done, _ = env.step(action) # 与环境进行一个交互 + state = next_state # 更新状态 + ep_reward += reward + if done: + break + rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + else: + 
ma_rewards.append(ep_reward) + print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}") + print('完成测试!') + return rewards,ma_rewards \ No newline at end of file diff --git a/codes/README.md b/codes/README.md index fdee344..3896fbb 100644 --- a/codes/README.md +++ b/codes/README.md @@ -13,9 +13,10 @@ 其中```model.py```,```memory.py```,```plot.py``` 由于不同算法都会用到,所以放入```common```文件夹中。 +**注意:新版本中将```model```,```memory```相关内容全部放到了```agent.py```里面,```plot```放到了```common.utils```中。** ## 运行环境 -python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 +python 3.7、pytorch 1.6.0-1.8.1、gym 0.21.0 ## 使用说明 @@ -35,7 +36,7 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 | [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | | | [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | | | [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | | -| [SAC](./SAC) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | | +| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | | | [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | | | [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | | | [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2]((./envs/mujoco_info.md)) | | @@ -45,4 +46,6 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0 [RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2) -[RL-Adventure](https://github.com/higgsfield/RL-Adventure) \ No newline at end of file +[RL-Adventure](https://github.com/higgsfield/RL-Adventure) + +[Google 
开源项目风格指南——中文版](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments) \ No newline at end of file diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy deleted file mode 100644 index 12479e2..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy_optimizer b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy_optimizer deleted file mode 100644 index 6dea232..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_policy_optimizer and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q deleted file mode 100644 index d2d5352..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q_optimizer b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q_optimizer deleted file mode 100644 index d4c3e48..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_soft_q_optimizer and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value deleted file mode 100644 index a180f73..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value_optimizer b/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value_optimizer deleted file mode 100644 index f2ab113..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/models/sac_value_optimizer and /dev/null differ diff --git 
a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_ma_rewards.npy b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_ma_rewards.npy deleted file mode 100644 index 4971d4f..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_ma_rewards.npy and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards.npy b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards.npy deleted file mode 100644 index 46bd706..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards.npy and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards_curve.png b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards_curve.png deleted file mode 100644 index 3d4dd84..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/eval_rewards_curve.png and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_ma_rewards.npy b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_ma_rewards.npy deleted file mode 100644 index bffae05..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards.npy b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards.npy deleted file mode 100644 index 37837a6..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards.npy and /dev/null differ diff --git a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards_curve.png b/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards_curve.png deleted file mode 100644 index 399b952..0000000 Binary files a/codes/SAC/outputs/Pendulum-v0/20210506-014740/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/Sarsa/task0_train.py b/codes/Sarsa/task0_train.py index 
d21db17..e477afa 100644 --- a/codes/Sarsa/task0_train.py +++ b/codes/Sarsa/task0_train.py @@ -31,7 +31,7 @@ class SarsaConfig: self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # path to save results self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # path to save models self.train_eps = 200 - self.eval_eps = 50 + self.test_eps = 50 self.epsilon = 0.15 # epsilon: The probability to select a random action . self.gamma = 0.9 # gamma: Gamma discount factor. self.lr = 0.2 # learning rate: step size parameter @@ -74,7 +74,7 @@ def train(cfg,env,agent): def eval(cfg,env,agent): rewards = [] ma_rewards = [] - for i_episode in range(cfg.eval_eps): + for i_episode in range(cfg.test_eps): # Print out which episode we're on, useful for debugging. # Generate an episode. # An episode is an array of (state, action, reward) tuples @@ -94,7 +94,7 @@ def eval(cfg,env,agent): ma_rewards.append(ep_reward) rewards.append(ep_reward) if (i_episode+1)%10==0: - print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.eval_eps,ep_reward)) + print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.test_eps,ep_reward)) print('Complete evaling!') return rewards,ma_rewards diff --git a/codes/SAC/env.py b/codes/SoftActorCritic/env_wrapper.py similarity index 95% rename from codes/SAC/env.py rename to codes/SoftActorCritic/env_wrapper.py index 14e37a7..dfe1c4d 100644 --- a/codes/SAC/env.py +++ b/codes/SoftActorCritic/env_wrapper.py @@ -5,12 +5,13 @@ Author: JiangJi Email: johnjim0816@gmail.com Date: 2021-04-29 12:52:11 LastEditor: JiangJi -LastEditTime: 2021-04-29 12:52:31 +LastEditTime: 2021-12-22 15:36:36 Discription: Environment: ''' import gym import numpy as np + class NormalizedActions(gym.ActionWrapper): def action(self, action): low = self.action_space.low diff --git a/codes/SAC/model.py b/codes/SoftActorCritic/model.py similarity index 81% rename from codes/SAC/model.py rename to codes/SoftActorCritic/model.py index 146db0d..85bbfcd 
100644 --- a/codes/SAC/model.py +++ b/codes/SoftActorCritic/model.py @@ -5,7 +5,7 @@ Author: JiangJi Email: johnjim0816@gmail.com Date: 2021-04-29 12:53:58 LastEditor: JiangJi -LastEditTime: 2021-04-29 12:57:29 +LastEditTime: 2021-11-19 18:04:19 Discription: Environment: ''' @@ -35,12 +35,12 @@ class ValueNet(nn.Module): class SoftQNet(nn.Module): - def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(SoftQNet, self).__init__() - self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -54,20 +54,20 @@ class SoftQNet(nn.Module): class PolicyNet(nn.Module): - def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_min=-20, log_std_max=2): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): super(PolicyNet, self).__init__() self.log_std_min = log_std_min self.log_std_max = log_std_max - self.linear1 = nn.Linear(num_inputs, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear1 = nn.Linear(state_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) - self.mean_linear = nn.Linear(hidden_size, num_actions) + self.mean_linear = nn.Linear(hidden_dim, action_dim) self.mean_linear.weight.data.uniform_(-init_w, init_w) self.mean_linear.bias.data.uniform_(-init_w, init_w) - self.log_std_linear = nn.Linear(hidden_size, num_actions) + self.log_std_linear = nn.Linear(hidden_dim, action_dim) self.log_std_linear.weight.data.uniform_(-init_w, init_w) self.log_std_linear.bias.data.uniform_(-init_w, init_w) diff --git 
a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy new file mode 100644 index 0000000..9ae4e7b Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy_optimizer b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy_optimizer new file mode 100644 index 0000000..49c0d2a Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_policy_optimizer differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q new file mode 100644 index 0000000..3ff692f Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q_optimizer b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q_optimizer new file mode 100644 index 0000000..73be931 Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_soft_q_optimizer differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value new file mode 100644 index 0000000..853ac6f Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value_optimizer b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value_optimizer new file mode 100644 index 0000000..79410e4 Binary files /dev/null and 
b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/models/sac_value_optimizer differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_ma_rewards.npy b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_ma_rewards.npy new file mode 100644 index 0000000..eca3369 Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_ma_rewards.npy differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards.npy b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards.npy new file mode 100644 index 0000000..09edb0e Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards.npy differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards_curve.png b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards_curve.png new file mode 100644 index 0000000..5cc6e1d Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/test_rewards_curve.png differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_ma_rewards.npy b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_ma_rewards.npy new file mode 100644 index 0000000..3e1feac Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_ma_rewards.npy differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards.npy b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards.npy new file mode 100644 index 0000000..1c77a83 Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards.npy differ diff --git a/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards_curve.png 
b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards_curve.png new file mode 100644 index 0000000..3e4c8aa Binary files /dev/null and b/codes/SoftActorCritic/outputs/Pendulum-v1/20211222-162722/results/train_rewards_curve.png differ diff --git a/codes/SAC/agent.py b/codes/SoftActorCritic/sac.py similarity index 51% rename from codes/SAC/agent.py rename to codes/SoftActorCritic/sac.py index 1568eb3..d565db5 100644 --- a/codes/SAC/agent.py +++ b/codes/SoftActorCritic/sac.py @@ -5,7 +5,7 @@ Author: JiangJi Email: johnjim0816@gmail.com Date: 2021-04-29 12:53:54 LastEditor: JiangJi -LastEditTime: 2021-04-29 13:56:39 +LastEditTime: 2021-12-22 15:41:19 Discription: Environment: ''' @@ -13,10 +13,126 @@ import copy import torch import torch.nn as nn import torch.optim as optim +import torch.nn.functional as F +from torch.distributions import Normal import numpy as np -from common.memory import ReplayBuffer -from SAC.model import ValueNet,PolicyNet,SoftQNet +import random +device=torch.device("cuda" if torch.cuda.is_available() else "cpu") +class ReplayBuffer: + def __init__(self, capacity): + self.capacity = capacity # 经验回放的容量 + self.buffer = [] # 缓冲区 + self.position = 0 + + def push(self, state, action, reward, next_state, done): + ''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition) + ''' + if len(self.buffer) < self.capacity: + self.buffer.append(None) + self.buffer[self.position] = (state, action, reward, next_state, done) + self.position = (self.position + 1) % self.capacity + + def sample(self, batch_size): + batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移 + state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等 + return state, action, reward, next_state, done + + def __len__(self): + ''' 返回当前存储的量 + ''' + return len(self.buffer) +class ValueNet(nn.Module): + def __init__(self, state_dim, hidden_dim, init_w=3e-3): + super(ValueNet, self).__init__() + + self.linear1 = nn.Linear(state_dim, hidden_dim) + self.linear2 = 
nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) + + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, state): + x = F.relu(self.linear1(state)) + x = F.relu(self.linear2(x)) + x = self.linear3(x) + return x + + +class SoftQNet(nn.Module): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): + super(SoftQNet, self).__init__() + + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + self.linear3 = nn.Linear(hidden_dim, 1) + + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, state, action): + x = torch.cat([state, action], 1) + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = self.linear3(x) + return x + + +class PolicyNet(nn.Module): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): + super(PolicyNet, self).__init__() + + self.log_std_min = log_std_min + self.log_std_max = log_std_max + + self.linear1 = nn.Linear(state_dim, hidden_dim) + self.linear2 = nn.Linear(hidden_dim, hidden_dim) + + self.mean_linear = nn.Linear(hidden_dim, action_dim) + self.mean_linear.weight.data.uniform_(-init_w, init_w) + self.mean_linear.bias.data.uniform_(-init_w, init_w) + + self.log_std_linear = nn.Linear(hidden_dim, action_dim) + self.log_std_linear.weight.data.uniform_(-init_w, init_w) + self.log_std_linear.bias.data.uniform_(-init_w, init_w) + + def forward(self, state): + x = F.relu(self.linear1(state)) + x = F.relu(self.linear2(x)) + + mean = self.mean_linear(x) + log_std = self.log_std_linear(x) + log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) + + return mean, log_std + + def evaluate(self, state, epsilon=1e-6): + mean, log_std = self.forward(state) + std = log_std.exp() + + normal = Normal(mean, std) + z = normal.sample() + action = 
torch.tanh(z) + + log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon) + log_prob = log_prob.sum(-1, keepdim=True) + + return action, log_prob, z, mean, log_std + + + def get_action(self, state): + state = torch.FloatTensor(state).unsqueeze(0).to(device) + mean, log_std = self.forward(state) + std = log_std.exp() + + normal = Normal(mean, std) + z = normal.sample() + action = torch.tanh(z) + + action = action.detach().cpu().numpy() + return action[0] + class SAC: def __init__(self,state_dim,action_dim,cfg) -> None: self.batch_size = cfg.batch_size @@ -81,7 +197,6 @@ class SAC: policy_loss.backward() self.policy_optimizer.step() - for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()): target_param.data.copy_( target_param.data * (1.0 - soft_tau) + param.data * soft_tau @@ -89,15 +204,12 @@ class SAC: def save(self, path): torch.save(self.value_net.state_dict(), path + "sac_value") torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer") - torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q") torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer") torch.save(self.policy_net.state_dict(), path + "sac_policy") torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer") - - def load(self, path): self.value_net.load_state_dict(torch.load(path + "sac_value")) self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer")) diff --git a/codes/SAC/task0_train.py b/codes/SoftActorCritic/task0.py similarity index 51% rename from codes/SAC/task0_train.py rename to codes/SoftActorCritic/task0.py index 625f1d7..e910749 100644 --- a/codes/SAC/task0_train.py +++ b/codes/SoftActorCritic/task0.py @@ -5,7 +5,7 @@ Author: JiangJi Email: johnjim0816@gmail.com Date: 2021-04-29 12:59:22 LastEditor: JiangJi -LastEditTime: 2021-05-06 16:58:01 +LastEditTime: 2021-12-22 16:27:13 Discription: Environment: ''' @@ -18,23 +18,24 @@ import gym import torch 
import datetime -from SAC.env import NormalizedActions -from SAC.agent import SAC +from SoftActorCritic.env_wrapper import NormalizedActions +from SoftActorCritic.sac import SAC from common.utils import save_results, make_dir -from common.plot import plot_rewards +from common.utils import plot_rewards curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +algo_name = 'SAC' # 算法名称 +env_name = 'Pendulum-v1' # 环境名称 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU class SACConfig: def __init__(self) -> None: - self.algo = 'SAC' - self.env_name = 'Pendulum-v1' - self.result_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/results/' # path to save results - self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models + self.algo_name = algo_name + self.env_name = env_name # 环境名称 + self.device= device self.train_eps = 300 - self.train_steps = 500 - self.eval_eps = 50 - self.eval_steps = 500 + self.test_eps = 20 + self.max_steps = 500 # 每回合的最大步数 self.gamma = 0.99 self.mean_lambda=1e-3 self.std_lambda=1e-3 @@ -46,15 +47,18 @@ class SACConfig: self.capacity = 1000000 self.hidden_dim = 256 self.batch_size = 128 - self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu") -class PlotConfig(SACConfig): - def __init__(self) -> None: - super().__init__() - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 + + +class PlotConfig: + def __init__(self) -> None: + self.algo_name = algo_name # 算法名称 + self.env_name = env_name # 环境名称 + self.device= device + self.result_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/results/' # 保存结果的路径 + self.model_path = curr_path + "/outputs/" + self.env_name + \ + '/' + curr_time + '/models/' # 保存模型的路径 + self.save = True # 是否保存图片 def 
env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env_name)) @@ -66,13 +70,13 @@ def env_agent_config(cfg,seed=1): def train(cfg,env,agent): print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): ep_reward = 0 # 记录一回合内的奖励 state = env.reset() # 重置环境,返回初始状态 - for i_step in range(cfg.train_steps): + for i_step in range(cfg.max_steps): action = agent.policy_net.get_action(state) next_state, reward, done, _ = env.step(action) agent.memory.push(state, action, reward, next_state, done) @@ -81,57 +85,57 @@ def train(cfg,env,agent): ep_reward += reward if done: break - if (i_ep+1)%10==0: - print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}") rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('Complete training!') + if (i_ep+1)%10 == 0: + print(f'回合:{i_ep+1}/{cfg.train_eps}, 奖励:{ep_reward:.3f}') + print('完成训练!') return rewards, ma_rewards -def eval(cfg,env,agent): - print('Start to eval !') - print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward - for i_ep in range(cfg.eval_eps): +def test(cfg,env,agent): + print('开始测试!') + print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + rewards = [] # 记录所有回合的奖励 + ma_rewards = [] # 记录所有回合的滑动平均奖励 + for i_ep in range(cfg.test_eps): state = env.reset() ep_reward = 0 - for i_step in range(cfg.eval_steps): + for i_step in range(cfg.max_steps): action = agent.policy_net.get_action(state) next_state, reward, done, _ = env.step(action) state = next_state ep_reward += reward if done: break - if (i_ep+1)%10==0: - print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}") rewards.append(ep_reward) if ma_rewards: 
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('Complete evaling!') + print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") + print('完成测试!') return rewards, ma_rewards if __name__ == "__main__": cfg=SACConfig() plot_cfg = PlotConfig() - # train - env,agent = env_agent_config(cfg,seed=1) + # 训练 + env, agent = env_agent_config(cfg, seed=1) rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) - agent.save(path=plot_cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") - # eval - env,agent = env_agent_config(cfg,seed=10) - agent.load(path=plot_cfg.model_path) - rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) - plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") + make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=plot_cfg.model_path) # 保存模型 + save_results(rewards, ma_rewards, tag='train', + path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + # 测试 + env, agent = env_agent_config(cfg, seed=10) + agent.load(path=plot_cfg.model_path) # 导入模型 + rewards, ma_rewards = test(cfg, env, agent) + save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/SAC/task0_train.ipynb b/codes/SoftActorCritic/task0_train.ipynb similarity index 99% rename from codes/SAC/task0_train.ipynb rename to codes/SoftActorCritic/task0_train.ipynb index 8148a4b..14be84e 100644 --- a/codes/SAC/task0_train.ipynb +++ b/codes/SoftActorCritic/task0_train.ipynb @@ -45,7 +45,7 @@ " self.model_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/models/' # path to save models\n", " self.train_eps = 300\n", " self.train_steps = 500\n", - " self.eval_eps = 50\n", + " 
self.test_eps = 50\n", " self.eval_steps = 500\n", " self.gamma = 0.99\n", " self.mean_lambda=1e-3\n", @@ -121,7 +121,7 @@ " print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')\n", " rewards = []\n", " ma_rewards = [] # moveing average reward\n", - " for i_ep in range(cfg.eval_eps):\n", + " for i_ep in range(cfg.test_eps):\n", " state = env.reset()\n", " ep_reward = 0\n", " for i_step in range(cfg.eval_steps):\n", diff --git a/codes/TD3/README.md b/codes/TD3/README.md new file mode 100644 index 0000000..8001e9c --- /dev/null +++ b/codes/TD3/README.md @@ -0,0 +1 @@ +这是对[Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)](https://arxiv.org/abs/1802.09477)的复现 \ No newline at end of file diff --git a/codes/TD3/agent.py b/codes/TD3/agent.py index 3d43700..91939a6 100644 --- a/codes/TD3/agent.py +++ b/codes/TD3/agent.py @@ -1,3 +1,13 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: JiangJi +Email: johnjim0816@gmail.com +Date: 2021-12-22 10:40:05 +LastEditor: JiangJi +LastEditTime: 2021-12-22 10:43:55 +Discription: +''' import copy import numpy as np import torch @@ -5,40 +15,41 @@ import torch.nn as nn import torch.nn.functional as F from TD3.memory import ReplayBuffer - - -# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3) -# Paper: https://arxiv.org/abs/1802.09477 - - class Actor(nn.Module): - def __init__(self, state_dim, action_dim, max_action): + + def __init__(self, input_dim, output_dim, max_action): + '''[summary] + + Args: + input_dim (int): 输入维度,这里等于state_dim + output_dim (int): 输出维度,这里等于action_dim + max_action (int): action的最大值 + ''' super(Actor, self).__init__() - self.l1 = nn.Linear(state_dim, 256) + self.l1 = nn.Linear(input_dim, 256) self.l2 = nn.Linear(256, 256) - self.l3 = nn.Linear(256, action_dim) - + self.l3 = nn.Linear(256, output_dim) self.max_action = max_action - - + def forward(self, state): + a = F.relu(self.l1(state)) a = F.relu(self.l2(a)) return self.max_action * 
torch.tanh(self.l3(a)) class Critic(nn.Module): - def __init__(self, state_dim, action_dim): + def __init__(self, input_dim, output_dim): super(Critic, self).__init__() # Q1 architecture - self.l1 = nn.Linear(state_dim + action_dim, 256) + self.l1 = nn.Linear(input_dim + output_dim, 256) self.l2 = nn.Linear(256, 256) self.l3 = nn.Linear(256, 1) # Q2 architecture - self.l4 = nn.Linear(state_dim + action_dim, 256) + self.l4 = nn.Linear(input_dim + output_dim, 256) self.l5 = nn.Linear(256, 256) self.l6 = nn.Linear(256, 1) @@ -68,8 +79,8 @@ class Critic(nn.Module): class TD3(object): def __init__( self, - state_dim, - action_dim, + input_dim, + output_dim, max_action, cfg, ): @@ -83,14 +94,14 @@ class TD3(object): self.device = cfg.device self.total_it = 0 - self.actor = Actor(state_dim, action_dim, max_action).to(self.device) + self.actor = Actor(input_dim, output_dim, max_action).to(self.device) self.actor_target = copy.deepcopy(self.actor) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4) - self.critic = Critic(state_dim, action_dim).to(self.device) + self.critic = Critic(input_dim, output_dim).to(self.device) self.critic_target = copy.deepcopy(self.critic) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4) - self.memory = ReplayBuffer(state_dim, action_dim) + self.memory = ReplayBuffer(input_dim, output_dim) def choose_action(self, state): state = torch.FloatTensor(state.reshape(1, -1)).to(self.device) diff --git a/codes/common/atari_wrappers.py b/codes/common/atari_wrappers.py new file mode 100644 index 0000000..48dab94 --- /dev/null +++ b/codes/common/atari_wrappers.py @@ -0,0 +1,284 @@ +import numpy as np +import os +os.environ.setdefault('PATH', '') +from collections import deque +import gym +from gym import spaces +import cv2 +cv2.ocl.setUseOpenCL(False) +from .wrappers import TimeLimit + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random 
number of no-ops on reset. + No-op is assumed to be action 0. + """ + gym.Wrapper.__init__(self, env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
+ """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + gym.Wrapper.__init__(self, env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + +class ClipRewardEnv(gym.RewardWrapper): + 
def __init__(self, env): + gym.RewardWrapper.__init__(self, env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None): + """ + Warp frames to 84x84 as done in the Nature paper and later work. + If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which + observation should be warped. + """ + super().__init__(env) + self._width = width + self._height = height + self._grayscale = grayscale + self._key = dict_space_key + if self._grayscale: + num_colors = 1 + else: + num_colors = 3 + + new_space = gym.spaces.Box( + low=0, + high=255, + shape=(self._height, self._width, num_colors), + dtype=np.uint8, + ) + if self._key is None: + original_space = self.observation_space + self.observation_space = new_space + else: + original_space = self.observation_space.spaces[self._key] + self.observation_space.spaces[self._key] = new_space + assert original_space.dtype == np.uint8 and len(original_space.shape) == 3 + + def observation(self, obs): + if self._key is None: + frame = obs + else: + frame = obs[self._key] + + if self._grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize( + frame, (self._width, self._height), interpolation=cv2.INTER_AREA + ) + if self._grayscale: + frame = np.expand_dims(frame, -1) + + if self._key is None: + obs = frame + else: + obs = obs.copy() + obs[self._key] = frame + return obs + + +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. 
+ See Also + -------- + baselines.common.atari_wrappers.LazyFrames + """ + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return LazyFrames(list(self.frames)) + +class ScaledFloatFrame(gym.ObservationWrapper): + def __init__(self, env): + gym.ObservationWrapper.__init__(self, env) + self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) + + def observation(self, observation): + # careful! This undoes the memory optimization, use + # with smaller replay buffers only. + return np.array(observation).astype(np.float32) / 255.0 + +class LazyFrames(object): + def __init__(self, frames): + """This object ensures that common frames between the observations are only stored once. + It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay + buffers. + This object should only be converted to numpy array before being passed to the model. 
+ You'd not believe how complex the previous solution was.""" + self._frames = frames + self._out = None + + def _force(self): + if self._out is None: + self._out = np.concatenate(self._frames, axis=-1) + self._frames = None + return self._out + + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out + + def __len__(self): + return len(self._force()) + + def __getitem__(self, i): + return self._force()[i] + + def count(self): + frames = self._force() + return frames.shape[frames.ndim - 1] + + def frame(self, i): + return self._force()[..., i] + +def make_atari(env_id, max_episode_steps=None): + env = gym.make(env_id) + assert 'NoFrameskip' in env.spec.id + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + if max_episode_steps is not None: + env = TimeLimit(env, max_episode_steps=max_episode_steps) + return env + +def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): + """Configure environment for DeepMind-style Atari. 
+ """ + if episode_life: + env = EpisodicLifeEnv(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env) + if scale: + env = ScaledFloatFrame(env) + if clip_rewards: + env = ClipRewardEnv(env) + if frame_stack: + env = FrameStack(env, 4) + return env \ No newline at end of file diff --git a/codes/common/model.py b/codes/common/model.py index be03368..4ab0b8b 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -15,15 +15,15 @@ import torch.nn.functional as F from torch.distributions import Categorical class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): + def __init__(self, input_dim,output_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态数 - n_actions: 输出的动作维度 + input_dim: 输入的特征数即环境的状态维度 + output_dim: 输出的动作维度 """ super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -32,10 +32,10 @@ class MLP(nn.Module): return self.fc3(x) class Critic(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) + self.linear1 = nn.Linear(n_obs + action_dim, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) self.linear3 = nn.Linear(hidden_size, 1) # 随机初始化为较小的值 @@ -51,11 +51,11 @@ class Critic(nn.Module): return x class Actor(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3): super(Actor, self).__init__() self.linear1 = nn.Linear(n_obs, hidden_size) self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 
n_actions) + self.linear3 = nn.Linear(hidden_size, action_dim) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -67,18 +67,18 @@ class Actor(nn.Module): return x class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256): + def __init__(self, state_dim, action_dim, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, 1) ) self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1), ) diff --git a/codes/common/multiprocessing_env.py b/codes/common/multiprocessing_env.py index 04b4e3c..28c8aba 100644 --- a/codes/common/multiprocessing_env.py +++ b/codes/common/multiprocessing_env.py @@ -1,5 +1,5 @@ -#This code is from openai baseline -#https://github.com/openai/baselines/tree/master/baselines/common/vec_env +# 该代码来自 openai baseline,用于多线程环境 +# https://github.com/openai/baselines/tree/master/baselines/common/vec_env import numpy as np from multiprocessing import Process, Pipe diff --git a/codes/common/plot.py b/codes/common/plot.py deleted file mode 100644 index bc9c1dd..0000000 --- a/codes/common/plot.py +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-10-07 20:57:11 -LastEditor: John -LastEditTime: 2021-09-23 12:23:01 -Discription: -Environment: -''' -import matplotlib.pyplot as plt -import seaborn as sns -from matplotlib.font_manager import FontProperties # 导入字体模块 - -def plot_rewards(rewards,ma_rewards,plot_cfg,tag='train'): - sns.set() - plt.figure() # 创建一个图形实例,方便同时多画几个图 - plt.title("learning curve on {} of {} for {}".format(plot_cfg.device, plot_cfg.algo, plot_cfg.env_name)) - plt.xlabel('epsiodes') - 
plt.plot(rewards,label='rewards') - plt.plot(ma_rewards,label='ma rewards') - plt.legend() - if plot_cfg.save: - plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag)) - plt.show() - -def plot_losses(losses,algo = "DQN",save=True,path='./'): - sns.set() - plt.figure() - plt.title("loss curve of {}".format(algo)) - plt.xlabel('epsiodes') - plt.plot(losses,label='rewards') - plt.legend() - if save: - plt.savefig(path+"losses_curve") - plt.show() - diff --git a/codes/common/utils.py b/codes/common/utils.py index a3ca7be..6027804 100644 --- a/codes/common/utils.py +++ b/codes/common/utils.py @@ -5,29 +5,90 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 16:02:24 LastEditor: John -LastEditTime: 2021-09-11 21:48:49 +LastEditTime: 2021-11-30 18:39:19 Discription: Environment: ''' import os import numpy as np from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns -def save_results(rewards,ma_rewards,tag='train',path='./results'): - '''save rewards and ma_rewards +from matplotlib.font_manager import FontProperties # 导入字体模块 + +def chinese_font(): + ''' 设置中文字体,注意需要根据自己电脑情况更改字体路径,否则还是默认的字体 + ''' + try: + font = FontProperties( + fname='/System/Library/Fonts/STHeiti Light.ttc', size=15) # fname系统字体路径,此处是mac的 + except: + font = None + return font + +def plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag='train'): + ''' 中文画图 + ''' + sns.set() + plt.figure() + plt.title(u"{}环境下{}算法的学习曲线".format(plot_cfg.env_name, + plot_cfg.algo_name), fontproperties=chinese_font()) + plt.xlabel(u'回合数', fontproperties=chinese_font()) + plt.plot(rewards) + plt.plot(ma_rewards) + plt.legend((u'奖励', u'滑动平均奖励',), loc="best", prop=chinese_font()) + if plot_cfg.save: + plt.savefig(plot_cfg.result_path+f"{tag}_rewards_curve_cn") + # plt.show() + + +def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'): + sns.set() + plt.figure() # 创建一个图形实例,方便同时多画几个图 + plt.title("learning curve on {} of {} for {}".format( + plot_cfg.device, plot_cfg.algo_name, 
plot_cfg.env_name)) + plt.xlabel('epsiodes') + plt.plot(rewards, label='rewards') + plt.plot(ma_rewards, label='ma rewards') + plt.legend() + if plot_cfg.save: + plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag)) + plt.show() + + +def plot_losses(losses, algo="DQN", save=True, path='./'): + sns.set() + plt.figure() + plt.title("loss curve of {}".format(algo)) + plt.xlabel('epsiodes') + plt.plot(losses, label='rewards') + plt.legend() + if save: + plt.savefig(path+"losses_curve") + plt.show() + + +def save_results(rewards, ma_rewards, tag='train', path='./results'): + ''' 保存奖励 ''' np.save(path+'{}_rewards.npy'.format(tag), rewards) np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards) print('结果保存完毕!') + def make_dir(*paths): + ''' 创建文件夹 + ''' for path in paths: Path(path).mkdir(parents=True, exist_ok=True) + + def del_empty_dir(*paths): - '''del_empty_dir delete empty folders unders "paths" + ''' 删除目录下所有空文件夹 ''' for path in paths: dirs = os.listdir(path) for dir in dirs: if not os.listdir(os.path.join(path, dir)): - os.removedirs(os.path.join(path, dir)) \ No newline at end of file + os.removedirs(os.path.join(path, dir)) diff --git a/codes/common/wrappers.py b/codes/common/wrappers.py new file mode 100644 index 0000000..4793b36 --- /dev/null +++ b/codes/common/wrappers.py @@ -0,0 +1,29 @@ +import gym + +class TimeLimit(gym.Wrapper): + def __init__(self, env, max_episode_steps=None): + super(TimeLimit, self).__init__(env) + self._max_episode_steps = max_episode_steps + self._elapsed_steps = 0 + + def step(self, ac): + observation, reward, done, info = self.env.step(ac) + self._elapsed_steps += 1 + if self._elapsed_steps >= self._max_episode_steps: + done = True + info['TimeLimit.truncated'] = True + return observation, reward, done, info + + def reset(self, **kwargs): + self._elapsed_steps = 0 + return self.env.reset(**kwargs) + +class ClipActionsWrapper(gym.Wrapper): + def step(self, action): + import numpy as np + action = np.nan_to_num(action) + 
action = np.clip(action, self.action_space.low, self.action_space.high) + return self.env.step(action) + + def reset(self, **kwargs): + return self.env.reset(**kwargs) \ No newline at end of file diff --git a/codes/envs/assets/gym_info_20211130180023.png b/codes/envs/assets/gym_info_20211130180023.png new file mode 100644 index 0000000..723b67f Binary files /dev/null and b/codes/envs/assets/gym_info_20211130180023.png differ diff --git a/codes/envs/gym_info.md b/codes/envs/gym_info.md index dd4268a..49da18f 100644 --- a/codes/envs/gym_info.md +++ b/codes/envs/gym_info.md @@ -1,4 +1,5 @@ -## 环境说明 +# OpenAi Gym 环境说明 +## 基础控制 ### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0) @@ -6,6 +7,17 @@ 通过向左或向右推车能够实现平衡,所以动作空间由两个动作组成。每进行一个step就会给一个reward,如果无法保持平衡那么done等于true,本次episode失败。理想状态下,每个episode至少能进行200个step,也就是说每个episode的reward总和至少为200,step数目至少为200 +### CartPole-v1 + +```CartPole v1```环境其实跟```CartPole v0```是一模一样的,区别在于每回合最大步数(max_episode_steps)以及奖励阈值(reward_threshold),如下是相关源码: + +![](assets/gym_info_20211130180023.png) + +这里先解释一下奖励阈值(reward_threshold),即Gym设置的一个合格标准,比如对于```CartPole v0```如果算法能够将奖励收敛到195以上,说明该算法合格。但实际上```CartPole v0```的每回合最大步数(max_episode_steps)是200,每步的奖励最大是1,也就是每回合最大奖励是200,比Gym设置的奖励阈值高。笔者猜测这是Gym可能是给算法学习者们设置的一个参考线,而实际中在写算法时并不会用到这个算法阈值,所以可以忽略。 + +再看每回合最大步数,可以看到```CartPole v1```的步数更长,相应的奖励要求更高,可以理解为```v1```是```v0```的难度升级版。 + + ### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) 注:gym 0.18.0之后版本中Pendulum-v0已经改为Pendulum-v1 @@ -31,4 +43,8 @@ image-20201007211858925 -由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。 \ No newline at end of file +由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。 + +## 参考 + +[Gym环境相关源码](https://github.com/openai/gym/tree/master/gym/envs) \ No newline at end of file diff --git a/docs/chapter4/chapter4_questions&keywords.md b/docs/chapter4/chapter4_questions&keywords.md index eb04dd4..e1d5786 100644 --- a/docs/chapter4/chapter4_questions&keywords.md +++ 
b/docs/chapter4/chapter4_questions&keywords.md @@ -7,7 +7,7 @@ - **Trajectory:** 一个试验中我们将environment 输出的 $s$ 跟 actor 输出的行为 $a$,把这个 $s$ 跟 $a$ 全部串起来形成的集合,我们称为Trajectory,即 $\text { Trajectory } \tau=\left\{s_{1}, a_{1}, s_{2}, a_{2}, \cdots, s_{t}, a_{t}\right\}$。 - **Reward function:** 根据在某一个 state 采取的某一个 action 决定说现在这个行为可以得到多少的分数,它是一个 function。也就是给一个 $s_1$,$a_1$,它告诉你得到 $r_1$。给它 $s_2$ ,$a_2$,它告诉你得到 $r_2$。 把所有的 $r$ 都加起来,我们就得到了 $R(\tau)$ ,代表某一个 trajectory $\tau$ 的 reward。 - **Expected reward:** $\bar{R}_{\theta}=\sum_{\tau} R(\tau) p_{\theta}(\tau)=E_{\tau \sim p_{\theta}(\tau)}[R(\tau)]$。 -- **Reinforce:** 基于策略梯度的强化学习的经典算法,其采用回合更新的模式。 +- **REINFORCE:** 基于策略梯度的强化学习的经典算法,其采用回合更新的模式。 ## 2 Questions