diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index bd26785..997401b 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -40,10 +40,10 @@ class ActorCritic(nn.Module): class A2C: ''' A2C算法 ''' - def __init__(self,n_states,n_actions,cfg) -> None: + def __init__(self,state_dim,action_dim,cfg) -> None: self.gamma = cfg.gamma self.device = cfg.device - self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device) + self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device) self.optimizer = optim.Adam(self.model.parameters()) def compute_returns(self,next_value, rewards, masks): diff --git a/codes/A2C/task0.py b/codes/A2C/task0.py index e0296ed..fd54d87 100644 --- a/codes/A2C/task0.py +++ b/codes/A2C/task0.py @@ -74,9 +74,9 @@ def train(cfg,envs): print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') env = gym.make(cfg.env_name) # a single env env.seed(10) - n_states = envs.observation_space.shape[0] - n_actions = envs.action_space.n - model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + state_dim = envs.observation_space.shape[0] + action_dim = envs.action_space.n + model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) optimizer = optim.Adam(model.parameters()) frame_idx = 0 test_rewards = [] diff --git a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 01ded1c..6ec2eef 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -39,11 +39,11 @@ class ReplayBuffer: ''' return len(self.buffer) class Actor(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(Actor, self).__init__() - self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear1 = nn.Linear(state_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) - self.linear3 = nn.Linear(hidden_dim, n_actions) + self.linear3 = nn.Linear(hidden_dim, action_dim) self.linear3.weight.data.uniform_(-init_w, init_w) self.linear3.bias.data.uniform_(-init_w, init_w) @@ -54,10 +54,10 @@ class Actor(nn.Module): x = torch.tanh(self.linear3(x)) return x class Critic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(Critic, self).__init__() - self.linear1 = nn.Linear(n_states + n_actions, hidden_dim) + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) # 随机初始化为较小的值 @@ -72,12 +72,12 @@ class Critic(nn.Module): x = self.linear3(x) return x class DDPG: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): self.device = cfg.device - self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) - self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) + self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) # 复制参数到目标网络 for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): diff --git a/codes/DDPG/env.py b/codes/DDPG/env.py 
index 89445cf..92fe482 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -39,15 +39,15 @@ class OUNoise(object): self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period - self.n_actions = action_space.shape[0] + self.action_dim = action_space.shape[0] self.low = action_space.low self.high = action_space.high self.reset() def reset(self): - self.obs = np.ones(self.n_actions) * self.mu + self.obs = np.ones(self.action_dim) * self.mu def evolve_obs(self): x = self.obs - dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) + dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) self.obs = x + dx return self.obs def get_action(self, action, t=0): diff --git a/codes/DDPG/task0.py b/codes/DDPG/task0.py index 550da78..81fa9a6 100644 --- a/codes/DDPG/task0.py +++ b/codes/DDPG/task0.py @@ -58,9 +58,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 env.seed(seed) # 随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = DDPG(state_dim,action_dim,cfg) return env,agent cfg = DDPGConfig() diff --git a/codes/DQN/README.md b/codes/DQN/README.md index 33e7397..fc82fe6 100644 --- a/codes/DQN/README.md +++ b/codes/DQN/README.md @@ -50,15 +50,15 @@ import torch.nn as nn import torch.nn.functional as F class FCN(nn.Module): - def __init__(self, n_states=4, n_actions=18): + def __init__(self, state_dim=4, action_dim=18): """ 初始化q网络,为全连接网络 - n_states: 输入的feature即环境的state数目 - n_actions: 输出的action总个数 + state_dim: 输入的feature即环境的state数目 + action_dim: 输出的action总个数 """ super(FCN, self).__init__() - self.fc1 = nn.Linear(n_states, 128) # 输入层 + self.fc1 = nn.Linear(state_dim, 128) # 输入层 self.fc2 = nn.Linear(128, 128) # 隐藏层 - self.fc3 = nn.Linear(128, n_actions) # 输出层 + self.fc3 = nn.Linear(128, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -66,7 +66,7 @@ class FCN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) ``` -输入为n_states,输出为n_actions,包含一个128维度的隐藏层,这里根据需要可增加隐藏层维度和数量,然后一般使用relu激活函数,这里跟深度学习的网路设置是一样的。 +输入为state_dim,输出为action_dim,包含一个128维度的隐藏层,这里根据需要可增加隐藏层维度和数量,然后一般使用relu激活函数,这里跟深度学习的网路设置是一样的。 ### Replay Buffer @@ -107,8 +107,8 @@ class ReplayBuffer: 在类中建立两个网络,以及optimizer和memory, ```python -self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) -self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) +self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) +self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) @@ -124,7 +124,7 @@ def choose_action(self, state): if random.random() > self.epsilon(self.frame_idx): action = self.predict(state) else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action ``` diff --git a/codes/DQN/dqn.py b/codes/DQN/dqn.py index e36f1d7..4a4dfc4 100644 --- a/codes/DQN/dqn.py +++ b/codes/DQN/dqn.py @@ -21,15 +21,15 @@ import math import numpy as np class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): + def __init__(self, 
state_dim,action_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态数 - n_actions: 输出的动作维度 + state_dim: 输入的特征数即环境的状态维度 + action_dim: 输出的动作维度 """ super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -62,9 +62,9 @@ class ReplayBuffer: return len(self.buffer) class DQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -73,8 +73,8 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. * frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 @@ -90,7 +90,7 @@ class DQN: q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 diff --git a/codes/DQN/dqn_cnn.py b/codes/DQN/dqn_cnn.py index 0f4302c..c14118f 100644 --- a/codes/DQN/dqn_cnn.py +++ b/codes/DQN/dqn_cnn.py @@ -70,9 +70,9 @@ class ReplayBuffer: return len(self.buffer) class DQN: - def __init__(self, n_states, n_actions, cfg): + def __init__(self, state_dim, action_dim, cfg): - self.n_actions = n_actions # 总的动作个数 + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # 奖励的折扣因子 # e-greedy策略相关参数 @@ -81,8 +81,8 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. 
* frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = CNN(n_states, n_actions).to(self.device) - self.target_net = CNN(n_states, n_actions).to(self.device) + self.policy_net = CNN(state_dim, action_dim).to(self.device) + self.target_net = CNN(state_dim, action_dim).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 @@ -98,7 +98,7 @@ class DQN: q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py index 937f412..871edf3 100644 --- a/codes/DQN/task0.py +++ b/codes/DQN/task0.py @@ -7,23 +7,29 @@ sys.path.append(parent_path) # 添加路径到系统路径 import gym import torch import datetime +import numpy as np from common.utils import save_results, make_dir from common.utils import plot_rewards from DQN.dqn import DQN curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -algo_name = 'DQN' # 算法名称 -env_name = 'CartPole-v0' # 环境名称 -class DQNConfig: + +class Config: + '''超参数 + ''' + def __init__(self): - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 + ################################## 环境超参数 ################################### + self.algo_name = 'DQN' # 算法名称 + self.env_name = 'CartPole-v0' # 环境名称 self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.train_eps = 200 # 训练的回合数 self.test_eps = 30 # 测试的回合数 - # 超参数 + ################################################################################ + + ################################## 算法超参数 ################################### self.gamma = 0.95 # 强化学习中的折扣因子 self.epsilon_start = 0.90 # e-greedy策略中初始epsilon self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon @@ -33,99 +39,106 @@ class DQNConfig: self.batch_size = 64 # mini-batch SGD中的批量大小 self.target_update = 4 # 目标网络的更新频率 self.hidden_dim = 256 # 网络隐藏层 -class PlotConfig: - def __init__(self) -> None: - self.algo = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU + ################################################################################ + + ################################# 保存结果相关参数 ################################ self.result_path = curr_path + "/outputs/" + self.env_name + \ '/' + curr_time + '/results/' # 保存结果的路径 self.model_path = curr_path + "/outputs/" + self.env_name + \ '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 + self.save = True # 是否保存图片 + ################################################################################ + def env_agent_config(cfg, seed=1): ''' 创建环境和智能体 ''' env = gym.make(cfg.env_name) # 创建环境 - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态数 - n_actions = env.action_space.n # 动作数 - agent = DQN(n_states, n_actions, cfg) # 创建智能体 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 + if seed !=0: # 设置随机种子 + torch.manual_seed(seed) + env.seed(seed) + np.random.seed(seed) return env, agent + def train(cfg, env, agent): ''' 训练 ''' 
print('开始训练!') print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 + rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - agent.memory.push(state, action, reward, next_state, done) # 保存transition - state = next_state # 更新下一个状态 - agent.update() # 更新智能体 - ep_reward += reward # 累加奖励 + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + agent.memory.push(state, action, reward, + next_state, done) # 保存transition + state = next_state # 更新下一个状态 + agent.update() # 更新智能体 + ep_reward += reward # 累加奖励 if done: break - if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 + if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新 agent.target_net.load_state_dict(agent.policy_net.state_dict()) rewards.append(ep_reward) if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) + ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward) else: ma_rewards.append(ep_reward) - if (i_ep+1)%10 == 0: - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + if (i_ep + 1) % 10 == 0: + print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward)) print('完成训练!') return rewards, ma_rewards -def test(cfg,env,agent): + +def test(cfg, env, agent): print('开始测试!') print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 - cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon - cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon - rewards = [] # 记录所有回合的奖励 + cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon + cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon + rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.test_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 + ep_reward = 0 # 记录一回合内的奖励 + state = env.reset() # 重置环境,返回初始状态 while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - state = next_state # 更新下一个状态 - ep_reward += reward # 累加奖励 + action = agent.choose_action(state) # 选择动作 + next_state, reward, done, _ = env.step(action) # 更新环境,返回transition + state = next_state # 更新下一个状态 + ep_reward += reward # 累加奖励 if done: break rewards.append(ep_reward) if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) + ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1) else: ma_rewards.append(ep_reward) print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") print('完成测试!') - return rewards,ma_rewards + return rewards, ma_rewards + + if __name__ == "__main__": - cfg = DQNConfig() - plot_cfg = PlotConfig() + cfg = Config() # 训练 env, agent = env_agent_config(cfg, seed=1) rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=plot_cfg.model_path) # 保存模型 + make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹 + agent.save(path=cfg.model_path) # 保存模型 save_results(rewards, ma_rewards, tag='train', - path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果 # 测试 env, agent = env_agent_config(cfg, seed=10) - agent.load(path=plot_cfg.model_path) # 导入模型 + 
agent.load(path=cfg.model_path) # 导入模型 rewards, ma_rewards = test(cfg, env, agent) - save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 + save_results(rewards, ma_rewards, tag='test', + path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task1.py b/codes/DQN/task1.py index ac9e559..078aa4c 100644 --- a/codes/DQN/task1.py +++ b/codes/DQN/task1.py @@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1): ''' env = gym.make(cfg.env_name) # 创建环境 env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态数 - n_actions = env.action_space.n # 动作数 - agent = DQN(n_states, n_actions, cfg) # 创建智能体 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 return env, agent def train(cfg, env, agent): diff --git a/codes/DQN/task2.py b/codes/DQN/task2.py index 8e2de34..16571b2 100644 --- a/codes/DQN/task2.py +++ b/codes/DQN/task2.py @@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1): # env = wrap_deepmind(env) # env = wrap_pytorch(env) env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态数 - n_actions = env.action_space.n # 动作数 - agent = DQN(n_states, n_actions, cfg) # 创建智能体 + state_dim = env.observation_space.shape[0] # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = DQN(state_dim, action_dim, cfg) # 创建智能体 return env, agent def train(cfg, env, agent): diff --git a/codes/Docs/使用DDPG解决倒立摆问题.md b/codes/Docs/使用DDPG解决倒立摆问题.md index cfcf2a9..fd625f5 100644 --- a/codes/Docs/使用DDPG解决倒立摆问题.md +++ b/codes/Docs/使用DDPG解决倒立摆问题.md @@ -6,7 +6,7 @@ image-20210915161550713 -该环境的状态数有三个,设摆针竖直方向上的顺时针旋转角为$\theta$,$\theta$设在$[-\pi,\pi]$之间,则相应的状态为$[cos\theta,sin\theta,\dot{\theta}]$,即表示角度和角速度,我们的动作则是一个-2到2之间的力矩,它是一个连续量,因而该环境不能用离散动作的算法比如 DQN 来解决。关于奖励是根据相关的物理原理而计算出的等式,如下: +该环境的状态维度有三个,设摆针竖直方向上的顺时针旋转角为$\theta$,$\theta$设在$[-\pi,\pi]$之间,则相应的状态为$[cos\theta,sin\theta,\dot{\theta}]$,即表示角度和角速度,我们的动作则是一个-2到2之间的力矩,它是一个连续量,因而该环境不能用离散动作的算法比如 DQN 来解决。关于奖励是根据相关的物理原理而计算出的等式,如下: $$ -\left(\theta^{2}+0.1 * \hat{\theta}^{2}+0.001 * \text { action }^{2}\right) $$ @@ -90,15 +90,15 @@ class OUNoise(object): self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period - self.n_actions = action_space.shape[0] + self.action_dim = action_space.shape[0] self.low = action_space.low self.high = action_space.high self.reset() def reset(self): - self.obs = np.ones(self.n_actions) * self.mu + self.obs = np.ones(self.action_dim) * self.mu def evolve_obs(self): x = self.obs - dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) + dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) self.obs = x + dx return self.obs def get_action(self, action, t=0): diff --git a/codes/Docs/使用DQN解决推车杆问题.md b/codes/Docs/使用DQN解决推车杆问题.md index a09fec7..393c52d 100644 --- a/codes/Docs/使用DQN解决推车杆问题.md +++ b/codes/Docs/使用DQN解决推车杆问题.md @@ -14,21 +14,21 @@ CartPole-v0是一个经典的入门环境,如下图,它通过向左(动作=0 import gym env = gym.make('CartPole-v0') # 建立环境 env.seed(1) # 随机种子 -n_states = env.observation_space.shape[0] # 状态数 -n_actions = env.action_space.n # 动作数 +state_dim = env.observation_space.shape[0] # 状态维度 +action_dim = env.action_space.n # 动作维度 state = env.reset() # 初始化环境 -print(f"状态数:{n_states},动作数:{n_actions}") +print(f"状态维度:{state_dim},动作维度:{action_dim}") print(f"初始状态:{state}") ``` 可以得到结果: ```bash -状态数:4,动作数:2 +状态维度:4,动作维度:2 初始状态:[ 0.03073904 
0.00145001 -0.03088818 -0.03131252] ``` -该环境状态数是四个,分别为车的位置、车的速度、杆的角度以及杆顶部的速度,动作数为两个,并且是离散的向左或者向右。理论上达到最优化算法的情况下,推车杆是一直能保持平衡的,也就是每回合的步数是无限,但是这不方便训练,所以环境内部设置了每回合的最大步数为200,也就是说理想情况下,只需要我们每回合的奖励达到200就算训练完成。 +该环境状态维度是四个,分别为车的位置、车的速度、杆的角度以及杆顶部的速度,动作维度为两个,并且是离散的向左或者向右。理论上达到最优化算法的情况下,推车杆是一直能保持平衡的,也就是每回合的步数是无限,但是这不方便训练,所以环境内部设置了每回合的最大步数为200,也就是说理想情况下,只需要我们每回合的奖励达到200就算训练完成。 ## DQN基本接口 @@ -125,7 +125,7 @@ class ReplayBuffer: class MLP(nn.Module): def __init__(self, input_dim,output_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - input_dim: 输入的特征数即环境的状态数 + input_dim: 输入的特征数即环境的状态维度 output_dim: 输出的动作维度 """ super(MLP, self).__init__() @@ -157,7 +157,7 @@ def choose_action(self, state): q_values = self.policy_net(state) action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) ``` 可以看到跟Q学习算法其实是一样的,都是用的$\epsilon-greedy$策略,只是使用神经网络的话我们需要通过Torch或者Tensorflow工具来处理相应的数据。 diff --git a/codes/Docs/使用Q-learning解决悬崖寻路问题.md b/codes/Docs/使用Q-learning解决悬崖寻路问题.md index 3480d2f..44e5b6c 100644 --- a/codes/Docs/使用Q-learning解决悬崖寻路问题.md +++ b/codes/Docs/使用Q-learning解决悬崖寻路问题.md @@ -27,21 +27,21 @@ env = gym.make('CliffWalking-v0') # 定义环境 env = CliffWalkingWapper(env) # 装饰环境 ``` -这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可,然后我们可以查看环境的状态和动作数目: +这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可,然后我们可以查看环境的状态和动作维度: ```python -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -print(f"状态数:{n_states},动作数:{n_actions}") +state_dim = env.observation_space.n # 状态维度 +action_dim = env.action_space.n # 动作维度 +print(f"状态维度:{state_dim},动作维度:{action_dim}") ``` 打印出来的结果如下: ```bash -状态数:48,动作数:4 +状态维度:48,动作维度:4 ``` -我们的状态数是48个,这里我们设置的是智能体当前所在网格的编号,而动作数是4,这表示有0,1,2,3对应着上下左右四个动作。另外我们也可以初始化环境并打印当前所在的状态: +我们的状态维度是48个,这里我们设置的是智能体当前所在网格的编号,而动作维度是4,这表示有0,1,2,3对应着上下左右四个动作。另外我们也可以初始化环境并打印当前所在的状态: ```python state = env.reset() @@ -72,9 +72,9 @@ print(state) env = gym.make('CliffWalking-v0') # 定义环境 env = CliffWalkingWapper(env) # 装饰环境 env.seed(1) # 设置随机种子 -n_states = env.observation_space.n # 状态数 -n_actions = env.action_space.n # 动作数 -agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数 +state_dim = env.observation_space.n # 状态维度 +action_dim = env.action_space.n # 动作维度 +agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数 ep_reward = 0 # 记录每个回合的奖励 state = env.reset() # 重置环境 @@ -126,7 +126,7 @@ def choose_action(self, state): if np.random.uniform(0, 1) > self.epsilon: action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作 else: - action = np.random.choice(self.n_actions) # 随机选择动作 + action = np.random.choice(self.action_dim) # 随机选择动作 return action ``` diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index 7b26fa1..e712edb 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -46,15 +46,15 @@ class ReplayBuffer: return len(self.buffer) class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): + def __init__(self, state_dim,action_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态数 - n_actions: 输出的动作维度 + state_dim: 输入的特征数即环境的状态维度 + action_dim: 输出的动作维度 """ super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + self.fc3 = 
nn.Linear(hidden_dim, action_dim) # 输出层 def forward(self, x): # 各层对应的激活函数 @@ -63,8 +63,8 @@ class MLP(nn.Module): return self.fc3(x) class DoubleDQN: - def __init__(self, n_states, n_actions, cfg): - self.n_actions = n_actions # 总的动作个数 + def __init__(self, state_dim, action_dim, cfg): + self.action_dim = action_dim # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 self.gamma = cfg.gamma # e-greedy策略相关参数 @@ -73,8 +73,8 @@ class DoubleDQN: self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay self.batch_size = cfg.batch_size - self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) - self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) # target_net copy from policy_net for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): target_param.data.copy_(param.data) @@ -103,7 +103,7 @@ class DoubleDQN: # 所以tensor.max(1)[1]返回最大值对应的下标,即action action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): diff --git a/codes/DoubleDQN/task0.py b/codes/DoubleDQN/task0.py index 4fe9579..945753a 100644 --- a/codes/DoubleDQN/task0.py +++ b/codes/DoubleDQN/task0.py @@ -61,9 +61,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DoubleDQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = DoubleDQN(state_dim,action_dim,cfg) return env,agent cfg = DoubleDQNConfig() diff --git a/codes/DuelingDQN/task0_train.ipynb b/codes/DuelingDQN/task0_train.ipynb index efa485f..7e38218 100644 --- a/codes/DuelingDQN/task0_train.ipynb +++ b/codes/DuelingDQN/task0_train.ipynb @@ -136,12 +136,12 @@ "outputs": [], "source": [ "class DuelingNet(nn.Module):\n", - " def __init__(self, n_states, n_actions,hidden_size=128):\n", + " def __init__(self, state_dim, action_dim,hidden_size=128):\n", " super(DuelingNet, self).__init__()\n", " \n", " # 隐藏层\n", " self.hidden = nn.Sequential(\n", - " nn.Linear(n_states, hidden_size),\n", + " nn.Linear(state_dim, hidden_size),\n", " nn.ReLU()\n", " )\n", " \n", @@ -149,7 +149,7 @@ " self.advantage = nn.Sequential(\n", " nn.Linear(hidden_size, hidden_size),\n", " nn.ReLU(),\n", - " nn.Linear(hidden_size, n_actions)\n", + " nn.Linear(hidden_size, action_dim)\n", " )\n", " \n", " # 价值函数\n", @@ -192,7 +192,7 @@ ], "source": [ "class DuelingDQN:\n", - " def __init__(self,n_states,n_actions,cfg) -> None:\n", + " def __init__(self,state_dim,action_dim,cfg) -> None:\n", " self.batch_size = cfg.batch_size\n", " self.device = cfg.device\n", " self.loss_history = [] # 记录loss的变化\n", @@ -200,8 +200,8 @@ " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", " (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " math.exp(-1. 
* frame_idx / cfg.epsilon_decay)\n", - " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", - " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", + " self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", " target_param.data.copy_(param.data)\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", @@ -214,7 +214,7 @@ " q_values = self.policy_net(state)\n", " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", " else:\n", - " action = random.randrange(self.n_actions)\n", + " action = random.randrange(self.action_dim)\n", " return action\n", " def update(self):\n", " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略\n", diff --git a/codes/HierarchicalDQN/agent.py b/codes/HierarchicalDQN/agent.py index 62c539a..ce0cd1f 100644 --- a/codes/HierarchicalDQN/agent.py +++ b/codes/HierarchicalDQN/agent.py @@ -42,7 +42,7 @@ class ReplayBuffer: class MLP(nn.Module): def __init__(self, input_dim,output_dim,hidden_dim=128): """ 初始化q网络,为全连接网络 - input_dim: 输入的特征数即环境的状态数 + input_dim: 输入的特征数即环境的状态维度 output_dim: 输出的动作维度 """ super(MLP, self).__init__() @@ -57,16 +57,16 @@ class MLP(nn.Module): return self.fc3(x) class HierarchicalDQN: - def __init__(self,n_states,n_actions,cfg): - self.n_states = n_states - self.n_actions = n_actions + def __init__(self,state_dim,action_dim,cfg): + self.state_dim = state_dim + self.action_dim = action_dim self.gamma = cfg.gamma self.device = cfg.device self.batch_size = cfg.batch_size self.frame_idx = 0 # 用于epsilon的衰减计数 self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay) - self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device) - self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device) + self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) self.memory = ReplayBuffer(cfg.memory_capacity) @@ -76,7 +76,7 @@ class HierarchicalDQN: self.losses = [] self.meta_losses = [] def to_onehot(self,x): - oh = np.zeros(self.n_states) + oh = np.zeros(self.state_dim) oh[x - 1] = 1. 
return oh def set_goal(self,state): @@ -85,7 +85,7 @@ class HierarchicalDQN: state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) goal = self.meta_policy_net(state).max(1)[1].item() else: - goal = random.randrange(self.n_states) + goal = random.randrange(self.state_dim) return goal def choose_action(self,state): self.frame_idx += 1 @@ -95,7 +95,7 @@ class HierarchicalDQN: q_value = self.policy_net(state) action = q_value.max(1)[1].item() else: - action = random.randrange(self.n_actions) + action = random.randrange(self.action_dim) return action def update(self): self.update_policy() diff --git a/codes/HierarchicalDQN/task0.py b/codes/HierarchicalDQN/task0.py index b2cf312..3eceefd 100644 --- a/codes/HierarchicalDQN/task0.py +++ b/codes/HierarchicalDQN/task0.py @@ -63,9 +63,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = HierarchicalDQN(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = HierarchicalDQN(state_dim,action_dim,cfg) return env,agent if __name__ == "__main__": diff --git a/codes/MonteCarlo/agent.py b/codes/MonteCarlo/agent.py index bfe6940..44af71d 100644 --- a/codes/MonteCarlo/agent.py +++ b/codes/MonteCarlo/agent.py @@ -17,11 +17,11 @@ import dill class FisrtVisitMC: ''' On-Policy First-Visit MC Control ''' - def __init__(self,n_actions,cfg): - self.n_actions = n_actions + def __init__(self,action_dim,cfg): + self.action_dim = action_dim self.epsilon = cfg.epsilon self.gamma = cfg.gamma - self.Q_table = defaultdict(lambda: np.zeros(n_actions)) + self.Q_table = defaultdict(lambda: np.zeros(action_dim)) self.returns_sum = defaultdict(float) # sum of returns self.returns_count = defaultdict(float) @@ -29,11 +29,11 @@ class FisrtVisitMC: ''' e-greed policy ''' if state in self.Q_table.keys(): best_action = np.argmax(self.Q_table[state]) - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) else: - action = np.random.randint(0,self.n_actions) + action = np.random.randint(0,self.action_dim) return action def update(self,one_ep_transition): # Find all (state, action) pairs we've visited in this one_ep_transition diff --git a/codes/MonteCarlo/task0_train.py b/codes/MonteCarlo/task0_train.py index 51858f8..dae0c95 100644 --- a/codes/MonteCarlo/task0_train.py +++ b/codes/MonteCarlo/task0_train.py @@ -43,8 +43,8 @@ class MCConfig: def env_agent_config(cfg,seed=1): env = RacetrackEnv() - n_actions = 9 - agent = FisrtVisitMC(n_actions, cfg) + action_dim = 9 + agent = FisrtVisitMC(action_dim, cfg) return env,agent def train(cfg, env, agent): diff --git a/codes/PPO/README.md b/codes/PPO/README.md index 125ef51..66825c9 100644 --- a/codes/PPO/README.md +++ b/codes/PPO/README.md @@ -57,16 +57,16 @@ model就是actor和critic两个网络了: import torch.nn as nn from torch.distributions.categorical import Categorical class Actor(nn.Module): - def __init__(self,n_states, n_actions, + def __init__(self,state_dim, action_dim, hidden_dim=256): super(Actor, self).__init__() self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, 
n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1) ) def forward(self, state): @@ -75,10 +75,10 @@ class Actor(nn.Module): return dist class Critic(nn.Module): - def __init__(self, n_states,hidden_dim=256): + def __init__(self, state_dim,hidden_dim=256): super(Critic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), @@ -88,7 +88,7 @@ class Critic(nn.Module): value = self.critic(state) return value ``` -这里Actor就是得到一个概率分布(Categorica,也可以是别的分布,可以搜索torch distributionsl),critc根据当前状态得到一个值,这里的输入维度可以是```n_states+n_actions```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。 +这里Actor就是得到一个概率分布(Categorica,也可以是别的分布,可以搜索torch distributionsl),critc根据当前状态得到一个值,这里的输入维度可以是```state_dim+action_dim```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。 ### PPO update 定义一个update函数主要实现伪代码中的第六步和第七步: diff --git a/codes/PPO/agent.py b/codes/PPO/agent.py index ebda626..0a7edd9 100644 --- a/codes/PPO/agent.py +++ b/codes/PPO/agent.py @@ -16,15 +16,15 @@ import torch.optim as optim from PPO.model import Actor,Critic from PPO.memory import PPOMemory class PPO: - def __init__(self, n_states, n_actions,cfg): + def __init__(self, state_dim, action_dim,cfg): self.gamma = cfg.gamma self.continuous = cfg.continuous self.policy_clip = cfg.policy_clip self.n_epochs = cfg.n_epochs self.gae_lambda = cfg.gae_lambda self.device = cfg.device - self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device) - self.critic = Critic(n_states,cfg.hidden_dim).to(self.device) + self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device) + self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr) self.memory = PPOMemory(cfg.batch_size) diff --git a/codes/PPO/model.py b/codes/PPO/model.py index 612ddff..fc182d5 100644 --- a/codes/PPO/model.py +++ b/codes/PPO/model.py @@ -12,16 +12,16 @@ Environment: import torch.nn as nn from torch.distributions.categorical import Categorical class Actor(nn.Module): - def __init__(self,n_states, n_actions, + def __init__(self,state_dim, action_dim, hidden_dim): super(Actor, self).__init__() self.actor = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), - nn.Linear(hidden_dim, n_actions), + nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1) ) def forward(self, state): @@ -30,10 +30,10 @@ class Actor(nn.Module): return dist class Critic(nn.Module): - def __init__(self, n_states,hidden_dim): + def __init__(self, state_dim,hidden_dim): super(Critic, self).__init__() self.critic = nn.Sequential( - nn.Linear(n_states, hidden_dim), + nn.Linear(state_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), diff --git a/codes/PPO/task0.py b/codes/PPO/task0.py index 15794ec..8e0d92a 100644 --- a/codes/PPO/task0.py +++ b/codes/PPO/task0.py @@ -45,9 +45,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = PPO(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PPO(state_dim,action_dim,cfg) return env,agent cfg = PPOConfig() diff --git a/codes/PPO/task1.py b/codes/PPO/task1.py index 00feb2f..38d9152 100644 --- 
a/codes/PPO/task1.py +++ b/codes/PPO/task1.py @@ -45,9 +45,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = PPO(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.shape[0] + agent = PPO(state_dim,action_dim,cfg) return env,agent diff --git a/codes/PPO/train.ipynb b/codes/PPO/train.ipynb index 2fe6570..b2dc91a 100644 --- a/codes/PPO/train.ipynb +++ b/codes/PPO/train.ipynb @@ -90,9 +90,9 @@ "def env_agent_config(cfg,seed=1):\n", " env = gym.make(cfg.env) \n", " env.seed(seed)\n", - " n_states = env.observation_space.shape[0]\n", - " n_actions = env.action_space.n\n", - " agent = PPO(n_states,n_actions,cfg)\n", + " state_dim = env.observation_space.shape[0]\n", + " action_dim = env.action_space.n\n", + " agent = PPO(state_dim,action_dim,cfg)\n", " return env,agent" ] }, diff --git a/codes/PPO/train.py b/codes/PPO/train.py index b97e287..e642df0 100644 --- a/codes/PPO/train.py +++ b/codes/PPO/train.py @@ -99,9 +99,9 @@ if __name__ == '__main__': def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env.seed(seed) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = PPO(n_states,n_actions,cfg) + state_dim = env.observation_space.shape[0] + action_dim = env.action_space.n + agent = PPO(state_dim,action_dim,cfg) return env,agent cfg = PPOConfig() diff --git a/codes/PolicyGradient/agent.py b/codes/PolicyGradient/agent.py index fa63ba0..8f349b5 100644 --- a/codes/PolicyGradient/agent.py +++ b/codes/PolicyGradient/agent.py @@ -17,9 +17,9 @@ from PolicyGradient.model import MLP class PolicyGradient: - def __init__(self, n_states,cfg): + def __init__(self, state_dim,cfg): self.gamma = cfg.gamma - self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim) + self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim) self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) self.batch_size = cfg.batch_size diff --git a/codes/PolicyGradient/model.py b/codes/PolicyGradient/model.py index 97d9935..6d9bc64 100644 --- a/codes/PolicyGradient/model.py +++ b/codes/PolicyGradient/model.py @@ -19,7 +19,7 @@ class MLP(nn.Module): ''' def __init__(self,input_dim,hidden_dim = 36): super(MLP, self).__init__() - # 24和36为hidden layer的层数,可根据input_dim, n_actions的情况来改变 + # 24和36为hidden layer的层数,可根据input_dim, action_dim的情况来改变 self.fc1 = nn.Linear(input_dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim,hidden_dim) self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left diff --git a/codes/PolicyGradient/task0_train.py b/codes/PolicyGradient/task0_train.py index 1025a91..b6866f0 100644 --- a/codes/PolicyGradient/task0_train.py +++ b/codes/PolicyGradient/task0_train.py @@ -46,8 +46,8 @@ class PGConfig: def env_agent_config(cfg,seed=1): env = gym.make(cfg.env) env.seed(seed) - n_states = env.observation_space.shape[0] - agent = PolicyGradient(n_states,cfg) + state_dim = env.observation_space.shape[0] + agent = PolicyGradient(state_dim,cfg) return env,agent def train(cfg,env,agent): diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py index be57831..b72de22 100644 --- a/codes/QLearning/agent.py +++ b/codes/QLearning/agent.py @@ -15,9 +15,9 @@ import torch from collections import defaultdict class QLearning(object): - def __init__(self,n_states, - n_actions,cfg): - self.n_actions = n_actions + def __init__(self,state_dim, + action_dim,cfg): + self.action_dim = action_dim 
self.lr = cfg.lr # 学习率 self.gamma = cfg.gamma self.epsilon = 0 @@ -25,7 +25,7 @@ class QLearning(object): self.epsilon_start = cfg.epsilon_start self.epsilon_end = cfg.epsilon_end self.epsilon_decay = cfg.epsilon_decay - self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表 + self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表 def choose_action(self, state): self.sample_count += 1 self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ @@ -34,7 +34,7 @@ class QLearning(object): if np.random.uniform(0, 1) > self.epsilon: action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作 else: - action = np.random.choice(self.n_actions) # 随机选择动作 + action = np.random.choice(self.action_dim) # 随机选择动作 return action def predict(self,state): action = np.argmax(self.Q_table[str(state)]) diff --git a/codes/QLearning/task0.ipynb b/codes/QLearning/task0.ipynb index a8be93b..dc447ce 100644 --- a/codes/QLearning/task0.ipynb +++ b/codes/QLearning/task0.ipynb @@ -38,9 +38,9 @@ "outputs": [], "source": [ "class QLearning(object):\n", - " def __init__(self,n_states,\n", - " n_actions,cfg):\n", - " self.n_actions = n_actions \n", + " def __init__(self,state_dim,\n", + " action_dim,cfg):\n", + " self.action_dim = action_dim \n", " self.lr = cfg.lr # 学习率\n", " self.gamma = cfg.gamma \n", " self.epsilon = 0 \n", @@ -48,7 +48,7 @@ " self.epsilon_start = cfg.epsilon_start\n", " self.epsilon_end = cfg.epsilon_end\n", " self.epsilon_decay = cfg.epsilon_decay\n", - " self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表\n", + " self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表\n", " def choose_action(self, state):\n", " self.sample_count += 1\n", " self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \\\n", @@ -57,7 +57,7 @@ " if np.random.uniform(0, 1) > self.epsilon:\n", " action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作\n", " else:\n", - " action = np.random.choice(self.n_actions) # 随机选择动作\n", + " action = np.random.choice(self.action_dim) # 随机选择动作\n", " return action\n", " def predict(self,state):\n", " action = np.argmax(self.Q_table[str(state)])\n", @@ -238,9 +238,9 @@ " env = gym.make(cfg.env_name) \n", " env = CliffWalkingWapper(env)\n", " env.seed(seed) # 设置随机种子\n", - " n_states = env.observation_space.n # 状态维度\n", - " n_actions = env.action_space.n # 动作维度\n", - " agent = QLearning(n_states,n_actions,cfg)\n", + " state_dim = env.observation_space.n # 状态维度\n", + " action_dim = env.action_space.n # 动作维度\n", + " agent = QLearning(state_dim,action_dim,cfg)\n", " return env,agent" ] }, diff --git a/codes/QLearning/task0.py b/codes/QLearning/task0.py index 3f93d08..59a1668 100644 --- a/codes/QLearning/task0.py +++ b/codes/QLearning/task0.py @@ -68,9 +68,9 @@ def env_agent_config(cfg,seed=1): env = gym.make(cfg.env_name) env = CliffWalkingWapper(env) env.seed(seed) # 设置随机种子 - n_states = env.observation_space.n # 状态维度 - n_actions = env.action_space.n # 动作维度 - agent = QLearning(n_states,n_actions,cfg) + state_dim = env.observation_space.n # 状态维度 + action_dim = env.action_space.n # 动作维度 + agent = QLearning(state_dim,action_dim,cfg) return env,agent cfg = QlearningConfig() diff --git a/codes/Sarsa/agent.py b/codes/Sarsa/agent.py index 3753381..020f6da 100644 --- a/codes/Sarsa/agent.py +++ b/codes/Sarsa/agent.py @@ -14,17 +14,17 @@ from collections import defaultdict import torch class Sarsa(object): def 
__init__(self, - n_actions,sarsa_cfg,): - self.n_actions = n_actions # number of actions + action_dim,sarsa_cfg,): + self.action_dim = action_dim # number of actions self.lr = sarsa_cfg.lr # learning rate self.gamma = sarsa_cfg.gamma self.epsilon = sarsa_cfg.epsilon - self.Q = defaultdict(lambda: np.zeros(n_actions)) - # self.Q = np.zeros((n_states, n_actions)) # Q表 + self.Q = defaultdict(lambda: np.zeros(action_dim)) + # self.Q = np.zeros((state_dim, action_dim)) # Q表 def choose_action(self, state): best_action = np.argmax(self.Q[state]) # action = best_action - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs[best_action] += (1.0 - self.epsilon) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) return action diff --git a/codes/Sarsa/task0_train.py b/codes/Sarsa/task0_train.py index 38fc598..e477afa 100644 --- a/codes/Sarsa/task0_train.py +++ b/codes/Sarsa/task0_train.py @@ -39,8 +39,8 @@ class SarsaConfig: def env_agent_config(cfg,seed=1): env = RacetrackEnv() - n_actions=9 - agent = Sarsa(n_actions,cfg) + action_dim=9 + agent = Sarsa(action_dim,cfg) return env,agent def train(cfg,env,agent): diff --git a/codes/SoftActorCritic/model.py b/codes/SoftActorCritic/model.py index ba04737..85bbfcd 100644 --- a/codes/SoftActorCritic/model.py +++ b/codes/SoftActorCritic/model.py @@ -17,10 +17,10 @@ from torch.distributions import Normal device=torch.device("cuda" if torch.cuda.is_available() else "cpu") class ValueNet(nn.Module): - def __init__(self, n_states, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, hidden_dim, init_w=3e-3): super(ValueNet, self).__init__() - self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear1 = nn.Linear(state_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) @@ -35,10 +35,10 @@ class ValueNet(nn.Module): class SoftQNet(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(SoftQNet, self).__init__() - self.linear1 = nn.Linear(n_states + n_actions, hidden_dim) + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) @@ -54,20 +54,20 @@ class SoftQNet(nn.Module): class PolicyNet(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): super(PolicyNet, self).__init__() self.log_std_min = log_std_min self.log_std_max = log_std_max - self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear1 = nn.Linear(state_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) - self.mean_linear = nn.Linear(hidden_dim, n_actions) + self.mean_linear = nn.Linear(hidden_dim, action_dim) self.mean_linear.weight.data.uniform_(-init_w, init_w) self.mean_linear.bias.data.uniform_(-init_w, init_w) - self.log_std_linear = nn.Linear(hidden_dim, n_actions) + self.log_std_linear = nn.Linear(hidden_dim, action_dim) self.log_std_linear.weight.data.uniform_(-init_w, init_w) self.log_std_linear.bias.data.uniform_(-init_w, init_w) diff --git a/codes/SoftActorCritic/sac.py b/codes/SoftActorCritic/sac.py index c67257f..d565db5 100644 --- a/codes/SoftActorCritic/sac.py +++ b/codes/SoftActorCritic/sac.py @@ -43,10 
+43,10 @@ class ReplayBuffer: return len(self.buffer) class ValueNet(nn.Module): - def __init__(self, n_states, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, hidden_dim, init_w=3e-3): super(ValueNet, self).__init__() - self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear1 = nn.Linear(state_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) @@ -61,10 +61,10 @@ class ValueNet(nn.Module): class SoftQNet(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3): super(SoftQNet, self).__init__() - self.linear1 = nn.Linear(n_states + n_actions, hidden_dim) + self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.linear3 = nn.Linear(hidden_dim, 1) @@ -80,20 +80,20 @@ class SoftQNet(nn.Module): class PolicyNet(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): + def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2): super(PolicyNet, self).__init__() self.log_std_min = log_std_min self.log_std_max = log_std_max - self.linear1 = nn.Linear(n_states, hidden_dim) + self.linear1 = nn.Linear(state_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) - self.mean_linear = nn.Linear(hidden_dim, n_actions) + self.mean_linear = nn.Linear(hidden_dim, action_dim) self.mean_linear.weight.data.uniform_(-init_w, init_w) self.mean_linear.bias.data.uniform_(-init_w, init_w) - self.log_std_linear = nn.Linear(hidden_dim, n_actions) + self.log_std_linear = nn.Linear(hidden_dim, action_dim) self.log_std_linear.weight.data.uniform_(-init_w, init_w) self.log_std_linear.bias.data.uniform_(-init_w, init_w) @@ -134,14 +134,14 @@ class PolicyNet(nn.Module): return action[0] class SAC: - def __init__(self,n_states,n_actions,cfg) -> None: + def __init__(self,state_dim,action_dim,cfg) -> None: self.batch_size = cfg.batch_size self.memory = ReplayBuffer(cfg.capacity) self.device = cfg.device - self.value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device) - self.target_value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device) - self.soft_q_net = SoftQNet(n_states, n_actions, cfg.hidden_dim).to(self.device) - self.policy_net = PolicyNet(n_states, n_actions, cfg.hidden_dim).to(self.device) + self.value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device) + self.target_value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device) + self.soft_q_net = SoftQNet(state_dim, action_dim, cfg.hidden_dim).to(self.device) + self.policy_net = PolicyNet(state_dim, action_dim, cfg.hidden_dim).to(self.device) self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr) self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr) self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr) diff --git a/codes/SoftActorCritic/task0.py b/codes/SoftActorCritic/task0.py index 668d289..e910749 100644 --- a/codes/SoftActorCritic/task0.py +++ b/codes/SoftActorCritic/task0.py @@ -63,9 +63,9 @@ class PlotConfig: def env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env_name)) env.seed(seed) - n_actions = env.action_space.shape[0] - n_states = env.observation_space.shape[0] - agent = SAC(n_states,n_actions,cfg) + action_dim = env.action_space.shape[0] + state_dim = env.observation_space.shape[0] + agent = 
SAC(state_dim,action_dim,cfg) return env,agent def train(cfg,env,agent): diff --git a/codes/SoftActorCritic/task0_train.ipynb b/codes/SoftActorCritic/task0_train.ipynb index 3be10c6..14be84e 100644 --- a/codes/SoftActorCritic/task0_train.ipynb +++ b/codes/SoftActorCritic/task0_train.ipynb @@ -70,9 +70,9 @@ "def env_agent_config(cfg,seed=1):\n", " env = NormalizedActions(gym.make(\"Pendulum-v0\"))\n", " env.seed(seed)\n", - " n_actions = env.action_space.shape[0]\n", - " n_states = env.observation_space.shape[0]\n", - " agent = SAC(n_states,n_actions,cfg)\n", + " action_dim = env.action_space.shape[0]\n", + " state_dim = env.observation_space.shape[0]\n", + " agent = SAC(state_dim,action_dim,cfg)\n", " return env,agent" ] }, @@ -159,7 +159,7 @@ "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mDeprecatedEnv\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0magent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrewards\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma_rewards\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mmake_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_actions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mn_states\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0maction_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstate_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(id, **kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mregistry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(self, path, **kwargs)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Making new env: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m 
diff --git a/codes/TD3/agent.py b/codes/TD3/agent.py
index f77a912..91939a6 100644
--- a/codes/TD3/agent.py
+++ b/codes/TD3/agent.py
@@ -21,8 +21,8 @@ class Actor(nn.Module):
         '''[summary]
         Args:
-            input_dim (int): 输入维度,这里等于n_states
-            output_dim (int): 输出维度,这里等于n_actions
+            input_dim (int): 输入维度,这里等于state_dim
+            output_dim (int): 输出维度,这里等于action_dim
             max_action (int): action的最大值
         '''
         super(Actor, self).__init__()
diff --git a/codes/TD3/memory.py b/codes/TD3/memory.py
index bcf38bb..7e2671c 100644
--- a/codes/TD3/memory.py
+++ b/codes/TD3/memory.py
@@ -14,13 +14,13 @@
 import torch
 
 class ReplayBuffer(object):
-    def __init__(self, n_states, n_actions, max_size=int(1e6)):
+    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
         self.max_size = max_size
         self.ptr = 0
         self.size = 0
-        self.state = np.zeros((max_size, n_states))
-        self.action = np.zeros((max_size, n_actions))
-        self.next_state = np.zeros((max_size, n_states))
+        self.state = np.zeros((max_size, state_dim))
+        self.action = np.zeros((max_size, action_dim))
+        self.next_state = np.zeros((max_size, state_dim))
         self.reward = np.zeros((max_size, 1))
         self.not_done = np.zeros((max_size, 1))
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
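For context, a minimal sketch (illustrative only; the add/sample method names and the tensor conversion are assumptions, not part of this patch) of how a buffer with this (max_size, state_dim/action_dim) layout is typically filled and sampled:

```python
import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size circular buffer over flat (state_dim,) and (action_dim,) arrays."""
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size, self.ptr, self.size = max_size, 0, 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, reward, next_state, done):
        # Overwrite the oldest entry once the buffer is full.
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - float(done)
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        to_t = lambda x: torch.as_tensor(x[idx], dtype=torch.float32)
        return to_t(self.state), to_t(self.action), to_t(self.reward), to_t(self.next_state), to_t(self.not_done)

buf = ReplayBuffer(state_dim=3, action_dim=1)
buf.add(np.zeros(3), np.zeros(1), 0.0, np.ones(3), False)
states, actions, rewards, next_states, not_dones = buf.sample(1)
```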
diff --git a/codes/TD3/task0_eval.py b/codes/TD3/task0_eval.py
index cb977b4..0420dce 100644
--- a/codes/TD3/task0_eval.py
+++ b/codes/TD3/task0_eval.py
@@ -74,10 +74,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    td3= TD3(n_states,n_actions,max_action,cfg)
+    td3= TD3(state_dim,action_dim,max_action,cfg)
     cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
     td3.load(cfg.model_path)
     td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed)
diff --git a/codes/TD3/task0_train.py b/codes/TD3/task0_train.py
index 58e4af9..11e2adf 100644
--- a/codes/TD3/task0_train.py
+++ b/codes/TD3/task0_train.py
@@ -72,7 +72,7 @@ def train(cfg,env,agent):
         else:
             action = (
                 agent.choose_action(np.array(state))
-                + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
+                + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
             ).clip(-max_action, max_action)
         # Perform action
         next_state, reward, done, _ = env.step(action)
@@ -121,11 +121,11 @@
 #         else:
 #             action = (
 #                 agent.choose_action(np.array(state))
-#                 + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
+#                 + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
 #             ).clip(-max_action, max_action)
 #             # action = (
 #             #     agent.choose_action(np.array(state))
-#             #     + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
+#             #     + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
 #             # ).clip(-max_action, max_action)
 #         # Perform action
 #         next_state, reward, done, _ = env.step(action)
@@ -157,10 +157,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    agent = TD3(n_states,n_actions,max_action,cfg)
+    agent = TD3(state_dim,action_dim,max_action,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     make_dir(cfg.result_path,cfg.model_path)
     agent.save(path=cfg.model_path)
diff --git a/codes/TD3/task1_eval.py b/codes/TD3/task1_eval.py
index 0d28c48..ae17681 100644
--- a/codes/TD3/task1_eval.py
+++ b/codes/TD3/task1_eval.py
@@ -70,10 +70,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    td3= TD3(n_states,n_actions,max_action,cfg)
+    td3= TD3(state_dim,action_dim,max_action,cfg)
     cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
     cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
     td3.load(cfg.model_path)
diff --git a/codes/TD3/task1_train.py b/codes/TD3/task1_train.py
index 868f686..9780f76 100644
--- a/codes/TD3/task1_train.py
+++ b/codes/TD3/task1_train.py
@@ -79,7 +79,7 @@ def train(cfg,env,agent):
         else:
             action = (
                 agent.choose_action(np.array(state))
-                + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
+                + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
             ).clip(-max_action, max_action)
         # Perform action
         next_state, reward, done, _ = env.step(action)
@@ -109,10 +109,10 @@ if __name__ == "__main__":
     env.seed(1) # 随机种子
     torch.manual_seed(1)
     np.random.seed(1)
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    agent = TD3(n_states,n_actions,max_action,cfg)
+    agent = TD3(state_dim,action_dim,max_action,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     make_dir(plot_cfg.result_path,plot_cfg.model_path)
     agent.save(path=plot_cfg.model_path)
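The TD3 training loops above add clipped Gaussian exploration noise to the deterministic policy output. A standalone sketch of that step (illustrative only; assumes a symmetric Box action space bounded by max_action, with expl_noise as the relative noise scale):

```python
import numpy as np

def noisy_action(policy_action, max_action, expl_noise=0.1, rng=np.random):
    """Gaussian exploration noise around a deterministic action, clipped to bounds."""
    action_dim = np.asarray(policy_action).shape[0]
    noise = rng.normal(0, max_action * expl_noise, size=action_dim)
    return np.clip(policy_action + noise, -max_action, max_action)

# e.g. a Pendulum-like action space: action_dim = 1, max_action = 2.0
print(noisy_action(np.array([1.5]), max_action=2.0))
```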
diff --git a/codes/common/model.py b/codes/common/model.py
index 1518df0..4ab0b8b 100644
--- a/codes/common/model.py
+++ b/codes/common/model.py
@@ -17,7 +17,7 @@ from torch.distributions import Categorical
 class MLP(nn.Module):
     def __init__(self, input_dim,output_dim,hidden_dim=128):
         """ 初始化q网络,为全连接网络
-            input_dim: 输入的特征数即环境的状态数
+            input_dim: 输入的特征数即环境的状态维度
             output_dim: 输出的动作维度
         """
         super(MLP, self).__init__()
@@ -32,10 +32,10 @@
         return self.fc3(x)
 class Critic(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Critic, self).__init__()
-        self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
+        self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
         self.linear3 = nn.Linear(hidden_size, 1)
         # 随机初始化为较小的值
@@ -51,11 +51,11 @@ class Critic(nn.Module):
         return x
 class Actor(nn.Module):
-    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
         super(Actor, self).__init__()
         self.linear1 = nn.Linear(n_obs, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
-        self.linear3 = nn.Linear(hidden_size, n_actions)
+        self.linear3 = nn.Linear(hidden_size, action_dim)
         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
         return x
 class ActorCritic(nn.Module):
-    def __init__(self, n_states, n_actions, hidden_dim=256):
+    def __init__(self, state_dim, action_dim, hidden_dim=256):
         super(ActorCritic, self).__init__()
         self.critic = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim, 1)
         )
         self.actor = nn.Sequential(
-            nn.Linear(n_states, hidden_dim),
+            nn.Linear(state_dim, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_dim, n_actions),
+            nn.Linear(hidden_dim, action_dim),
             nn.Softmax(dim=1),
         )
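The shared ActorCritic above pairs a softmax actor head with a scalar critic head. A usage sketch (illustrative only; the forward method returning a Categorical distribution plus a value is an assumption here, not part of the patch):

```python
import torch
import torch.nn as nn
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.critic = nn.Sequential(nn.Linear(state_dim, hidden_dim), nn.ReLU(),
                                    nn.Linear(hidden_dim, 1))
        self.actor = nn.Sequential(nn.Linear(state_dim, hidden_dim), nn.ReLU(),
                                   nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=1))

    def forward(self, state):
        # Actor head gives action probabilities, critic head gives V(s).
        return Categorical(self.actor(state)), self.critic(state)

model = ActorCritic(state_dim=4, action_dim=2)
state = torch.randn(1, 4)              # batch of one CartPole-like state
dist, value = model(state)
action = dist.sample()                 # sampled action index
log_prob = dist.log_prob(action)       # used in the policy-gradient loss
print(action.item(), log_prob.item(), value.item())
```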
diff --git a/codes/envs/blackjack.py b/codes/envs/blackjack.py
index 87f02d2..6946895 100644
--- a/codes/envs/blackjack.py
+++ b/codes/envs/blackjack.py
@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
         self.natural = natural
         # Start the first game
         self._reset()
         # Number of
-        self.n_actions = 2
+        self.action_dim = 2
     def reset(self):
         return self._reset()
diff --git a/codes/envs/cliff_walking.py b/codes/envs/cliff_walking.py
index 05b9b2e..73e33c7 100644
--- a/codes/envs/cliff_walking.py
+++ b/codes/envs/cliff_walking.py
@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         self.shape = (4, 12)
 
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
 
         # Cliff Location
         self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
 
-        super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
+        super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
 
     def render(self, mode='human', close=False):
         self._render(mode, close)
diff --git a/codes/envs/gridworld.py b/codes/envs/gridworld.py
index cf3aec2..c4fd512 100644
--- a/codes/envs/gridworld.py
+++ b/codes/envs/gridworld.py
@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         self.shape = shape
 
         nS = np.prod(shape)
-        n_actions = 4
+        action_dim = 4
 
         MAX_Y = shape[0]
         MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
             y, x = it.multi_index
 
             # P[s][a] = (prob, next_state, reward, is_done)
-            P[s] = {a : [] for a in range(n_actions)}
+            P[s] = {a : [] for a in range(action_dim)}
 
             is_done = lambda s: s == 0 or s == (nS - 1)
             reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
         # This should not be used in any model-free learning algorithm
         self.P = P
 
-        super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
 
     def _render(self, mode='human', close=False):
         """ Renders the current gridworld layout
diff --git a/codes/envs/stochastic_mdp.py b/codes/envs/stochastic_mdp.py
index 3c1ad4d..5770fa5 100644
--- a/codes/envs/stochastic_mdp.py
+++ b/codes/envs/stochastic_mdp.py
@@ -17,31 +17,31 @@ class StochasticMDP:
     def __init__(self):
         self.end = False
         self.curr_state = 2
-        self.n_actions = 2
-        self.n_states = 6
+        self.action_dim = 2
+        self.state_dim = 6
         self.p_right = 0.5
 
     def reset(self):
         self.end = False
         self.curr_state = 2
-        state = np.zeros(self.n_states)
+        state = np.zeros(self.state_dim)
         state[self.curr_state - 1] = 1.
         return state
 
     def step(self, action):
         if self.curr_state != 1:
             if action == 1:
-                if random.random() < self.p_right and self.curr_state < self.n_states:
+                if random.random() < self.p_right and self.curr_state < self.state_dim:
                     self.curr_state += 1
                 else:
                     self.curr_state -= 1
             if action == 0:
                 self.curr_state -= 1
 
-            if self.curr_state == self.n_states:
+            if self.curr_state == self.state_dim:
                 self.end = True
 
-            state = np.zeros(self.n_states)
+            state = np.zeros(self.state_dim)
             state[self.curr_state - 1] = 1.
 
             if self.curr_state == 1:
diff --git a/codes/envs/windy_gridworld.py b/codes/envs/windy_gridworld.py
index 2a9d4a4..ac9c66a 100644
--- a/codes/envs/windy_gridworld.py
+++ b/codes/envs/windy_gridworld.py
@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         self.shape = (7, 10)
 
         nS = np.prod(self.shape)
-        n_actions = 4
+        action_dim = 4
 
         # Wind strength
         winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         P = {}
         for s in range(nS):
             position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(n_actions) }
+            P[s] = { a : [] for a in range(action_dim) }
             P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
             P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
             P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
         isd = np.zeros(nS)
         isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
 
-        super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
+        super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
 
     def render(self, mode='human', close=False):
         self._render(mode, close)
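These tabular envs all expose the model as P[s][a] = [(prob, next_state, reward, done), ...]. A minimal sketch (illustrative only, not part of the patch) of how that structure is typically consumed, here as one sweep of iterative policy evaluation:

```python
import numpy as np

def policy_evaluation_sweep(P, nS, action_dim, V, policy, gamma=1.0):
    """One synchronous sweep of V(s) = sum_a pi(a|s) sum_{s'} p * (r + gamma * V(s'))."""
    new_V = np.zeros(nS)
    for s in range(nS):
        for a in range(action_dim):
            for prob, next_state, reward, done in P[s][a]:
                new_V[s] += policy[s][a] * prob * (reward + gamma * V[next_state] * (not done))
    return new_V

# e.g. with GridworldEnv and a uniform random policy over its 4 actions:
# env = GridworldEnv(); V = np.zeros(env.nS)
# policy = np.ones((env.nS, 4)) / 4
# V = policy_evaluation_sweep(env.P, env.nS, 4, V, policy)
```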