update codes

This commit is contained in:
johnjim0816
2021-12-22 16:55:09 +08:00
parent 75df999258
commit 41fb561d25
75 changed files with 1248 additions and 918 deletions

View File

@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
class A2C:
''' A2C algorithm
'''
def __init__(self,state_dim,action_dim,cfg) -> None:
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = cfg.device
self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def compute_returns(self,next_value, rewards, masks):
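The diff cuts compute_returns off here; for context, a minimal sketch of the usual bootstrapped discounted-return computation (assuming rewards and masks are lists of per-step tensors, with masks[t] = 0 where the episode ended) looks like:

```python
def compute_returns(self, next_value, rewards, masks):
    # bootstrap from the critic's estimate of the state after the last step
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + self.gamma * R * masks[step]  # G_t = r_t + gamma * G_{t+1}
        returns.insert(0, R)
    return returns
```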

View File

@@ -74,9 +74,9 @@ def train(cfg,envs):
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
env = gym.make(cfg.env_name) # a single env
env.seed(10)
state_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.n
model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
n_states = envs.observation_space.shape[0]
n_actions = envs.action_space.n
model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
optimizer = optim.Adam(model.parameters())
frame_idx = 0
test_rewards = []

View File

@@ -39,15 +39,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):
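get_action is likewise truncated by the diff; a sketch of the conventional completion, which adds the evolved Ornstein-Uhlenbeck noise to the action and anneals sigma from max_sigma to min_sigma over decay_period, is:

```python
def get_action(self, action, t=0):
    ou_obs = self.evolve_obs()  # advance the OU process one step
    # anneal the noise scale linearly over the decay period
    self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
    return np.clip(action + ou_obs, self.low, self.high)  # keep the noisy action within bounds
```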

View File

@@ -50,15 +50,15 @@ import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, state_dim=4, action_dim=18):
def __init__(self, n_states=4, n_actions=18):
""" 初始化q网络为全连接网络
state_dim: 输入的feature即环境的state数目
action_dim: 输出的action总个数
n_states: 输入的feature即环境的state数目
n_actions: 输出的action总个数
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(state_dim, 128) # input layer
self.fc1 = nn.Linear(n_states, 128) # input layer
self.fc2 = nn.Linear(128, 128) # hidden layer
self.fc3 = nn.Linear(128, action_dim) # output layer
self.fc3 = nn.Linear(128, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
@@ -66,7 +66,7 @@ class FCN(nn.Module):
x = F.relu(self.fc2(x))
return self.fc3(x)
```
The input size is state_dim and the output size is action_dim, with one 128-unit hidden layer; the width and number of hidden layers can be increased as needed, and relu is the typical activation. This is the same network setup as in ordinary deep learning.
The input size is n_states and the output size is n_actions, with one 128-unit hidden layer; the width and number of hidden layers can be increased as needed, and relu is the typical activation. This is the same network setup as in ordinary deep learning.
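As a quick sanity check (a hypothetical snippet, not part of the original text), the network can be instantiated and run on a batch of CartPole-like states:

```python
import torch

net = FCN(n_states=4, n_actions=2)  # CartPole: 4 state features, 2 actions
states = torch.randn(32, 4)         # a batch of 32 random states
q_values = net(states)              # shape (32, 2): one Q value per action
print(q_values.shape)
```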
### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
Build the two networks, along with the optimizer and the replay memory, inside the class:
```python
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
if random.random() > self.epsilon(self.frame_idx):
action = self.predict(state)
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
```

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
LastEditTime: 2021-09-15 13:35:36
LastEditTime: 2021-12-22 14:01:37
@Description:
@Environment: python 3.7.7
'''
@@ -21,15 +21,15 @@ import math
import numpy as np
class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128):
def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态数
action_dim: 输出的动作维度
n_states: 输入的特征数即环境的状态数
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # input layer
self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
self.fc3 = nn.Linear(hidden_dim, action_dim) # output layer
self.fc3 = nn.Linear(hidden_dim, n_actions) # output layer
def forward(self, x):
# activation functions for each layer
@@ -62,9 +62,9 @@ class ReplayBuffer:
return len(self.buffer)
class DQN:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.action_dim = action_dim # total number of actions
self.n_actions = n_actions # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # reward discount factor
# epsilon-greedy policy parameters
@@ -73,8 +73,8 @@ class DQN:
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy parameters to the target network
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # optimizer
@@ -90,7 +90,7 @@ class DQN:
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # skip the update until the replay memory holds a full batch

133
codes/DQN/dqn_cnn.py Normal file
View File

@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import numpy as np
import random
import math
class CNN(nn.Module):
def __init__(self, input_dim, output_dim):
super(CNN, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.features = nn.Sequential(
nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
nn.ReLU(),
nn.Conv2d(32, 64, kernel_size=4, stride=2),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, stride=1),
nn.ReLU()
)
self.fc = nn.Sequential(
nn.Linear(self.feature_size(), 512),
nn.ReLU(),
nn.Linear(512, self.output_dim)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def feature_size(self):
# infer the flattened size of the conv output by passing a dummy zero input through the conv stack
return self.features(torch.zeros(1, *self.input_dim)).view(1, -1).size(1)
def act(self, state, epsilon):
if random.random() > epsilon:
with torch.no_grad(): # replaces the deprecated Variable(..., volatile=True)
state = torch.FloatTensor(np.float32(state)).unsqueeze(0)
q_value = self.forward(state)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.output_dim) # was env.action_space.n, which is undefined in this scope
return action
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # the buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: when capacity is exceeded, the oldest stored transitions are dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # total number of actions
self.device = cfg.device # device: cpu or gpu
self.gamma = cfg.gamma # reward discount factor
# epsilon-greedy policy parameters
self.frame_idx = 0 # frame counter for epsilon decay
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy parameters to the target network
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # optimizer
self.memory = ReplayBuffer(cfg.memory_capacity) # experience replay
def choose_action(self, state):
''' Select an action
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # skip the update until the replay memory holds a full batch
return
# randomly sample a batch of transitions from the replay memory
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# convert to tensors
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # compute Q(s_t, a) for the actions actually taken
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # compute the target net's max Q value for the next states
# compute the expected Q values; for terminal states done_batch=1, so expected_q_value reduces to the reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # mean squared error loss
# optimize and update the model
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)

View File

@@ -9,11 +9,10 @@ import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from DQN.agent import DQN
from DQN.train import train,test
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = "DQN" # algorithm name
algo_name = 'DQN' # algorithm name
env_name = 'CartPole-v0' # environment name
class DQNConfig:
@@ -51,25 +50,82 @@ def env_agent_config(cfg, seed=1):
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim, action_dim, cfg) # create the agent
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

View File

@@ -1,3 +1,13 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 11:40:44
Description: train CartPole-v1 with Nature DQN
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
@@ -9,9 +19,7 @@ import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from DQN.agent import DQN
from DQN.train import train,test
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = "DQN" # algorithm name
@@ -58,26 +66,83 @@ def env_agent_config(cfg, seed=1):
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim, action_dim, cfg) # create the agent
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # save the results
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # save the results
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # plot the results
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

150
codes/DQN/task2.py Normal file
View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:27:48
Description: train PongNoFrameskip-v4 with DQN-cnn
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from common.atari_wrappers import make_atari, wrap_deepmind
from DQN.dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
algo_name = 'DQN-cnn' # algorithm name
env_name = 'PongNoFrameskip-v4' # environment name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # detect GPU
class DQNConfig:
''' Algorithm hyperparameter settings
'''
def __init__(self):
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = device # detect GPU
self.train_eps = 500 # number of training episodes
self.test_eps = 30 # number of testing episodes
# hyperparameters
self.gamma = 0.95 # discount factor in RL
self.epsilon_start = 0.90 # initial epsilon of the e-greedy policy
self.epsilon_end = 0.01 # final epsilon of the e-greedy policy
self.epsilon_decay = 500 # decay rate of epsilon in the e-greedy policy
self.lr = 0.0001 # learning rate
self.memory_capacity = 100000 # capacity of the replay buffer
self.batch_size = 64 # batch size for mini-batch SGD
self.target_update = 4 # update frequency of the target network
self.hidden_dim = 256 # hidden layer size of the network
class PlotConfig:
''' Plotting settings
'''
def __init__(self) -> None:
self.algo_name = algo_name # algorithm name
self.env_name = env_name # environment name
self.device = device # detect GPU
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # path for saving results
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # path for saving models
self.save = True # whether to save figures
def env_agent_config(cfg, seed=1):
''' Create the environment and the agent
'''
env = make_atari(cfg.env_name) # create the environment
# env = wrap_deepmind(env)
# env = wrap_pytorch(env)
env.seed(seed) # set the random seed
n_states = env.observation_space.shape # full observation shape; the CNN indexes input_dim[0], so it needs (channels, height, width) rather than shape[0]
n_actions = env.action_space.n # number of actions
agent = DQN(n_states, n_actions, cfg) # create the agent
return env, agent
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = DQNConfig()
plot_cfg = PlotConfig()
# 训练
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=plot_cfg.result_path) # 保存结果
plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

File diff suppressed because one or more lines are too long

View File

@@ -1,138 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
LastEditTime: 2021-12-22 11:08:04
@Description:
@Environment: python 3.7.7
'''
def train(cfg, env, agent):
''' Train
'''
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
agent.memory.push(state, action, reward, next_state, done) # store the transition
state = next_state # move to the next state
agent.update() # update the agent
ep_reward += reward # accumulate the reward
if done:
break
if (i_ep+1) % cfg.target_update == 0: # update the agent's target network
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
print('Training finished!')
return rewards, ma_rewards
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
# testing needs no epsilon-greedy exploration, so the epsilon values are set to 0
cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
rewards = [] # rewards of all episodes
ma_rewards = [] # moving-average rewards of all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward within one episode
state = env.reset() # reset the environment and get the initial state
while True:
action = agent.choose_action(state) # select an action
next_state, reward, done, _ = env.step(action) # step the environment and get the transition
state = next_state # move to the next state
ep_reward += reward # accumulate the reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Testing finished!')
return rewards,ma_rewards
if __name__ == "__main__":
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the parent path to sys.path
import gym
import torch
import datetime
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from DQN.agent import DQN
from DQN.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
class DQNConfig:
def __init__(self):
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
# 超参数
self.gamma = 0.95 # 强化学习中的折扣因子
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
self.lr = 0.0001 # 学习率
self.memory_capacity = 100000 # 经验回放的容量
self.batch_size = 64 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg,seed=1):
''' Create the environment and the agent
'''
env = gym.make(cfg.env_name) # create the environment
env.seed(seed) # set the random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
agent = DQN(state_dim,action_dim,cfg) # create the agent
return env,agent
cfg = DQNConfig()
plot_cfg = PlotConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # create folders for saving results and models
agent.save(path=plot_cfg.model_path) # save the model
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot the results
# test
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path) # load the model
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=plot_cfg.result_path) # save the results
plot_rewards(rewards,ma_rewards, plot_cfg, tag="test") # plot the results

View File

@@ -90,15 +90,15 @@ class OUNoise(object):
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.action_dim = action_space.shape[0]
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.action_dim) * self.mu
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):

View File

@@ -14,10 +14,10 @@ CartPole-v0 is a classic introductory environment; as shown below, it moves the cart via left (action=0
import gym
env = gym.make('CartPole-v0') # create the environment
env.seed(1) # random seed
state_dim = env.observation_space.shape[0] # state dimension
action_dim = env.action_space.n # number of actions
n_states = env.observation_space.shape[0] # state dimension
n_actions = env.action_space.n # number of actions
state = env.reset() # reset the environment
print(f"Number of states: {state_dim}, number of actions: {action_dim}")
print(f"Number of states: {n_states}, number of actions: {n_actions}")
print(f"Initial state: {state}")
```
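To see the full interaction loop, here is an illustrative rollout with a random policy standing in for an agent (using the old 4-tuple gym step API that this repo targets):

```python
done = False
ep_reward = 0
while not done:
    action = env.action_space.sample()          # random policy, for illustration only
    state, reward, done, _ = env.step(action)   # old gym API: (obs, reward, done, info)
    ep_reward += reward
print(f"Episode reward: {ep_reward}")
```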
@@ -157,7 +157,7 @@ def choose_action(self, state):
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # pick the action with the highest Q value
else:
action = random.randrange(self.action_dim)
action = random.randrange(self.n_actions)
```
As you can see, this is essentially the same as the Q-learning algorithm, using the same $\epsilon$-greedy policy; the only difference is that with a neural network we need tools like Torch or TensorFlow to handle the corresponding data.
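For reference, the exponential epsilon schedule shared by these agents can be previewed in isolation (a standalone sketch using the repo's default hyperparameters):

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.90, 0.01, 500
epsilon = lambda frame_idx: epsilon_end + \
    (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)
for frame_idx in [0, 100, 500, 2000]:
    print(frame_idx, round(epsilon(frame_idx), 3))  # decays from ~0.90 toward 0.01
```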

View File

@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # wrap the environment
Here the program uses a wrapper to redefine the environment; this does not change how you should think about the environment, and interested readers can inspect the corresponding code. Because gym environments are well encapsulated, using one only takes a gym.make call with the environment name, after which we can check the numbers of states and actions:
```python
state_dim = env.observation_space.n # number of states
action_dim = env.action_space.n # number of actions
print(f"Number of states: {state_dim}, number of actions: {action_dim}")
n_states = env.observation_space.n # number of states
n_actions = env.action_space.n # number of actions
print(f"Number of states: {n_states}, number of actions: {n_actions}")
```
The printed output is as follows:
@@ -72,9 +72,9 @@ print(state)
env = gym.make('CliffWalking-v0') # create the environment
env = CliffWalkingWapper(env) # wrap the environment
env.seed(1) # set the random seed
state_dim = env.observation_space.n # number of states
action_dim = env.action_space.n # number of actions
agent = QLearning(state_dim,action_dim,cfg) # cfg stores the algorithm hyperparameters
n_states = env.observation_space.n # number of states
n_actions = env.action_space.n # number of actions
agent = QLearning(n_states,n_actions,cfg) # cfg stores the algorithm hyperparameters
for i_ep in range(cfg.train_eps): # cfg.train_eps is the maximum number of training episodes
ep_reward = 0 # reward of each episode
state = env.reset() # reset the environment
@@ -126,7 +126,7 @@ def choose_action(self, state):
if np.random.uniform(0, 1) > self.epsilon:
action = np.argmax(self.Q_table[str(state)]) # pick the action with the largest Q(s,a)
else:
action = np.random.choice(self.action_dim) # randomly select an action
action = np.random.choice(self.n_actions) # randomly select an action
return action
```

View File

@@ -136,12 +136,12 @@
"outputs": [],
"source": [
"class DuelingNet(nn.Module):\n",
" def __init__(self, state_dim, action_dim,hidden_size=128):\n",
" def __init__(self, n_states, n_actions,hidden_size=128):\n",
" super(DuelingNet, self).__init__()\n",
" \n",
" # 隐藏层\n",
" self.hidden = nn.Sequential(\n",
" nn.Linear(state_dim, hidden_size),\n",
" nn.Linear(n_states, hidden_size),\n",
" nn.ReLU()\n",
" )\n",
" \n",
@@ -149,7 +149,7 @@
" self.advantage = nn.Sequential(\n",
" nn.Linear(hidden_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Linear(hidden_size, action_dim)\n",
" nn.Linear(hidden_size, n_actions)\n",
" )\n",
" \n",
" # 价值函数\n",
@@ -192,7 +192,7 @@
],
"source": [
"class DuelingDQN:\n",
" def __init__(self,state_dim,action_dim,cfg) -> None:\n",
" def __init__(self,n_states,n_actions,cfg) -> None:\n",
" self.batch_size = cfg.batch_size\n",
" self.device = cfg.device\n",
" self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
" target_param.data.copy_(param.data)\n",
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
" q_values = self.policy_net(state)\n",
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
" else:\n",
" action = random.randrange(self.action_dim)\n",
" action = random.randrange(self.n_actions)\n",
" return action\n",
" def update(self):\n",
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n",

5
codes/Logs.md Normal file
View File

@@ -0,0 +1,5 @@
## Changelog of updates
**2021.12.22-3**: renamed ```agent.py``` to the corresponding algorithm name, making cases like ```dqn``` vs. ```dqn_cnn``` easier to tell apart
**2021.12.22-2**: simplified the code structure by merging the former ```train.py```, ```task.py```, etc. into ```task.py```
**2021.12.22-1**: simplified the code structure by merging the former ```model.py```, ```memory.py```, etc. into ```agent.py```, and moving the contents of ```plot.py``` into ```common.utils.py```

View File

@@ -17,11 +17,11 @@ import dill
class FisrtVisitMC:
''' On-Policy First-Visit MC Control
'''
def __init__(self,action_dim,cfg):
self.action_dim = action_dim
def __init__(self,n_actions,cfg):
self.n_actions = n_actions
self.epsilon = cfg.epsilon
self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(action_dim))
self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns
self.returns_count = defaultdict(float)
@@ -29,11 +29,11 @@ class FisrtVisitMC:
''' e-greedy policy '''
if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else:
action = np.random.randint(0,self.action_dim)
action = np.random.randint(0,self.n_actions)
return action
def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition

View File

@@ -43,8 +43,8 @@ class MCConfig:
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
action_dim = 9
agent = FisrtVisitMC(action_dim, cfg)
n_actions = 9
agent = FisrtVisitMC(n_actions, cfg)
return env,agent
def train(cfg, env, agent):

View File

@@ -0,0 +1,52 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class NoisyLinear(nn.Module):
def __init__(self, input_dim, output_dim, std_init=0.4):
super(NoisyLinear, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.std_init = std_init
self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim))
self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim))
self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim))
self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim))
self.reset_parameters()
self.reset_noise()
def forward(self, x):
if self.training:
weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon)
bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon) # deprecated Variable wrapper removed
else:
weight = self.weight_mu
bias = self.bias_mu
return F.linear(x, weight, bias)
def reset_parameters(self):
mu_range = 1 / math.sqrt(self.weight_mu.size(1))
self.weight_mu.data.uniform_(-mu_range, mu_range)
self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))
self.bias_mu.data.uniform_(-mu_range, mu_range)
self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))
def reset_noise(self):
epsilon_in = self._scale_noise(self.input_dim)
epsilon_out = self._scale_noise(self.output_dim)
self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))
self.bias_epsilon.copy_(self._scale_noise(self.output_dim))
def _scale_noise(self, size):
x = torch.randn(size)
x = x.sign().mul(x.abs().sqrt())
return x
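NoisyLinear is a drop-in replacement for nn.Linear in the style of Noisy Networks for Exploration; a hypothetical usage sketch (not part of this commit) would resample the factorised noise between updates:

```python
layer = NoisyLinear(128, 4)  # e.g. the output head of a Q-network
x = torch.randn(32, 128)
q = layer(x)                 # training mode: sampled noisy weights drive exploration
layer.reset_noise()          # resample the factorised noise between updates
layer.eval()
q_det = layer(x)             # eval mode: deterministic mu weights only
```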

View File

@@ -57,16 +57,16 @@ the model consists of two networks, the actor and the critic
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
def __init__(self,n_states, n_actions,
hidden_dim=256):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim=256):
def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
value = self.critic(state)
return value
```
Here the Actor produces a probability distribution (Categorical; other distributions work too, see torch.distributions), while the critic maps the current state to a value. The critic's input dimension could also be ```state_dim+action_dim```, i.e., feeding the action information into the critic network as well, which tends to work better; interested readers can try it.
Here the Actor produces a probability distribution (Categorical; other distributions work too, see torch.distributions), while the critic maps the current state to a value. The critic's input dimension could also be ```n_states+n_actions```, i.e., feeding the action information into the critic network as well, which tends to work better; interested readers can try it.
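A sketch of that variant (a hypothetical Q-style critic, not the one used in this commit, assuming `import torch` alongside the `torch.nn` import above) concatenates the action into the input:

```python
class QCritic(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super(QCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states + n_actions, hidden_dim),  # state and action concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, state, action):
        return self.critic(torch.cat([state, action], dim=-1))
```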
### PPO update
Define an update function, which mainly implements steps 6 and 7 of the pseudocode:
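For orientation, here is a condensed sketch of those two steps, the clipped surrogate policy loss and the value regression. It is illustrative only: the repo's actual update also uses GAE and minibatching, and the names states, actions, old_log_probs, advantages, and returns are assumed here:

```python
dist = self.actor(states)                      # new policy distribution
new_log_probs = dist.log_prob(actions)
ratio = (new_log_probs - old_log_probs).exp()  # pi_new(a|s) / pi_old(a|s)
surr1 = ratio * advantages
surr2 = torch.clamp(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * advantages
actor_loss = -torch.min(surr1, surr2).mean()   # step 6: clipped surrogate objective
critic_loss = (returns - self.critic(states)).pow(2).mean()  # step 7: value regression
```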

View File

@@ -16,15 +16,15 @@ import torch.optim as optim
from PPO.model import Actor,Critic
from PPO.memory import PPOMemory
class PPO:
def __init__(self, state_dim, action_dim,cfg):
def __init__(self, n_states, n_actions,cfg):
self.gamma = cfg.gamma
self.continuous = cfg.continuous
self.policy_clip = cfg.policy_clip
self.n_epochs = cfg.n_epochs
self.gae_lambda = cfg.gae_lambda
self.device = cfg.device
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
self.memory = PPOMemory(cfg.batch_size)

View File

@@ -12,16 +12,16 @@ Environment:
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
def __init__(self,n_states, n_actions,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
@@ -30,10 +30,10 @@ class Actor(nn.Module):
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim):
def __init__(self, n_states,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),

View File

@@ -45,9 +45,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = PPO(n_states,n_actions,cfg)
return env,agent
cfg = PPOConfig()

View File

@@ -45,9 +45,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = PPO(n_states,n_actions,cfg)
return env,agent

View File

@@ -90,9 +90,9 @@
"def env_agent_config(cfg,seed=1):\n",
" env = gym.make(cfg.env) \n",
" env.seed(seed)\n",
" state_dim = env.observation_space.shape[0]\n",
" action_dim = env.action_space.n\n",
" agent = PPO(state_dim,action_dim,cfg)\n",
" n_states = env.observation_space.shape[0]\n",
" n_actions = env.action_space.n\n",
" agent = PPO(n_states,n_actions,cfg)\n",
" return env,agent"
]
},

View File

@@ -99,9 +99,9 @@ if __name__ == '__main__':
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = PPO(n_states,n_actions,cfg)
return env,agent
cfg = PPOConfig()

View File

@@ -17,9 +17,9 @@ from PolicyGradient.model import MLP
class PolicyGradient:
def __init__(self, state_dim,cfg):
def __init__(self, n_states,cfg):
self.gamma = cfg.gamma
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size

View File

@@ -19,7 +19,7 @@ class MLP(nn.Module):
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24 and 36 are hidden layer sizes; adjust them based on input_dim and action_dim
# 24 and 36 are hidden layer sizes; adjust them based on input_dim and n_actions
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left

View File

@@ -46,8 +46,8 @@ class PGConfig:
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
agent = PolicyGradient(state_dim,cfg)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):

View File

@@ -16,7 +16,7 @@
**Note: in the new version, everything related to ```model``` and ```memory``` has been moved into ```agent.py```, and ```plot``` into ```common.utils```.**
## Requirements
python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
python 3.7、pytorch 1.6.0-1.8.1、gym 0.21.0
## Usage
@@ -36,7 +36,7 @@ python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
| [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
| [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
| [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
| [SAC](./SAC) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2]((./envs/mujoco_info.md)) | |

View File

@@ -1,110 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:53:54
LastEditor: JiangJi
LastEditTime: 2021-04-29 13:56:39
Description:
Environment:
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from common.memory import ReplayBuffer
from SAC.model import ValueNet,PolicyNet,SoftQNet
class SAC:
def __init__(self,state_dim,action_dim,cfg) -> None:
self.batch_size = cfg.batch_size
self.memory = ReplayBuffer(cfg.capacity)
self.device = cfg.device
self.value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
self.target_value_net = ValueNet(state_dim, cfg.hidden_dim).to(self.device)
self.soft_q_net = SoftQNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
self.policy_net = PolicyNet(state_dim, action_dim, cfg.hidden_dim).to(self.device)
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(param.data)
self.value_criterion = nn.MSELoss()
self.soft_q_criterion = nn.MSELoss()
def update(self, gamma=0.99,mean_lambda=1e-3,
std_lambda=1e-3,
z_lambda=0.0,
soft_tau=1e-2,
):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(self.batch_size)
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
expected_q_value = self.soft_q_net(state, action)
expected_value = self.value_net(state)
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
target_value = self.target_value_net(next_state)
next_q_value = reward + (1 - done) * gamma * target_value
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
expected_new_q_value = self.soft_q_net(state, new_action)
next_value = expected_new_q_value - log_prob
value_loss = self.value_criterion(expected_value, next_value.detach())
log_prob_target = expected_new_q_value - expected_value
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
mean_loss = mean_lambda * mean.pow(2).mean()
std_loss = std_lambda * log_std.pow(2).mean()
z_loss = z_lambda * z.pow(2).sum(1).mean()
policy_loss += mean_loss + std_loss + z_loss
self.soft_q_optimizer.zero_grad()
q_value_loss.backward()
self.soft_q_optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
self.value_optimizer.step()
self.policy_optimizer.zero_grad()
policy_loss.backward()
self.policy_optimizer.step()
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - soft_tau) + param.data * soft_tau
)
def save(self, path):
torch.save(self.value_net.state_dict(), path + "sac_value")
torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")
torch.save(self.policy_net.state_dict(), path + "sac_policy")
torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")
def load(self, path):
self.value_net.load_state_dict(torch.load(path + "sac_value"))
self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
self.target_value_net = copy.deepcopy(self.value_net)
self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))
self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))

Binary file not shown.

Binary file not shown.
View File

@@ -14,17 +14,17 @@ from collections import defaultdict
import torch
class Sarsa(object):
def __init__(self,
action_dim,sarsa_cfg,):
self.action_dim = action_dim # number of actions
n_actions,sarsa_cfg,):
self.n_actions = n_actions # number of actions
self.lr = sarsa_cfg.lr # learning rate
self.gamma = sarsa_cfg.gamma
self.epsilon = sarsa_cfg.epsilon
self.Q = defaultdict(lambda: np.zeros(action_dim))
# self.Q = np.zeros((state_dim, action_dim)) # Q table
self.Q = defaultdict(lambda: np.zeros(n_actions))
# self.Q = np.zeros((n_states, n_actions)) # Q table
def choose_action(self, state):
best_action = np.argmax(self.Q[state])
# action = best_action
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
return action

View File

@@ -39,8 +39,8 @@ class SarsaConfig:
def env_agent_config(cfg,seed=1):
env = RacetrackEnv()
action_dim=9
agent = Sarsa(action_dim,cfg)
n_actions=9
agent = Sarsa(n_actions,cfg)
return env,agent
def train(cfg,env,agent):

View File

@@ -5,12 +5,13 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:52:11
LastEditor: JiangJi
LastEditTime: 2021-04-29 12:52:31
LastEditTime: 2021-12-22 15:36:36
Description:
Environment:
'''
import gym
import numpy as np
class NormalizedActions(gym.ActionWrapper):
def action(self, action):
low = self.action_space.low

View File

@@ -17,10 +17,10 @@ from torch.distributions import Normal
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ValueNet(nn.Module):
def __init__(self, state_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, hidden_dim, init_w=3e-3):
super(ValueNet, self).__init__()
self.linear1 = nn.Linear(state_dim, hidden_dim)
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
@@ -35,10 +35,10 @@ class ValueNet(nn.Module):
class SoftQNet(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(SoftQNet, self).__init__()
self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
@@ -54,20 +54,20 @@ class SoftQNet(nn.Module):
class PolicyNet(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
super(PolicyNet, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
self.linear1 = nn.Linear(state_dim, hidden_dim)
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean_linear = nn.Linear(hidden_dim, action_dim)
self.mean_linear = nn.Linear(hidden_dim, n_actions)
self.mean_linear.weight.data.uniform_(-init_w, init_w)
self.mean_linear.bias.data.uniform_(-init_w, init_w)
self.log_std_linear = nn.Linear(hidden_dim, action_dim)
self.log_std_linear = nn.Linear(hidden_dim, n_actions)
self.log_std_linear.weight.data.uniform_(-init_w, init_w)
self.log_std_linear.bias.data.uniform_(-init_w, init_w)

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,222 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:53:54
LastEditor: JiangJi
LastEditTime: 2021-12-22 15:41:19
Description:
Environment:
'''
import copy
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
import random
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # capacity of the replay buffer
self.buffer = [] # the buffer
self.position = 0
def push(self, state, action, reward, next_state, done):
''' The buffer acts as a queue: when capacity is exceeded, the oldest stored transitions are dropped
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # randomly sample a mini-batch of transitions
state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
return state, action, reward, next_state, done
def __len__(self):
''' Return the current number of stored transitions
'''
return len(self.buffer)
class ValueNet(nn.Module):
def __init__(self, n_states, hidden_dim, init_w=3e-3):
super(ValueNet, self).__init__()
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class SoftQNet(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(SoftQNet, self).__init__()
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state, action):
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class PolicyNet(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3, log_std_min=-20, log_std_max=2):
super(PolicyNet, self).__init__()
self.log_std_min = log_std_min
self.log_std_max = log_std_max
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.mean_linear = nn.Linear(hidden_dim, n_actions)
self.mean_linear.weight.data.uniform_(-init_w, init_w)
self.mean_linear.bias.data.uniform_(-init_w, init_w)
self.log_std_linear = nn.Linear(hidden_dim, n_actions)
self.log_std_linear.weight.data.uniform_(-init_w, init_w)
self.log_std_linear.bias.data.uniform_(-init_w, init_w)
def forward(self, state):
x = F.relu(self.linear1(state))
x = F.relu(self.linear2(x))
mean = self.mean_linear(x)
log_std = self.log_std_linear(x)
log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
return mean, log_std
def evaluate(self, state, epsilon=1e-6):
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
z = normal.sample()
action = torch.tanh(z)
log_prob = normal.log_prob(z) - torch.log(1 - action.pow(2) + epsilon)
log_prob = log_prob.sum(-1, keepdim=True)
return action, log_prob, z, mean, log_std
def get_action(self, state):
state = torch.FloatTensor(state).unsqueeze(0).to(device)
mean, log_std = self.forward(state)
std = log_std.exp()
normal = Normal(mean, std)
z = normal.sample()
action = torch.tanh(z)
action = action.detach().cpu().numpy()
return action[0]
class SAC:
def __init__(self,n_states,n_actions,cfg) -> None:
self.batch_size = cfg.batch_size
self.memory = ReplayBuffer(cfg.capacity)
self.device = cfg.device
self.value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
self.target_value_net = ValueNet(n_states, cfg.hidden_dim).to(self.device)
self.soft_q_net = SoftQNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
self.policy_net = PolicyNet(n_states, n_actions, cfg.hidden_dim).to(self.device)
self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=cfg.value_lr)
self.soft_q_optimizer = optim.Adam(self.soft_q_net.parameters(), lr=cfg.soft_q_lr)
self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.policy_lr)
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(param.data)
self.value_criterion = nn.MSELoss()
self.soft_q_criterion = nn.MSELoss()
def update(self, gamma=0.99,mean_lambda=1e-3,
std_lambda=1e-3,
z_lambda=0.0,
soft_tau=1e-2,
):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(self.batch_size)
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
expected_q_value = self.soft_q_net(state, action)
expected_value = self.value_net(state)
new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
target_value = self.target_value_net(next_state)
next_q_value = reward + (1 - done) * gamma * target_value
q_value_loss = self.soft_q_criterion(expected_q_value, next_q_value.detach())
expected_new_q_value = self.soft_q_net(state, new_action)
next_value = expected_new_q_value - log_prob
value_loss = self.value_criterion(expected_value, next_value.detach())
log_prob_target = expected_new_q_value - expected_value
policy_loss = (log_prob * (log_prob - log_prob_target).detach()).mean()
mean_loss = mean_lambda * mean.pow(2).mean()
std_loss = std_lambda * log_std.pow(2).mean()
z_loss = z_lambda * z.pow(2).sum(1).mean()
policy_loss += mean_loss + std_loss + z_loss
self.soft_q_optimizer.zero_grad()
q_value_loss.backward()
self.soft_q_optimizer.step()
self.value_optimizer.zero_grad()
value_loss.backward()
self.value_optimizer.step()
self.policy_optimizer.zero_grad()
policy_loss.backward()
self.policy_optimizer.step()
for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - soft_tau) + param.data * soft_tau
)
def save(self, path):
torch.save(self.value_net.state_dict(), path + "sac_value")
torch.save(self.value_optimizer.state_dict(), path + "sac_value_optimizer")
torch.save(self.soft_q_net.state_dict(), path + "sac_soft_q")
torch.save(self.soft_q_optimizer.state_dict(), path + "sac_soft_q_optimizer")
torch.save(self.policy_net.state_dict(), path + "sac_policy")
torch.save(self.policy_optimizer.state_dict(), path + "sac_policy_optimizer")
def load(self, path):
self.value_net.load_state_dict(torch.load(path + "sac_value"))
self.value_optimizer.load_state_dict(torch.load(path + "sac_value_optimizer"))
self.target_value_net = copy.deepcopy(self.value_net)
self.soft_q_net.load_state_dict(torch.load(path + "sac_soft_q"))
self.soft_q_optimizer.load_state_dict(torch.load(path + "sac_soft_q_optimizer"))
self.policy_net.load_state_dict(torch.load(path + "sac_policy"))
self.policy_optimizer.load_state_dict(torch.load(path + "sac_policy_optimizer"))

View File

@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-04-29 12:59:22
LastEditor: JiangJi
LastEditTime: 2021-05-06 16:58:01
LastEditTime: 2021-12-22 16:27:13
Description:
Environment:
'''
@@ -18,23 +18,24 @@ import gym
import torch
import datetime
from SAC.env import NormalizedActions
from SAC.agent import SAC
from SoftActorCritic.env_wrapper import NormalizedActions
from SoftActorCritic.sac import SAC
from common.utils import save_results, make_dir
from common.plot import plot_rewards
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
algo_name = 'SAC'  # algorithm name
env_name = 'Pendulum-v1'  # environment name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
class SACConfig:
def __init__(self) -> None:
self.algo = 'SAC'
self.env_name = 'Pendulum-v1'
self.result_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/results/' # path to save results
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
self.algo_name = algo_name
        self.env_name = env_name  # environment name
self.device= device
self.train_eps = 300
self.train_steps = 500
self.test_eps = 50
self.eval_steps = 500
self.test_eps = 20
        self.max_steps = 500  # max steps per episode
self.gamma = 0.99
self.mean_lambda=1e-3
self.std_lambda=1e-3
@@ -46,33 +47,36 @@ class SACConfig:
self.capacity = 1000000
self.hidden_dim = 256
self.batch_size = 128
self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
class PlotConfig(SACConfig):
def __init__(self) -> None:
super().__init__()
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
class PlotConfig:
def __init__(self) -> None:
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.device = device
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path to save results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path to save models
        self.save = True  # whether to save figures
def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name))
env.seed(seed)
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
agent = SAC(state_dim,action_dim,cfg)
n_actions = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
agent = SAC(n_states,n_actions,cfg)
return env,agent
def train(cfg,env,agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record moving-average rewards for all episodes
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # accumulate the reward within one episode
        state = env.reset()  # reset the environment and get the initial state
for i_step in range(cfg.train_steps):
for i_step in range(cfg.max_steps):
action = agent.policy_net.get_action(state)
next_state, reward, done, _ = env.step(action)
agent.memory.push(state, action, reward, next_state, done)
@@ -81,57 +85,57 @@ def train(cfg,env,agent):
ep_reward += reward
if done:
break
if (i_ep+1)%10==0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('Complete training')
if (i_ep+1)%10 == 0:
            print(f'Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.3f}')
    print('Finish training!')
return rewards, ma_rewards
def eval(cfg,env,agent):
    print('Start evaluating!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards = []
    ma_rewards = []  # moving average rewards
def test(cfg,env,agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record rewards for all episodes
    ma_rewards = []  # record moving-average rewards for all episodes
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for i_step in range(cfg.eval_steps):
for i_step in range(cfg.max_steps):
action = agent.policy_net.get_action(state)
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
if done:
break
if (i_ep+1)%10==0:
print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
    print('Complete evaluating')
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
    print('Finish testing!')
return rewards, ma_rewards
if __name__ == "__main__":
cfg=SACConfig()
plot_cfg = PlotConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
    # train
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create folders for saving results and models
    agent.save(path=plot_cfg.model_path)  # save the model
    save_results(rewards, ma_rewards, tag='train',
                 path=plot_cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot results
    # test
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=plot_cfg.model_path)  # load the model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot results

View File

@@ -70,9 +70,9 @@
"def env_agent_config(cfg,seed=1):\n",
" env = NormalizedActions(gym.make(\"Pendulum-v0\"))\n",
" env.seed(seed)\n",
" action_dim = env.action_space.shape[0]\n",
" state_dim = env.observation_space.shape[0]\n",
" agent = SAC(state_dim,action_dim,cfg)\n",
" n_actions = env.action_space.shape[0]\n",
" n_states = env.observation_space.shape[0]\n",
" agent = SAC(n_states,n_actions,cfg)\n",
" return env,agent"
]
},
@@ -159,7 +159,7 @@
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mDeprecatedEnv\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-91b1038013e4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# train\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0magent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mrewards\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma_rewards\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mmake_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcfg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0maction_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstate_dim\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-4-040773221550>\u001b[0m in \u001b[0;36menv_agent_config\u001b[0;34m(cfg, seed)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menv_agent_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcfg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNormalizedActions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgym\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Pendulum-v0\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseed\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mn_actions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maction_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mn_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobservation_space\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(id, **kwargs)\u001b[0m\n\u001b[1;32m 233\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 235\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mregistry\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 236\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mmake\u001b[0;34m(self, path, **kwargs)\u001b[0m\n\u001b[1;32m 126\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Making new env: %s\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 129\u001b[0m \u001b[0menv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0menv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/py37/lib/python3.7/site-packages/gym/envs/registration.py\u001b[0m in \u001b[0;36mspec\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 185\u001b[0m raise error.DeprecatedEnv(\n\u001b[1;32m 186\u001b[0m \"Env {} not found (valid versions include {})\".format(\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmatching_envs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m )\n\u001b[1;32m 189\u001b[0m )\n",

View File

@@ -14,13 +14,13 @@ import torch
class ReplayBuffer(object):
def __init__(self, state_dim, action_dim, max_size=int(1e6)):
def __init__(self, n_states, n_actions, max_size=int(1e6)):
self.max_size = max_size
self.ptr = 0
self.size = 0
self.state = np.zeros((max_size, state_dim))
self.action = np.zeros((max_size, action_dim))
self.next_state = np.zeros((max_size, state_dim))
self.state = np.zeros((max_size, n_states))
self.action = np.zeros((max_size, n_actions))
self.next_state = np.zeros((max_size, n_states))
self.reward = np.zeros((max_size, 1))
self.not_done = np.zeros((max_size, 1))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
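Only __init__ appears in this hunk. Assuming the rest of the file follows the standard TD3 reference buffer, the matching add method writes through a circular pointer (a sketch, not the repo's verified code):

```python
    def add(self, state, action, next_state, reward, done):
        # write at the current pointer, then advance it circularly
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
```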

View File

@@ -74,10 +74,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
td3= TD3(state_dim,action_dim,max_action,cfg)
td3= TD3(n_states,n_actions,max_action,cfg)
cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
td3.load(cfg.model_path)
td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed)

View File

@@ -72,7 +72,7 @@ def train(cfg,env,agent):
else:
action = (
agent.choose_action(np.array(state))
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
).clip(-max_action, max_action)
# Perform action
next_state, reward, done, _ = env.step(action)
@@ -121,11 +121,11 @@ def train(cfg,env,agent):
# else:
# action = (
# agent.choose_action(np.array(state))
# + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
# + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
# ).clip(-max_action, max_action)
# # action = (
# # agent.choose_action(np.array(state))
# # + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
# # + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
# # ).clip(-max_action, max_action)
# # Perform action
# next_state, reward, done, _ = env.step(action)
@@ -157,10 +157,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(state_dim,action_dim,max_action,cfg)
agent = TD3(n_states,n_actions,max_action,cfg)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(cfg.result_path,cfg.model_path)
agent.save(path=cfg.model_path)

View File

@@ -70,10 +70,10 @@ if __name__ == "__main__":
env.seed(cfg.seed) # Set seeds
torch.manual_seed(cfg.seed)
np.random.seed(cfg.seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
td3= TD3(state_dim,action_dim,max_action,cfg)
td3= TD3(n_states,n_actions,max_action,cfg)
cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
td3.load(cfg.model_path)

View File

@@ -79,7 +79,7 @@ def train(cfg,env,agent):
else:
action = (
agent.choose_action(np.array(state))
+ np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+ np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
).clip(-max_action, max_action)
# Perform action
next_state, reward, done, _ = env.step(action)
@@ -109,10 +109,10 @@ if __name__ == "__main__":
    env.seed(1)  # random seed
torch.manual_seed(1)
np.random.seed(1)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
agent = TD3(state_dim,action_dim,max_action,cfg)
agent = TD3(n_states,n_actions,max_action,cfg)
rewards,ma_rewards = train(cfg,env,agent)
make_dir(plot_cfg.result_path,plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
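The training loops in these TD3 scripts perturb the deterministic action with zero-mean Gaussian noise scaled by max_action * cfg.expl_noise, then clip back to the action bounds. A minimal illustration, with assumed Pendulum-style bounds:

```python
import numpy as np

max_action, expl_noise, n_actions = 2.0, 0.1, 1  # Pendulum-like bounds (assumed)
action = np.array([1.95])                        # deterministic policy output
noisy_action = (action
                + np.random.normal(0, max_action * expl_noise, size=n_actions)
                ).clip(-max_action, max_action)
print(noisy_action)                              # always within [-2.0, 2.0]
```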

View File

@@ -0,0 +1,284 @@
import numpy as np
import os
os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces
import cv2
cv2.ocl.setUseOpenCL(False)
from .wrappers import TimeLimit
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset for environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
def step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert sometimes we stay in lives == 0 condition for a few frames
# so it's important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2: self._obs_buffer[0] = obs
if i == self._skip - 1: self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class ClipRewardEnv(gym.RewardWrapper):
def __init__(self, env):
gym.RewardWrapper.__init__(self, env)
def reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env, width=84, height=84, grayscale=True, dict_space_key=None):
"""
Warp frames to 84x84 as done in the Nature paper and later work.
If the environment uses dictionary observations, `dict_space_key` can be specified which indicates which
observation should be warped.
"""
super().__init__(env)
self._width = width
self._height = height
self._grayscale = grayscale
self._key = dict_space_key
if self._grayscale:
num_colors = 1
else:
num_colors = 3
new_space = gym.spaces.Box(
low=0,
high=255,
shape=(self._height, self._width, num_colors),
dtype=np.uint8,
)
if self._key is None:
original_space = self.observation_space
self.observation_space = new_space
else:
original_space = self.observation_space.spaces[self._key]
self.observation_space.spaces[self._key] = new_space
assert original_space.dtype == np.uint8 and len(original_space.shape) == 3
def observation(self, obs):
if self._key is None:
frame = obs
else:
frame = obs[self._key]
if self._grayscale:
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(
frame, (self._width, self._height), interpolation=cv2.INTER_AREA
)
if self._grayscale:
frame = np.expand_dims(frame, -1)
if self._key is None:
obs = frame
else:
obs = obs.copy()
obs[self._key] = frame
return obs
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames.
Returns lazy array, which is much more memory efficient.
See Also
--------
baselines.common.atari_wrappers.LazyFrames
"""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return LazyFrames(list(self.frames))
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
class LazyFrames(object):
def __init__(self, frames):
"""This object ensures that common frames between the observations are only stored once.
It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
buffers.
This object should only be converted to numpy array before being passed to the model.
You'd not believe how complex the previous solution was."""
self._frames = frames
self._out = None
def _force(self):
if self._out is None:
self._out = np.concatenate(self._frames, axis=-1)
self._frames = None
return self._out
def __array__(self, dtype=None):
out = self._force()
if dtype is not None:
out = out.astype(dtype)
return out
def __len__(self):
return len(self._force())
def __getitem__(self, i):
return self._force()[i]
def count(self):
frames = self._force()
return frames.shape[frames.ndim - 1]
def frame(self, i):
return self._force()[..., i]
def make_atari(env_id, max_episode_steps=None):
env = gym.make(env_id)
assert 'NoFrameskip' in env.spec.id
env = NoopResetEnv(env, noop_max=30)
env = MaxAndSkipEnv(env, skip=4)
if max_episode_steps is not None:
env = TimeLimit(env, max_episode_steps=max_episode_steps)
return env
def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False):
"""Configure environment for DeepMind-style Atari.
"""
if episode_life:
env = EpisodicLifeEnv(env)
if 'FIRE' in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env)
if scale:
env = ScaledFloatFrame(env)
if clip_rewards:
env = ClipRewardEnv(env)
if frame_stack:
env = FrameStack(env, 4)
return env
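A usage sketch for the two entry points above (assumes a gym install with Atari ROMs; the env id is just an example):

```python
# Build a DeepMind-style Atari pipeline: no-op resets, frame skipping,
# life-loss episodes, 84x84 grayscale frames, reward clipping, frame stacking.
env = make_atari('PongNoFrameskip-v4')
env = wrap_deepmind(env, frame_stack=True)
obs = env.reset()
print(env.observation_space.shape)  # (84, 84, 4) once frames are stacked
print(np.asarray(obs).shape)        # LazyFrames materialize on conversion
```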

View File

@@ -32,10 +32,10 @@ class MLP(nn.Module):
return self.fc3(x)
class Critic(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1)
    # randomly initialize the output layer with small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
return x
class Actor(nn.Module):
def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, action_dim)
self.linear3 = nn.Linear(hidden_size, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
return x
class ActorCritic(nn.Module):
def __init__(self, state_dim, action_dim, hidden_dim=256):
def __init__(self, n_states, n_actions, hidden_dim=256):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=1),
)

codes/common/wrappers.py (new file)
View File

@@ -0,0 +1,29 @@
import gym
class TimeLimit(gym.Wrapper):
def __init__(self, env, max_episode_steps=None):
super(TimeLimit, self).__init__(env)
self._max_episode_steps = max_episode_steps
self._elapsed_steps = 0
def step(self, ac):
observation, reward, done, info = self.env.step(ac)
self._elapsed_steps += 1
if self._elapsed_steps >= self._max_episode_steps:
done = True
info['TimeLimit.truncated'] = True
return observation, reward, done, info
def reset(self, **kwargs):
self._elapsed_steps = 0
return self.env.reset(**kwargs)
class ClipActionsWrapper(gym.Wrapper):
def step(self, action):
import numpy as np
action = np.nan_to_num(action)
action = np.clip(action, self.action_space.low, self.action_space.high)
return self.env.step(action)
def reset(self, **kwargs):
return self.env.reset(**kwargs)
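A short usage sketch for the TimeLimit wrapper above, under the old gym 4-tuple step API it targets (Pendulum-v1's built-in limit is 200 steps, so an outer cap of 100 fires first):

```python
import gym

env = TimeLimit(gym.make('Pendulum-v1'), max_episode_steps=100)
state, done, steps = env.reset(), False, 0
while not done:
    state, reward, done, info = env.step(env.action_space.sample())
    steps += 1
print(steps, info.get('TimeLimit.truncated', False))  # 100 True
```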

View File

@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
self.natural = natural
# Start the first game
self._reset() # Number of
self.action_dim = 2
self.n_actions = 2
def reset(self):
return self._reset()

View File

@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
self.shape = (4, 12)
nS = np.prod(self.shape)
action_dim = 4
n_actions = 4
# Cliff Location
self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
P = {}
for s in range(nS):
position = np.unravel_index(s, self.shape)
P[s] = { a : [] for a in range(action_dim) }
P[s] = { a : [] for a in range(n_actions) }
P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
isd = np.zeros(nS)
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)
def render(self, mode='human', close=False):
self._render(mode, close)

View File

@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
self.shape = shape
nS = np.prod(shape)
action_dim = 4
n_actions = 4
MAX_Y = shape[0]
MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
y, x = it.multi_index
# P[s][a] = (prob, next_state, reward, is_done)
P[s] = {a : [] for a in range(action_dim)}
P[s] = {a : [] for a in range(n_actions)}
is_done = lambda s: s == 0 or s == (nS - 1)
reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
# This should not be used in any model-free learning algorithm
self.P = P
super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
super(GridworldEnv, self).__init__(nS, n_actions, P, isd)
def _render(self, mode='human', close=False):
""" Renders the current gridworld layout

View File

@@ -17,31 +17,31 @@ class StochasticMDP:
def __init__(self):
self.end = False
self.curr_state = 2
self.action_dim = 2
self.state_dim = 6
self.n_actions = 2
self.n_states = 6
self.p_right = 0.5
def reset(self):
self.end = False
self.curr_state = 2
state = np.zeros(self.state_dim)
state = np.zeros(self.n_states)
state[self.curr_state - 1] = 1.
return state
def step(self, action):
if self.curr_state != 1:
if action == 1:
if random.random() < self.p_right and self.curr_state < self.state_dim:
if random.random() < self.p_right and self.curr_state < self.n_states:
self.curr_state += 1
else:
self.curr_state -= 1
if action == 0:
self.curr_state -= 1
if self.curr_state == self.state_dim:
if self.curr_state == self.n_states:
self.end = True
state = np.zeros(self.state_dim)
state = np.zeros(self.n_states)
state[self.curr_state - 1] = 1.
if self.curr_state == 1:

View File

@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
self.shape = (7, 10)
nS = np.prod(self.shape)
action_dim = 4
n_actions = 4
# Wind strength
winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
P = {}
for s in range(nS):
position = np.unravel_index(s, self.shape)
P[s] = { a : [] for a in range(action_dim) }
P[s] = { a : [] for a in range(n_actions) }
P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
isd = np.zeros(nS)
isd[np.ravel_multi_index((3,0), self.shape)] = 1.0
super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)
def render(self, mode='human', close=False):
self._render(mode, close)