update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
--- a/codes/A2C/agent.py
+++ b/codes/A2C/agent.py
@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
 class A2C:
    ''' A2C算法
    '''
-    def __init__(self,state_dim,action_dim,cfg) -> None:
+    def __init__(self,n_states,n_actions,cfg) -> None:
        self.gamma = cfg.gamma
        self.device = cfg.device
-        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
+        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters())

    def compute_returns(self,next_value, rewards, masks):
--- a/codes/A2C/task0.py
+++ b/codes/A2C/task0.py
@@ -10,7 +10,7 @@ import torch
 import torch.optim as optim
 import datetime
 from common.multiprocessing_env import SubprocVecEnv
-from A2C.agent import ActorCritic
+from a2c import ActorCritic
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards

@@ -74,9 +74,9 @@ def train(cfg,envs):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
    env = gym.make(cfg.env_name) # a single env
    env.seed(10)
-    state_dim  = envs.observation_space.shape[0]
-    action_dim = envs.action_space.n
-    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+    n_states  = envs.observation_space.shape[0]
+    n_actions = envs.action_space.n
+    model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
    optimizer = optim.Adam(model.parameters())
    frame_idx    = 0
    test_rewards = []
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy
--- a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png
+++ b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png
--- a/codes/DDPG/agent.py
+++ b/codes/DDPG/agent.py
@@ -39,11 +39,11 @@ class ReplayBuffer:
        '''
        return len(self.buffer)
 class Actor(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
        super(Actor, self).__init__()  
-        self.linear1 = nn.Linear(state_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
-        self.linear3 = nn.Linear(hidden_dim, action_dim)
+        self.linear3 = nn.Linear(hidden_dim, n_actions)
        
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -54,10 +54,10 @@ class Actor(nn.Module):
        x = torch.tanh(self.linear3(x))
        return x
 class Critic(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
        super(Critic, self).__init__()
        
-        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        # 随机初始化为较小的值
@@ -72,12 +72,12 @@ class Critic(nn.Module):
        x = self.linear3(x)
        return x
 class DDPG:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):
        self.device = cfg.device
-        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)

        # 复制参数到目标网络
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
--- a/codes/DDPG/env.py
+++ b/codes/DDPG/env.py
@@ -39,15 +39,15 @@ class OUNoise(object):
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
-        self.action_dim   = action_space.shape[0]
+        self.n_actions   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
    def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
    def evolve_obs(self):
        x  = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
        self.obs = x + dx
        return self.obs
    def get_action(self, action, t=0):
--- a/codes/DDPG/task0.py
+++ b/codes/DDPG/task0.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
-LastEditTime: 2021-09-16 01:31:33
+LastEditTime: 2022-02-10 06:23:27
@Discription: 
@Environment: python 3.7.7
 '''
@@ -18,23 +18,29 @@ import datetime
 import gym
 import torch

-from DDPG.env import NormalizedActions
-from DDPG.agent import DDPG
+from env import NormalizedActions,OUNoise
+from ddpg import DDPG
 from DDPG.train import train,test
 from common.utils import save_results,make_dir
 from common.utils import plot_rewards

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-algo_name = 'DDPG'  # 算法名称
-env_name = 'Pendulum-v1'  # 环境名称，gym新版本（约0.21.0之后）中Pendulum-v0改为Pendulum-v1
+class Config:
+    '''超参数
+    '''

-class DDPGConfig:
    def __init__(self):
-        self.algo_name = algo_name # 算法名称
-        self.env_name = env_name # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DDPG'  # 算法名称
+        self.env_name = 'Pendulum-v1'  # 环境名称，gym新版本（约0.21.0之后）中Pendulum-v0改为Pendulum-v1
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
+        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 300 # 训练的回合数
        self.test_eps = 50 # 测试的回合数
+        ################################################################################
+        
+        ################################## 算法超参数 ###################################
        self.gamma = 0.99 # 折扣因子
        self.critic_lr = 1e-3 # 评论家网络的学习率
        self.actor_lr = 1e-4 # 演员网络的学习率
@@ -43,39 +49,92 @@ class DDPGConfig:
        self.target_update = 2 # 目标网络的更新频率
        self.hidden_dim = 256 # 网络隐藏层维度
        self.soft_tau = 1e-2 # 软更新参数
+        ################################################################################
        
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name # 环境名称
-        self.result_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/results/'  # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/models/'  # 保存模型的路径
+        ################################# 保存结果相关参数 ################################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
        self.save = True # 是否保存图片
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################################################################

 def env_agent_config(cfg,seed=1):
    env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
    env.seed(seed) # 随机种子
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
-    agent = DDPG(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = DDPG(n_states,n_actions,cfg)
    return env,agent
+def train(cfg, env, agent):
+    """Run the DDPG training loop and collect per-episode rewards.
+
+    Args:
+        cfg: config object; ``env_name``, ``algo_name``, ``device`` and
+            ``train_eps`` are read here.
+        env: environment providing ``reset()``, ``step(action)`` and
+            ``action_space``.
+        agent: agent providing ``choose_action(state)``,
+            ``memory.push(...)`` and ``update()``.
+
+    Returns:
+        Tuple ``(rewards, ma_rewards)``: the total reward of every episode
+        and its moving average (0.9 * previous + 0.1 * current).
+    """
+    print('开始训练！')
+    # The config class defines ``algo_name``; it has no ``algo`` attribute.
+    print(f'环境：{cfg.env_name}，算法：{cfg.algo_name}，设备：{cfg.device}')
+    ou_noise = OUNoise(env.action_space)  # action noise
+    rewards = []  # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        ou_noise.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            # Perturb the action; the in-episode step index is passed as t.
+            action = ou_noise.get_action(action, i_step)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            agent.update()
+            state = next_state
+        if (i_ep+1)%10 == 0:  # report every 10th episode only
+            print('回合：{}/{}，奖励：{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            # First episode: the moving average starts at the raw reward.
+            ma_rewards.append(ep_reward)
+    print('完成训练！')
+    return rewards, ma_rewards

-cfg = DDPGConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path)
-agent.save(path=plot_cfg.model_path)
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)
-rewards,ma_rewards = test(plot_cfg,env,agent)
-save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+def test(cfg, env, agent):
+    """Evaluate the agent without exploration noise or learning updates.
+
+    Args:
+        cfg: config object; ``env_name``, ``algo_name``, ``device`` and
+            ``test_eps`` are read here.
+        env: environment providing ``reset()`` and ``step(action)``.
+        agent: agent providing ``choose_action(state)``.
+
+    Returns:
+        Tuple ``(rewards, ma_rewards)``: the total reward of every episode
+        and its moving average (0.9 * previous + 0.1 * current).
+    """
+    print('开始测试！')
+    # The config class defines ``algo_name``; it has no ``algo`` attribute.
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = []  # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = next_state
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            # First episode: the moving average starts at the raw reward.
+            ma_rewards.append(ep_reward)
+        # Report once per episode, against the number of test episodes.
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards, ma_rewards
+if __name__ == "__main__":
+    # Script entry point: train, save the model and the reward curves, then
+    # reload the saved model into a fresh agent and evaluate it.
+    cfg = Config()
+    # Training
+    env,agent = env_agent_config(cfg,seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    # presumably creates the output directories before saving; confirm in common.utils
+    make_dir(cfg.result_path, cfg.model_path)
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the results
+    # Testing: a new env/agent pair is built with a different seed
+    env,agent = env_agent_config(cfg,seed=10)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = test(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the results

--- a/codes/DDPG/train.py
+++ b/codes/DDPG/train.py
@@ -1,64 +0,0 @@
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path)  # 父路径
-sys.path.append(parent_path)  # 添加路径到系统路径
-
-from DDPG.env import OUNoise
-
-def train(cfg, env, agent):
-    print('开始训练！')
-    print(f'环境：{cfg.env_name}，算法：{cfg.algo}，设备：{cfg.device}')
-    ou_noise = OUNoise(env.action_space)  # 动作噪声
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.train_eps):
-        state = env.reset()
-        ou_noise.reset()
-        done = False
-        ep_reward = 0
-        i_step = 0
-        while not done:
-            i_step += 1
-            action = agent.choose_action(state)
-            action = ou_noise.get_action(action, i_step) 
-            next_state, reward, done, _ = env.step(action)
-            ep_reward += reward
-            agent.memory.push(state, action, reward, next_state, done)
-            agent.update()
-            state = next_state
-        if (i_ep+1)%10 == 0:
-            print('回合：{}/{}，奖励：{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-    print('完成训练！')
-    return rewards, ma_rewards
-
-def test(cfg, env, agent):
-    print('开始测试！')
-    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.test_eps):
-        state = env.reset() 
-        done = False
-        ep_reward = 0
-        i_step = 0
-        while not done:
-            i_step += 1
-            action = agent.choose_action(state)
-            next_state, reward, done, _ = env.step(action)
-            ep_reward += reward
-            state = next_state
-        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
-    print('完成测试！')
-    return rewards, ma_rewards
--- a/codes/DQN/README.md
+++ b/codes/DQN/README.md
@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F

 class FCN(nn.Module):
-    def __init__(self, state_dim=4, action_dim=18):
+    def __init__(self, n_states=4, n_actions=18):
        """ 初始化q网络，为全连接网络
-            state_dim: 输入的feature即环境的state数目
-            action_dim: 输出的action总个数
+            n_states: 输入的feature即环境的state数目
+            n_actions: 输出的action总个数
        """
        super(FCN, self).__init__()
-        self.fc1 = nn.Linear(state_dim, 128) # 输入层
+        self.fc1 = nn.Linear(n_states, 128) # 输入层
        self.fc2 = nn.Linear(128, 128) # 隐藏层
-        self.fc3 = nn.Linear(128, action_dim) # 输出层
+        self.fc3 = nn.Linear(128, n_actions) # 输出层
        
    def forward(self, x):
        # 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
        x = F.relu(self.fc2(x))
        return self.fc3(x)
 ```
-输入为state_dim，输出为action_dim，包含一个128维度的隐藏层，这里根据需要可增加隐藏层维度和数量，然后一般使用relu激活函数，这里跟深度学习的网路设置是一样的。
+输入为n_states，输出为n_actions，包含一个128维度的隐藏层，这里根据需要可增加隐藏层维度和数量，然后一般使用relu激活函数，这里跟深度学习的网路设置是一样的。

 ### Replay Buffer

@@ -107,8 +107,8 @@ class ReplayBuffer:
 在类中建立两个网络，以及optimizer和memory，

 ```python
-self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
    target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
    if random.random() > self.epsilon(self.frame_idx):
        action = self.predict(state)
    else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
    return action
 ```

--- a/codes/DQN/dqn.py
+++ b/codes/DQN/dqn.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
-LastEditTime: 2021-12-22 14:01:37
+LastEditTime: 2022-03-02 11:05:11
@Discription: 
@Environment: python 3.7.7
 '''
@@ -20,22 +20,7 @@ import random
 import math
 import numpy as np

-class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
-        """ 初始化q网络，为全连接网络
-            state_dim: 输入的特征数即环境的状态维度
-            action_dim: 输出的动作维度
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
-        
-    def forward(self, x):
-        # 各层对应的激活函数
-        x = F.relu(self.fc1(x)) 
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
+

 class ReplayBuffer:
    def __init__(self, capacity):
@@ -62,9 +47,9 @@ class ReplayBuffer:
        return len(self.buffer)

 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_actions,model,cfg):

-        self.action_dim = action_dim  # 总的动作个数
+        self.n_actions = n_actions  # 总的动作个数
        self.device = cfg.device  # 设备，cpu或gpu等
        self.gamma = cfg.gamma  # 奖励的折扣因子
        # e-greedy策略相关参数
@@ -73,8 +58,8 @@ class DQN:
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = model.to(self.device)
+        self.target_net = model.to(self.device)
        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
            target_param.data.copy_(param.data)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -86,23 +71,24 @@ class DQN:
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            with torch.no_grad():
-                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # 选择Q值最大的动作
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):
        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
            return
        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        # print('updating')
+        
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
-        # 转为张量
-        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  
-        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
        next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
--- a/codes/DQN/dqn_cnn.py
+++ b/codes/DQN/dqn_cnn.py
@@ -70,9 +70,9 @@ class ReplayBuffer:
        return len(self.buffer)

 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):

-        self.action_dim = action_dim  # 总的动作个数
+        self.n_actions = n_actions  # 总的动作个数
        self.device = cfg.device  # 设备，cpu或gpu等
        self.gamma = cfg.gamma  # 奖励的折扣因子
        # e-greedy策略相关参数
@@ -81,8 +81,8 @@ class DQN:
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
-        self.policy_net = CNN(state_dim, action_dim).to(self.device)
-        self.target_net = CNN(state_dim, action_dim).to(self.device)
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
            target_param.data.copy_(param.data)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -94,11 +94,12 @@ class DQN:
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            with torch.no_grad():
+                print(type(state))
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # 选择Q值最大的动作
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):
        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
--- a/codes/DQN/dqn_cnn2.py
+++ b/codes/DQN/dqn_cnn2.py
@@ -0,0 +1,142 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.autograd as autograd 
+import random
+import math
+import numpy as np
+class CNN(nn.Module):
+    """Convolutional Q-network: two conv layers followed by two linear layers.
+
+    Args:
+        n_frames: number of input channels (stacked frames).
+        n_actions: number of outputs, one Q-value per action.
+
+    The first linear layer expects 3200 features, i.e. 32 channels of
+    10x10, which is what the two convolutions produce for 80x80 inputs.
+    """
+
+    def __init__(self, n_frames, n_actions):
+        super().__init__()
+        self.n_frames = n_frames
+        self.n_actions = n_actions
+
+        # Convolutional feature extractor
+        self.conv1 = nn.Conv2d(n_frames, 16, kernel_size=8, stride=4, padding=2)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)
+
+        # Fully connected head
+        self.fc1 = nn.Linear(3200, 256)
+        self.fc2 = nn.Linear(256, n_actions)
+
+        # Shared activation
+        self.relu = nn.ReLU()
+
+    def flatten(self, x):
+        """Collapse every dimension after the batch dimension into one."""
+        return x.view(x.size()[0], -1)
+
+    def forward(self, x):
+        """Map a batch of frame stacks to one Q-value per action."""
+        features = self.relu(self.conv1(x))         # 80x80 -> 16 x 20x20
+        features = self.relu(self.conv2(features))  # 20x20 -> 32 x 10x10
+        flat = self.flatten(features)               # -> 3200
+        hidden = self.relu(self.fc1(flat))          # -> 256
+        return self.fc2(hidden)                     # -> n_actions
+
+class ReplayBuffer:
+    """Fixed-capacity ring buffer of (state, action, reward, next_state, done)."""
+
+    def __init__(self, capacity):
+        self.capacity = capacity  # maximum number of stored transitions
+        self.buffer = []  # storage
+        self.position = 0  # index of the slot written next
+
+    def push(self, state, action, reward, next_state, done):
+        """Store one transition, overwriting the oldest one once full."""
+        transition = (state, action, reward, next_state, done)
+        still_growing = len(self.buffer) < self.capacity
+        if still_growing:
+            self.buffer.append(None)  # reserve the slot that is filled below
+        self.buffer[self.position] = transition
+        self.position = (self.position + 1) % self.capacity
+
+    def sample(self, batch_size):
+        """Draw ``batch_size`` distinct transitions and return them as five tuples."""
+        transitions = random.sample(self.buffer, batch_size)
+        # Transpose the list of transitions into per-field tuples.
+        states, actions, rewards, next_states, dones = zip(*transitions)
+        return states, actions, rewards, next_states, dones
+
+    def __len__(self):
+        """Return the number of transitions currently stored."""
+        return len(self.buffer)
+
+class DQN:
+    """DQN agent with CNN policy/target networks, epsilon-greedy action
+    selection and a replay buffer.
+
+    Args:
+        n_states: first argument forwarded to ``CNN`` (its ``n_frames``).
+        n_actions: number of discrete actions.
+        cfg: config object; reads ``device``, ``gamma``, ``epsilon_start``,
+            ``epsilon_end``, ``epsilon_decay``, ``batch_size``, ``lr`` and
+            ``memory_capacity``.
+    """
+
+    def __init__(self, n_states, n_actions, cfg):
+
+        self.n_actions = n_actions  # total number of actions
+        self.device = cfg.device  # device, e.g. cpu or gpu
+        self.gamma = cfg.gamma  # reward discount factor
+        # epsilon-greedy parameters
+        self.frame_idx = 0  # counter driving the epsilon decay
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
+            (cfg.epsilon_start - cfg.epsilon_end) * \
+            math.exp(-1. * frame_idx / cfg.epsilon_decay)
+        self.batch_size = cfg.batch_size
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
+        # Start the target network from the policy network's parameters.
+        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):
+            target_param.data.copy_(param.data)
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # optimizer
+        self.memory = ReplayBuffer(cfg.memory_capacity)  # experience replay
+
+    def choose_action(self, state):
+        """Pick an action epsilon-greedily and advance the decay counter.
+
+        Returns:
+            The index of the action with the largest Q-value, or a uniformly
+            random action index with probability epsilon.
+        """
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                # Convert through one ndarray and add the batch dimension on
+                # the tensor; wrapping an ndarray in a Python list is slow.
+                state = torch.tensor(np.asarray(state), device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+                q_values = self.policy_net(state)
+                action = q_values.max(1)[1].item()  # action with the largest Q-value
+        else:
+            action = random.randrange(self.n_actions)
+        return action
+
+    def update(self):
+        """Run one gradient step on a sampled batch; no-op until the replay
+        buffer holds at least ``batch_size`` transitions."""
+        if len(self.memory) < self.batch_size:
+            return
+        # Sample a random batch of transitions from the replay buffer.
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        # Convert to tensors. The state tuples are stacked into one ndarray
+        # first, because building a tensor from a sequence of ndarrays is slow.
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
+        # Q(s_t, a) of the actions that were actually taken.
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
+        # Largest target-network Q-value of the next state, detached from the graph.
+        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()
+        # For terminal transitions (done == 1) the target reduces to the reward.
+        expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error
+        # Optimize the policy network.
+        self.optimizer.zero_grad()
+        loss.backward()
+        for param in self.policy_net.parameters():  # clamp gradients to [-1, 1]
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step()
+
+    def save(self, path):
+        """Write the target network's weights to ``path + 'dqn_checkpoint.pth'``."""
+        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
+
+    def load(self, path):
+        """Load the checkpoint into the target network and copy it to the policy network."""
+        # map_location lets a checkpoint saved on another device load here.
+        self.target_net.load_state_dict(
+            torch.load(path+'dqn_checkpoint.pth', map_location=self.device))
+        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+            param.data.copy_(target_param.data)
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/models/dqn_checkpoint.pth
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/models/dqn_checkpoint.pth
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_ma_rewards.npy
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_ma_rewards.npy
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards.npy
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards.npy
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards_curve.png
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards_curve.png
--- a/codes/Sarsa/outputs/CliffWalking-v0/20220424-221748/results/test_rewards.npy
+++ b/codes/Sarsa/outputs/CliffWalking-v0/20220424-221748/results/test_rewards.npy
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_ma_rewards.npy
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_ma_rewards.npy
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards.npy
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards.npy
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards_curve.png
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards_curve.png
--- a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_steps.npy
+++ b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_steps.npy
--- a/codes/DQN/task0.py
+++ b/codes/DQN/task0.py
@@ -1,5 +1,7 @@
 import sys
 import os
+import torch.nn as nn
+import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
 import torch
 import datetime
 import numpy as np
-from common.utils import save_results, make_dir
+from common.utils import save_results_1, make_dir
 from common.utils import plot_rewards
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

+class MLP(nn.Module):
+    """Fully connected Q-network with two hidden layers.
+
+    Args:
+        n_states: number of input features, i.e. the state dimension.
+        n_actions: number of outputs, one per action.
+        hidden_dim: width of both hidden layers.
+    """
+
+    def __init__(self, n_states,n_actions,hidden_dim=128):
+        super().__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+
+    def forward(self, x):
+        """Apply ReLU after the first two layers; the output layer is linear."""
+        hidden = F.relu(self.fc1(x))
+        hidden = F.relu(self.fc2(hidden))
+        return self.fc3(hidden)

 class Config:
    '''超参数
    '''

    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DQN'  # 算法名称
-        self.env_name = 'CartPole-v0'  # 环境名称
+        ############################### hyperparameters ################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 200  # 训练的回合数
-        self.test_eps = 30  # 测试的回合数
+        self.test_eps = 20  # 测试的回合数
        ################################################################################
        
        ################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
        self.target_update = 4  # 目标网络的更新频率
        self.hidden_dim = 256  # 网络隐藏层
        ################################################################################
-
-        ################################# 保存结果相关参数 ##############################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # 保存结果的路径
        self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
    ''' 创建环境和智能体
    '''
    env = gym.make(cfg.env_name)  # 创建环境
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)
+    agent = DQN(n_actions, model, cfg)  # 创建智能体
    if cfg.seed !=0: # 设置随机种子
        torch.manual_seed(cfg.seed)
        env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step += 1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
                break
        if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 10 == 0:
-            print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
-    print('完成训练！')
+        if (i_ep + 1) % 1 == 0:  # always true: log after every training episode
+            print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('Finish training!')
    env.close()
-    return rewards, ma_rewards
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic


 def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
    ################################################################################
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step+=1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            state = next_state  # 更新下一个状态
            ep_reward += reward  # 累加奖励
            if done:
                break
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+        print(f'Episode：{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')  # the total shown is the number of test episodes
    print('完成测试！')
    env.close()
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}


 if __name__ == "__main__":
    cfg = Config()
    # 训练
    env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
    agent.save(path=cfg.model_path)  # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
+    save_results_1(res_dic, tag='train',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # 画出结果
    # 测试
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
+    res_dic = test(cfg, env, agent)
+    save_results_1(res_dic, tag='test',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # 画出结果
--- a/codes/DQN/task1.py
+++ b/codes/DQN/task1.py
@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 11:40:44
+LastEditTime: 2022-02-10 06:17:41
 Discription: 使用 Nature DQN 训练 CartPole-v1
 '''
 import sys
@@ -19,7 +19,7 @@ import torch
 import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = "DQN"  # 算法名称
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
    '''
    env = gym.make(cfg.env_name)  # 创建环境
    env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
    return env, agent

 def train(cfg, env, agent):
--- a/codes/DQN/task2.py
+++ b/codes/DQN/task2.py
@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 15:27:48
+LastEditTime: 2022-02-10 06:17:46
 Discription: 使用 DQN-cnn  训练 PongNoFrameskip-v4
 '''
 import sys
@@ -20,7 +20,7 @@ import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
 from common.atari_wrappers import make_atari, wrap_deepmind
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = 'DQN-cnn'  # 算法名称
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
    # env    = wrap_deepmind(env)
    # env    = wrap_pytorch(env) 
    env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
    return env, agent

 def train(cfg, env, agent):
--- a/codes/DQN/task4.py
+++ b/codes/DQN/task4.py
@@ -0,0 +1,180 @@
+import sys
+import os
+import torch.nn as nn
+import torch.nn.functional as F
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+import numpy as np
+from common.utils import save_results_1, make_dir
+from common.utils import plot_rewards
+from dqn_1 import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+class MLP(nn.Module):
+    def __init__(self, n_states,n_actions,hidden_dim=256):
+        """ Build the Q-network as a fully connected network.
+            n_states: number of input features, i.e. the environment's state dimension
+            n_actions: output dimension (number of actions)
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim) # input layer
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
+        self.fc3 = nn.Linear(hidden_dim,hidden_dim) # hidden layer
+        self.fc4 = nn.Linear(hidden_dim, n_actions) # output layer
+        
+    def forward(self, x):
+        # ReLU after the first three layers; the output layer is left linear
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        x = F.relu(self.fc3(x))
+        return self.fc4(x)
+
+class Config:
+    '''Hyperparameters for the environment, the algorithm and result saving
+    '''
+
+    def __init__(self):
+        ############################ environment hyperparameters ############################
+        self.algo_name = 'DQN'  # algorithm name
+        # self.env_name = 'Breakout-ram-v0'  # environment name
+        self.env_name = 'ALE/Pong-ram-v5'
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when CUDA is available
+        self.seed = 10 # random seed; 0 means no seed is set
+        self.train_eps = 5  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+        ################################################################################
+        
+        ############################# algorithm hyperparameters #############################
+        self.gamma = 0.99  # discount factor
+        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
+        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_decay = 500000  # decay rate of epsilon in the e-greedy policy
+        self.lr = 0.00025 # learning rate
+        self.memory_capacity = int(5e4)  # capacity of the experience replay buffer
+        self.batch_size = 32  # batch size for mini-batch SGD
+        self.target_update = 4  # update frequency of the target network
+        self.hidden_dim = 512  # hidden layer size of the network
+        ################################################################################
+        
+        ############################## result saving parameters #############################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # path for saving results
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # path for saving models
+        self.save = True # whether to save figures
+        ################################################################################
+
+
+def env_agent_config(cfg):
+    ''' Create the environment and the agent; RNGs are seeded first when cfg.seed != 0
+    '''
+    env = gym.make(cfg.env_name)  # create the environment
+    if cfg.seed !=0: # seed before building the network so its weight initialisation is reproducible
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    n_states = env.observation_space.shape[0]  # state dimension
+    n_actions = env.action_space.n  # number of actions
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)  # NOTE(review): cfg.hidden_dim is not passed, so MLP's default width is used — confirm
+    agent = DQN(n_states, n_actions, model, cfg)  # create the agent
+    return env, agent
+
+
+def train(cfg, env, agent):
+    ''' Train the agent for cfg.train_eps episodes and return a dict of per-episode statistics
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = []  # reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    steps = []  # number of steps taken in every episode
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # reward accumulated within the current episode
+        state = env.reset()  # reset the environment and get the initial state
+        ep_step = 0
+        while True:
+            ep_step+=1
+            action = agent.choose_action(state)  # choose an action
+            next_state, reward, done, _ = env.step(action)  # step the environment to obtain the transition
+            agent.memory.push(state, action, reward,
+                              next_state, done)  # store the transition
+            state = next_state  # move on to the next state
+            agent.update()  # update the agent
+            ep_reward += reward  # accumulate the reward
+            if done:
+                break
+        if (i_ep + 1) % cfg.target_update == 0:  # copy the policy network into the target network every cfg.target_update episodes
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep + 1) % 1 == 0:  # always true: log after every episode
+            print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('完成训练！')
+    env.close()
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic
+
+
+def test(cfg, env, agent):  # run cfg.test_eps episodes without storing transitions or updating the agent
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    ############# testing does not need the epsilon-greedy policy, so the values are set to 0 ###############
+    cfg.epsilon_start = 0.0  # initial epsilon of the e-greedy policy
+    cfg.epsilon_end = 0.0  # final epsilon of the e-greedy policy
+    # NOTE(review): the agent already exists when these are assigned; whether they take effect depends on how DQN reads cfg — confirm
+    rewards = []  # reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    steps = []  # number of steps taken in every episode
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0  # reward accumulated within the current episode
+        ep_step = 0
+        state = env.reset()  # reset the environment and get the initial state
+        while True:
+            ep_step+=1
+            action = agent.choose_action(state)  # choose an action
+            next_state, reward, done, _ = env.step(action)  # step the environment to obtain the transition
+            state = next_state  # move on to the next state
+            ep_reward += reward  # accumulate the reward
+            if done:
+                break
+        steps.append(ep_step)
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+
+
+if __name__ == "__main__":
+    cfg = Config()
+    # training
+    env, agent = env_agent_config(cfg)
+    res_dic = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the folders for results and models
+    agent.save(path=cfg.model_path)  # save the model
+    save_results_1(res_dic, tag='train',
+                 path=cfg.result_path)  # save the results
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # plot the results
+    # testing, with a freshly created environment and agent
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # load the saved model
+    res_dic = test(cfg, env, agent)
+    save_results_1(res_dic, tag='test',
+                 path=cfg.result_path)  # save the results
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # plot the results
--- a/codes/DQN/task5.py
+++ b/codes/DQN/task5.py
@@ -0,0 +1,149 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+import numpy as np
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+from dqn import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+
+class Config:
+    '''Hyperparameters for the environment, the algorithm and result saving
+    '''
+
+    def __init__(self):
+        ############################ environment hyperparameters ############################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'SpaceInvaders-ram-v0'  # environment name
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when CUDA is available
+        self.seed = 10 # random seed; 0 means no seed is set
+        self.train_eps = 200  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+        ################################################################################
+        
+        ############################# algorithm hyperparameters #############################
+        self.gamma = 0.99  # discount factor
+        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
+        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_decay = 20000  # decay rate of epsilon in the e-greedy policy
+        self.lr = 2e-4  # learning rate
+        self.memory_capacity = int(1e5)  # capacity of the experience replay buffer
+        self.batch_size = 32  # batch size for mini-batch SGD
+        self.target_update = 4  # update frequency of the target network
+        self.hidden_dim = 512  # hidden layer size of the network
+        ################################################################################
+        
+        ############################## result saving parameters #############################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # path for saving results
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # path for saving models
+        self.save = True # whether to save figures
+        ################################################################################
+
+
+def env_agent_config(cfg):
+    ''' Create the environment and the agent; RNGs are seeded first when cfg.seed != 0
+    '''
+    env = gym.make(cfg.env_name)  # create the environment
+    if cfg.seed !=0: # seed before creating the agent so any random initialisation inside it is reproducible
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    n_states = env.observation_space.shape[0]  # state dimension
+    n_actions = env.action_space.n  # number of actions
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    agent = DQN(n_states, n_actions, cfg)  # create the agent; NOTE(review): task0.py in this commit calls dqn.DQN(n_actions, model, cfg) — confirm the signature
+    return env, agent
+
+
+def train(cfg, env, agent):
+    ''' Train the agent for cfg.train_eps episodes and return (rewards, ma_rewards)
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = []  # reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # reward accumulated within the current episode
+        state = env.reset()  # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state)  # choose an action
+            next_state, reward, done, _ = env.step(action)  # step the environment to obtain the transition
+            agent.memory.push(state, action, reward,
+                              next_state, done)  # store the transition
+            state = next_state  # move on to the next state
+            agent.update()  # update the agent
+            ep_reward += reward  # accumulate the reward
+            if done:
+                break
+        if (i_ep + 1) % cfg.target_update == 0:  # copy the policy network into the target network every cfg.target_update episodes
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep + 1) % 1 == 0:  # always true: log after every episode
+            print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('完成训练！')
+    env.close()
+    return rewards, ma_rewards
+
+
+def test(cfg, env, agent):  # run cfg.test_eps episodes without storing transitions or updating the agent
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    ############# testing does not need the epsilon-greedy policy, so the values are set to 0 ###############
+    cfg.epsilon_start = 0.0  # initial epsilon of the e-greedy policy
+    cfg.epsilon_end = 0.0  # final epsilon of the e-greedy policy
+    # NOTE(review): the agent already exists when these are assigned; whether they take effect depends on how DQN reads cfg — confirm
+    rewards = []  # reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0  # reward accumulated within the current episode
+        state = env.reset()  # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state)  # choose an action
+            next_state, reward, done, _ = env.step(action)  # step the environment to obtain the transition
+            state = next_state  # move on to the next state
+            ep_reward += reward  # accumulate the reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return rewards, ma_rewards
+
+
+if __name__ == "__main__":
+    cfg = Config()
+    # training
+    env, agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the folders for results and models
+    agent.save(path=cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                 path=cfg.result_path)  # save the results
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the results
+    # testing, with a freshly created environment and agent
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # load the saved model
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # save the results
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the results
--- a/codes/DQN/test
+++ b/codes/DQN/test
@@ -0,0 +1,184 @@
+import random
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import os
+import gym
+import time
+from collections import deque
+from tensorflow.keras import optimizers
+from keras.models import Sequential
+from keras.layers import Dense, Dropout
+from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
+import matplotlib.pyplot as plt
+
+class DQN:  # DQN agent with a replay memory and separate evaluation / target Keras networks
+    def __init__(self, env):
+        self.env = env
+        self.memory = deque(maxlen=400000)  # replay memory; the deque drops the oldest entries beyond maxlen
+        self.gamma = 0.99  # discount applied to the target network's max Q-value in replay()
+        self.epsilon = 1.0  # current probability of taking a random action
+        self.epsilon_min = 0.01  # epsilon is no longer reduced once it is at or below this value
+        self.epsilon_decay =  self.epsilon_min / 500000  # subtracted from epsilon per choose_action call; NOTE(review): 2e-8 per step is very slow — confirm intended
+        
+        self.batch_size = 32  # number of transitions sampled per replay() call
+        self.train_start = 1000  # replay() does nothing until the memory holds this many transitions
+        self.state_size = self.env.observation_space.shape[0]*4  # transform() yields four values per observation entry
+        self.action_size = self.env.action_space.n
+        self.learning_rate = 0.00025
+        
+        self.evaluation_model = self.create_model()  # network that is trained and used to act
+        self.target_model = self.create_model()  # network used for bootstrap targets; synced by target_train()
+        
+    def create_model(self):  # build and compile a dense network: three 256-unit ReLU layers and a linear output per action
+        model = Sequential()
+        model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
+        model.add(Dense(128*2, activation='relu'))
+        model.add(Dense(128*2, activation='relu'))
+        model.add(Dense(self.env.action_space.n, activation='linear'))
+        model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))  # NOTE(review): `decay` may be learning-rate decay rather than RMSprop's rho — confirm against the Keras version in use
+        return model
+    
+    def choose_action(self, state, steps):  # epsilon-greedy action selection; `steps` gates the start of epsilon decay
+        if steps > 50000:  # epsilon stays at its initial value for the first 50000 steps
+            if self.epsilon > self.epsilon_min:
+                self.epsilon -= self.epsilon_decay
+        if np.random.random() < self.epsilon:
+            return self.env.action_space.sample()  # explore: random action
+        return np.argmax(self.evaluation_model.predict(state)[0])  # exploit: action with the highest predicted value
+        
+    def remember(self, cur_state, action, reward, new_state, done):  # store one transition in the replay memory
+        if not hasattr(self, 'memory_counter'):  # the counter is created lazily on the first call
+            self.memory_counter = 0
+        
+        transition = (cur_state, action, reward, new_state, done)
+        self.memory.extend([transition])  # extending with a one-element list adds a single transition
+        
+        self.memory_counter += 1
+    
+    def replay(self):  # fit the evaluation model on one randomly sampled mini-batch
+        if len(self.memory) < self.train_start:
+            return
+        
+        mini_batch = random.sample(self.memory, self.batch_size)
+        
+        update_input = np.zeros((self.batch_size, self.state_size))
+        update_target = np.zeros((self.batch_size, self.action_size))
+        
+        for i in range(self.batch_size):
+            state, action, reward, new_state, done = mini_batch[i]
+            target = self.evaluation_model.predict(state)[0]  # start from current predictions so only the taken action's target changes
+        
+            if done:
+                target[action] = reward  # no bootstrap term on terminal transitions
+            else:
+                target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])
+            
+            update_input[i] = state
+            update_target[i] = target
+    
+        self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
+    
+    def target_train(self):  # copy the evaluation model's weights into the target model
+        self.target_model.set_weights(self.evaluation_model.get_weights())
+        return
+    
+    def visualize(self, reward, episode):  # plot reward against episode and show the figure
+        plt.plot(episode, reward, 'ob-')
+        plt.title('Average reward each 100 episode')
+        plt.ylabel('Reward')
+        plt.xlabel('Episodes')
+        plt.grid()
+        plt.show()
+    
+    def transform(self,state):  # expand each entry of state[0] into four 2-bit integers
+        if state.shape[1]==512:  # already expanded: returned unchanged
+            return state
+        a=[np.binary_repr(x,width=8) for x in state[0]]  # binary strings zero-padded to width 8
+        res=[]
+        for x in a:
+            res.extend([x[:2],x[2:4],x[4:6],x[6:]])  # split each string into four 2-character pieces
+        res=[int(x,2) for x in res]  # parse every piece as a base-2 integer
+        return np.array(res)  # 1-D array; the caller in main() reshapes it
+        
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+def main():  # train the Keras DQN agent on Breakout-ram-v0 and plot the 100-episode average reward
+    # env = gym.make('Breakout-ram-v0')
+    env = gym.make('Breakout-ram-v0')
+    env = env.unwrapped
+    
+    print(env.action_space)
+    print(env.observation_space.shape[0])
+    print(env.observation_space.high)
+    print(env.observation_space.low)
+    
+    #print(env.observation_space.shape)
+    
+    
+    episodes = 5000  # number of episodes to run
+    trial_len = 10000  # upper bound on steps per episode
+    
+    tmp_reward=0  # raw reward of the current episode, printed then reset
+    sum_rewards = 0  # raw reward accumulated over the current 100-episode window
+    n_success = 0
+    total_steps = 0  # environment steps taken across all episodes
+    
+    graph_reward = []  # average reward of each 100-episode window
+    graph_episodes = []  # episode index at which each window ended
+    time_record = []  # wall-clock duration of every episode
+    
+    dqn_agent = DQN(env=env)
+    for i_episode in range(episodes):
+        start_time = time.time()
+        total_reward = 0
+        cur_state = env.reset().reshape(1,128)
+        cur_state=dqn_agent.transform(cur_state).reshape(1,128*4)/4  # expand to 512 two-bit values and scale them below 1
+        i_step=0
+        for step in range(trial_len):
+            #env.render()
+            i_step+=1
+            action = dqn_agent.choose_action(cur_state, total_steps)
+            new_state, reward, done, _ = env.step(action)
+            new_state = new_state.reshape(1, 128)
+            new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
+            total_reward += reward
+            sum_rewards += reward
+            tmp_reward += reward
+            if reward>0:    # clip positive rewards to 1 for learning; the statistics above keep the raw reward
+                reward=1
+            
+            dqn_agent.remember(cur_state, action, reward, new_state, done)
+            if total_steps > 10000:  # learning starts only after 10000 steps have been collected
+                if total_steps%4 == 0:  # one replay update every 4 steps
+                    dqn_agent.replay()
+                if total_steps%5000 == 0:  # sync the target network every 5000 steps
+                    dqn_agent.target_train()
+            
+            cur_state = new_state
+            total_steps += 1
+            if done:
+                env.reset()  # result unused; the next episode resets the environment again
+                break
+        if (i_episode+1) % 100 == 0:
+            graph_reward.append(sum_rewards/100)
+            graph_episodes.append(i_episode+1)
+            sum_rewards = 0
+            print("Episode ",i_episode+1," Reward: ")
+            print(graph_reward[-1])
+        end_time = time.time()
+        time_record.append(end_time-start_time)
+        print("NOW in episode: " + str(i_episode))
+        print("Time cost: " + str(end_time-start_time))
+        print("Reward: ",tmp_reward)
+        print("Step:", i_step)
+        tmp_reward=0
+    print("Reward: ")
+    print(graph_reward)
+    print("Episode: ")
+    print(graph_episodes)
+    print("Average_time: ")
+    print(sum(time_record)/episodes)  # divide by the configured episode count rather than a repeated literal
+    dqn_agent.visualize(graph_reward, graph_episodes)
+    
+if __name__ == '__main__':
+    main()
--- a/codes/Docs/使用DDPG解决倒立摆问题.md
+++ b/codes/Docs/使用DDPG解决倒立摆问题.md
@@ -90,15 +90,15 @@ class OUNoise(object):
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
-        self.action_dim   = action_space.shape[0]
+        self.n_actions   = action_space.shape[0]
        self.low          = action_space.low
        self.high         = action_space.high
        self.reset()
    def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
    def evolve_obs(self):
        x  = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
        self.obs = x + dx
        return self.obs
    def get_action(self, action, t=0):
--- a/codes/Docs/使用DQN解决推车杆问题.md
+++ b/codes/Docs/使用DQN解决推车杆问题.md
@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境，如下图，它通过向左(动作=0
 import gym
 env = gym.make('CartPole-v0')  # 建立环境
 env.seed(1) # 随机种子
-state_dim = env.observation_space.shape[0] # 状态维度
-action_dim = env.action_space.n # 动作维度
+n_states = env.observation_space.shape[0] # 状态维度
+n_actions = env.action_space.n # 动作维度
 state = env.reset() # 初始化环境
-print(f"状态维度：{state_dim}，动作维度：{action_dim}")
+print(f"状态维度：{n_states}，动作维度：{n_actions}")
 print(f"初始状态：{state}")
 ```

@@ -157,7 +157,7 @@ def choose_action(self, state):
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # 选择Q值最大的动作
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
 ```

 可以看到跟Q学习算法其实是一样的，都是用的$\epsilon-greedy$策略，只是使用神经网络的话我们需要通过Torch或者Tensorflow工具来处理相应的数据。
--- a/codes/Docs/使用Q-learning解决悬崖寻路问题.md
+++ b/codes/Docs/使用Q-learning解决悬崖寻路问题.md
@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
 这里我们在程序中使用了一个装饰器重新定义环境，但不影响对环境的理解，感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好，所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可，然后我们可以查看环境的状态和动作维度目：

 ```python
-state_dim = env.observation_space.n # 状态维度
-action_dim = env.action_space.n # 动作维度
-print(f"状态维度：{state_dim}，动作维度：{action_dim}")
+n_states = env.observation_space.n # 状态维度
+n_actions = env.action_space.n # 动作维度
+print(f"状态维度：{n_states}，动作维度：{n_actions}")
 ```

 打印出来的结果如下：
@@ -72,9 +72,9 @@ print(state)
 env = gym.make('CliffWalking-v0')  # 定义环境
 env = CliffWalkingWapper(env) # 装饰环境
 env.seed(1) # 设置随机种子
-state_dim = env.observation_space.n # 状态维度
-action_dim = env.action_space.n # 动作维度
-agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数
+n_states = env.observation_space.n # 状态维度
+n_actions = env.action_space.n # 动作维度
+agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
    ep_reward = 0  # 记录每个回合的奖励
    state = env.reset()  # 重置环境
@@ -126,7 +126,7 @@ def choose_action(self, state):
      if np.random.uniform(0, 1) > self.epsilon:
          action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
      else:
-          action = np.random.choice(self.action_dim) # 随机选择动作
+          action = np.random.choice(self.n_actions) # 随机选择动作
      return action
 ```

--- a/codes/DoubleDQN/double_dqn.py
+++ b/codes/DoubleDQN/double_dqn.py
@@ -46,15 +46,15 @@ class ReplayBuffer:
        return len(self.buffer)

 class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
+    def __init__(self, n_states,n_actions,hidden_dim=128):
        """ 初始化q网络，为全连接网络
-            state_dim: 输入的特征数即环境的状态维度
-            action_dim: 输出的动作维度
+            n_states: 输入的特征数即环境的状态维度
+            n_actions: 输出的动作维度
        """
        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
+        self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
+        self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
        
    def forward(self, x):
        # 各层对应的激活函数
@@ -63,8 +63,8 @@ class MLP(nn.Module):
        return self.fc3(x)
        
 class DoubleDQN:
-    def __init__(self, state_dim, action_dim, cfg):
-        self.action_dim = action_dim  # 总的动作个数
+    def __init__(self, n_states, n_actions, cfg):
+        self.n_actions = n_actions  # 总的动作个数
        self.device = cfg.device  # 设备，cpu或gpu等
        self.gamma = cfg.gamma
        # e-greedy策略相关参数
@@ -73,8 +73,8 @@ class DoubleDQN:
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
        # target_net copy from policy_net
        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(param.data)
@@ -103,7 +103,7 @@ class DoubleDQN:
                # 所以tensor.max(1)[1]返回最大值对应的下标，即action
                action = q_value.max(1)[1].item()  
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):

--- a/codes/DoubleDQN/task0.py
+++ b/codes/DoubleDQN/task0.py
@@ -59,9 +59,9 @@ class Config:
 def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)  
    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = DoubleDQN(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.n
+    agent = DoubleDQN(n_states,n_actions,cfg)
    return env,agent

 def train(cfg,env,agent):
--- a/codes/DuelingDQN/task0_train.ipynb
+++ b/codes/DuelingDQN/task0_train.ipynb
@@ -136,12 +136,12 @@
   "outputs": [],
   "source": [
    "class DuelingNet(nn.Module):\n",
-    "    def __init__(self, state_dim, action_dim,hidden_size=128):\n",
+    "    def __init__(self, n_states, n_actions,hidden_size=128):\n",
    "        super(DuelingNet, self).__init__()\n",
    "        \n",
    "        # 隐藏层\n",
    "        self.hidden = nn.Sequential(\n",
-    "            nn.Linear(state_dim, hidden_size),\n",
+    "            nn.Linear(n_states, hidden_size),\n",
    "            nn.ReLU()\n",
    "        )\n",
    "        \n",
@@ -149,7 +149,7 @@
    "        self.advantage = nn.Sequential(\n",
    "            nn.Linear(hidden_size, hidden_size),\n",
    "            nn.ReLU(),\n",
-    "            nn.Linear(hidden_size, action_dim)\n",
+    "            nn.Linear(hidden_size, n_actions)\n",
    "        )\n",
    "        \n",
    "        # 价值函数\n",
@@ -192,7 +192,7 @@
   ],
   "source": [
    "class DuelingDQN:\n",
-    "    def __init__(self,state_dim,action_dim,cfg) -> None:\n",
+    "    def __init__(self,n_states,n_actions,cfg) -> None:\n",
    "        self.batch_size = cfg.batch_size\n",
    "        self.device = cfg.device\n",
    "        self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
    "        self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
    "            (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
    "            math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
-    "        self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
-    "        self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n",
+    "        self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
+    "        self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
    "        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
    "            target_param.data.copy_(param.data)\n",
    "        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
    "                q_values = self.policy_net(state)\n",
    "                action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
    "        else:\n",
-    "            action = random.randrange(self.action_dim)\n",
+    "            action = random.randrange(self.n_actions)\n",
    "        return action\n",
    "    def update(self):\n",
    "        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略\n",
--- a/codes/HierarchicalDQN/agent.py
+++ b/codes/HierarchicalDQN/agent.py
@@ -57,16 +57,16 @@ class MLP(nn.Module):
        return self.fc3(x)
        
 class HierarchicalDQN:
-    def __init__(self,state_dim,action_dim,cfg):
-        self.state_dim = state_dim
-        self.action_dim = action_dim
+    def __init__(self,n_states,n_actions,cfg):
+        self.n_states = n_states
+        self.n_actions = n_actions
        self.gamma = cfg.gamma
        self.device = cfg.device
        self.batch_size = cfg.batch_size
        self.frame_idx = 0  # 用于epsilon的衰减计数
        self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
-        self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
-        self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
+        self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device)
+        self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
        self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
        self.memory = ReplayBuffer(cfg.memory_capacity)
@@ -76,7 +76,7 @@ class HierarchicalDQN:
        self.losses = []
        self.meta_losses = []
    def to_onehot(self,x):
-        oh = np.zeros(self.state_dim)
+        oh = np.zeros(self.n_states)
        oh[x - 1] = 1.
        return oh
    def set_goal(self,state):
@@ -85,7 +85,7 @@ class HierarchicalDQN:
                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
                goal = self.meta_policy_net(state).max(1)[1].item() 
        else:
-            goal = random.randrange(self.state_dim)
+            goal = random.randrange(self.n_states)
        return goal
    def choose_action(self,state):
        self.frame_idx += 1
@@ -95,7 +95,7 @@ class HierarchicalDQN:
                q_value = self.policy_net(state)
                action = q_value.max(1)[1].item()  
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):
        self.update_policy()
--- a/codes/HierarchicalDQN/task0.py
+++ b/codes/HierarchicalDQN/task0.py
@@ -63,9 +63,9 @@ class PlotConfig:
 def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)  
    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = HierarchicalDQN(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.n
+    agent = HierarchicalDQN(n_states,n_actions,cfg)
    return env,agent

 if __name__ == "__main__":
--- a/codes/LICENSE
+++ b/codes/LICENSE
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2020 John Jim
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/codes/Logs.md
+++ b/codes/Logs.md
@@ -1,7 +0,0 @@
-## 记录笔者更新的日志
-
-**2021.12.28-1**：将```task.py```中的两个Config类合并为一个，并加以注释便于阅读，从DQN算法开始更新
-
-**2021.12.22-3**：将```agent.py```更改为对应的算法名称，便于区分如```dqn```与```dqn_cnn```的情况  
-**2021.12.22-2**：简化了代码结构，将原来的```train.py```和```task.py```等合并到```task.py```中  
-**2021.12.22-1**：简化了代码结构，将原来的```model.py```和```memory.py```等合并到```agent.py```中，```plot.py```的内容合并到```common.utils.py```中
--- a/codes/MonteCarlo/agent.py
+++ b/codes/MonteCarlo/agent.py
@@ -17,11 +17,11 @@ import dill
 class FisrtVisitMC:
    ''' On-Policy First-Visit MC Control
    '''
-    def __init__(self,action_dim,cfg):
-        self.action_dim = action_dim
+    def __init__(self,n_actions,cfg):
+        self.n_actions = n_actions
        self.epsilon = cfg.epsilon
        self.gamma = cfg.gamma 
-        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
+        self.Q_table = defaultdict(lambda: np.zeros(n_actions))
        self.returns_sum = defaultdict(float) # sum of returns
        self.returns_count = defaultdict(float)
        
@@ -29,11 +29,11 @@ class FisrtVisitMC:
        ''' e-greed policy '''
        if state in self.Q_table.keys():
            best_action = np.argmax(self.Q_table[state])
-            action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
+            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
            action_probs[best_action] += (1.0 - self.epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
-            action = np.random.randint(0,self.action_dim)
+            action = np.random.randint(0,self.n_actions)
        return action
    def update(self,one_ep_transition):
        # Find all (state, action) pairs we've visited in this one_ep_transition
--- a/codes/MonteCarlo/task0_train.py
+++ b/codes/MonteCarlo/task0_train.py
@@ -43,8 +43,8 @@ class MCConfig:

 def env_agent_config(cfg,seed=1):
    env = RacetrackEnv()
-    action_dim = 9
-    agent = FisrtVisitMC(action_dim, cfg)
+    n_actions = 9
+    agent = FisrtVisitMC(n_actions, cfg)
    return env,agent
    
 def train(cfg, env, agent):
--- a/codes/PPO/README.md
+++ b/codes/PPO/README.md
@@ -57,16 +57,16 @@ model就是actor和critic两个网络了：
 import torch.nn as nn
 from torch.distributions.categorical import Categorical
 class Actor(nn.Module):
-    def __init__(self,state_dim, action_dim,
+    def __init__(self,n_states, n_actions,
            hidden_dim=256):
        super(Actor, self).__init__()

        self.actor = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
+                nn.Linear(n_states, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
-                nn.Linear(hidden_dim, action_dim),
+                nn.Linear(hidden_dim, n_actions),
                nn.Softmax(dim=-1)
        )
    def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
        return dist

 class Critic(nn.Module):
-    def __init__(self, state_dim,hidden_dim=256):
+    def __init__(self, n_states,hidden_dim=256):
        super(Critic, self).__init__()
        self.critic = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
+                nn.Linear(n_states, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
        value = self.critic(state)
        return value
 ```
-这里Actor就是得到一个概率分布(Categorica，也可以是别的分布，可以搜索torch distributionsl)，critc根据当前状态得到一个值，这里的输入维度可以是```state_dim+action_dim```，即将action信息也纳入critic网络中，这样会更好一些，感兴趣的小伙伴可以试试。
+这里Actor就是得到一个概率分布(Categorical，也可以是别的分布，可以搜索torch distributions)，critic根据当前状态得到一个值，这里的输入维度可以是```n_states+n_actions```，即将action信息也纳入critic网络中，这样会更好一些，感兴趣的小伙伴可以试试。

 ### PPO update
 定义一个update函数主要实现伪代码中的第六步和第七步：
--- a/codes/PPO/memory.py
+++ b/codes/PPO/memory.py
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-23 15:30:46
-LastEditor: John
-LastEditTime: 2021-09-26 22:00:07
-Discription: 
-Environment: 
-'''
-import numpy as np
-class PPOMemory:
-    def __init__(self, batch_size):
-        self.states = []
-        self.probs = []
-        self.vals = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.batch_size = batch_size
-    def sample(self):
-        batch_step = np.arange(0, len(self.states), self.batch_size)
-        indices = np.arange(len(self.states), dtype=np.int64)
-        np.random.shuffle(indices)
-        batches = [indices[i:i+self.batch_size] for i in batch_step]
-        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
-                np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
-                
-    def push(self, state, action, probs, vals, reward, done):
-        self.states.append(state)
-        self.actions.append(action)
-        self.probs.append(probs)
-        self.vals.append(vals)
-        self.rewards.append(reward)
-        self.dones.append(done)
-
-    def clear(self):
-        self.states = []
-        self.probs = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.vals = []
--- a/codes/PPO/model.py
+++ b/codes/PPO/model.py
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-23 15:29:24
-LastEditor: John
-LastEditTime: 2021-04-08 22:36:43
-Discription: 
-Environment: 
-'''
-import torch.nn as nn
-from torch.distributions.categorical import Categorical
-class Actor(nn.Module):
-    def __init__(self,state_dim, action_dim,
-            hidden_dim):
-        super(Actor, self).__init__()
-
-        self.actor = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, action_dim),
-                nn.Softmax(dim=-1)
-        )
-    def forward(self, state):
-        dist = self.actor(state)
-        dist = Categorical(dist)
-        return dist
-
-class Critic(nn.Module):
-    def __init__(self, state_dim,hidden_dim):
-        super(Critic, self).__init__()
-        self.critic = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, 1)
-        )
-    def forward(self, state):
-        value = self.critic(state)
-        return value
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_actor.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_actor.pt
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_critic.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_critic.pt
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/eval_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/eval_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_actor.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_actor.pt
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_critic.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_critic.pt
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards_curve.png
--- a/codes/PPO/agent.py
+++ b/codes/PPO/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-23 15:17:42
 LastEditor: John
-LastEditTime: 2021-09-26 22:02:00
+LastEditTime: 2021-12-31 19:38:33
 Discription: 
 Environment: 
 '''
@@ -13,25 +13,89 @@ import os
 import numpy as np
 import torch 
 import torch.optim as optim
-from PPO.model import Actor,Critic
-from PPO.memory import PPOMemory
+import torch.nn as nn
+from torch.distributions.categorical import Categorical
+class PPOMemory:
+    def __init__(self, batch_size):
+        self.states = []
+        self.probs = []
+        self.vals = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.batch_size = batch_size
+    def sample(self):
+        batch_step = np.arange(0, len(self.states), self.batch_size)
+        indices = np.arange(len(self.states), dtype=np.int64)
+        np.random.shuffle(indices)
+        batches = [indices[i:i+self.batch_size] for i in batch_step]
+        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
+                np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
+                
+    def push(self, state, action, probs, vals, reward, done):
+        self.states.append(state)
+        self.actions.append(action)
+        self.probs.append(probs)
+        self.vals.append(vals)
+        self.rewards.append(reward)
+        self.dones.append(done)
+
+    def clear(self):
+        self.states = []
+        self.probs = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.vals = []
+class Actor(nn.Module):
+    def __init__(self,n_states, n_actions,
+            hidden_dim):
+        super(Actor, self).__init__()
+
+        self.actor = nn.Sequential(
+                nn.Linear(n_states, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, n_actions),
+                nn.Softmax(dim=-1)
+        )
+    def forward(self, state):
+        dist = self.actor(state)
+        dist = Categorical(dist)
+        return dist
+
+class Critic(nn.Module):
+    def __init__(self, n_states,hidden_dim):
+        super(Critic, self).__init__()
+        self.critic = nn.Sequential(
+                nn.Linear(n_states, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 1)
+        )
+    def forward(self, state):
+        value = self.critic(state)
+        return value
 class PPO:
-    def __init__(self, state_dim, action_dim,cfg):
+    def __init__(self, n_states, n_actions,cfg):
        self.gamma = cfg.gamma
        self.continuous = cfg.continuous 
        self.policy_clip = cfg.policy_clip
        self.n_epochs = cfg.n_epochs
        self.gae_lambda = cfg.gae_lambda
        self.device = cfg.device
-        self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
-        self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
+        self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
+        self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
        self.memory = PPOMemory(cfg.batch_size)
        self.loss = 0

    def choose_action(self, state):
-        state = torch.tensor([state], dtype=torch.float).to(self.device)
+        state = np.array([state]) # 先转成数组再转tensor更高效
+        state = torch.tensor(state, dtype=torch.float).to(self.device)
        dist = self.actor(state)
        value = self.critic(state)
        action = dist.sample()
--- a/codes/PPO/task0.py
+++ b/codes/PPO/task0.py
@@ -5,63 +5,127 @@ sys.path.append(parent_path) # 添加路径到系统路径

 import gym
 import torch
+import numpy as np
 import datetime
-from common.plot import plot_rewards
+from common.utils import plot_rewards
 from common.utils import save_results,make_dir
-from PPO.agent import PPO
-from PPO.train import train
+from ppo2 import PPO

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

-class PPOConfig:
+class Config:
    def __init__(self) -> None:
-        self.algo = "DQN"  # 算法名称
+        ################################## 环境超参数 ###################################
+        self.algo_name = "DQN"  # 算法名称
        self.env_name = 'CartPole-v0' # 环境名称
        self.continuous = False # 环境是否为连续动作
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 200 # 训练的回合数
        self.test_eps = 20 # 测试的回合数
-        self.batch_size = 5
-        self.gamma=0.99
+        ################################################################################
+        
+        ################################## 算法超参数 ####################################
+        self.batch_size = 5  # mini-batch SGD中的批量大小
+        self.gamma = 0.95  # 强化学习中的折扣因子
        self.n_epochs = 4
-        self.actor_lr = 0.0003
-        self.critic_lr = 0.0003
-        self.gae_lambda=0.95
-        self.policy_clip=0.2
+        self.actor_lr = 0.0003 # actor的学习率
+        self.critic_lr = 0.0003 # critic的学习率
+        self.gae_lambda = 0.95
+        self.policy_clip = 0.2
        self.hidden_dim = 256
-        self.update_fre = 20 # frequency of agent update
-
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo = "DQN"  # 算法名称
-        self.env_name = 'CartPole-v0' # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.update_fre = 20 # 策略更新频率
+        ################################################################################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/results/'  # 保存结果的路径
        self.model_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/models/'  # 保存模型的路径
        self.save = True # 是否保存图片
+        ################################################################################
+        
+def env_agent_config(cfg):
+    ''' 创建环境和智能体
+    '''
+    env = gym.make(cfg.env_name)  # 创建环境
+    n_states = env.observation_space.shape[0]  # 状态维度
+    if cfg.continuous:
+        n_actions = env.action_space.shape[0] # 动作维度
+    else:
+        n_actions = env.action_space.n  # 动作维度
+    agent = PPO(n_states, n_actions, cfg)  # 创建智能体
+    if cfg.seed !=0: # 设置随机种子
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    return env, agent

-def env_agent_config(cfg,seed=1):
-    env = gym.make(cfg.env_name)  
-    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = PPO(state_dim,action_dim,cfg)
-    return env,agent
+def train(cfg,env,agent):
+    print('开始训练！')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = 0
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            action, prob, val = agent.choose_action(state)
+            state_, reward, done, _ = env.step(action)
+            steps += 1
+            ep_reward += reward
+            agent.memory.push(state, action, prob, val, reward, done)
+            if steps % cfg.update_fre == 0:
+                agent.update()
+            state = state_
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep+1)%10 == 0: 
+            print(f"回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward:.2f}")
+    print('完成训练！')
+    return rewards,ma_rewards

-cfg  = PPOConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
-agent.save(path=plot_cfg.model_path)
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)
-rewards,ma_rewards = eval(cfg,env,agent)
-save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
-plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            action, prob, val = agent.choose_action(state)
+            state_, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = state_
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.test_eps, ep_reward))
+    print('完成训练！')
+    return rewards,ma_rewards
+
+if __name__ == "__main__":
+    cfg  = Config()
+    # 训练
+    env,agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")
+    # 测试
+    env,agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = test(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
+    plot_rewards(rewards,ma_rewards,cfg,tag="test")
--- a/codes/PPO/task1.py
+++ b/codes/PPO/task1.py
@@ -6,10 +6,9 @@ sys.path.append(parent_path) # 添加路径到系统路径
 import gym
 import torch
 import datetime
-from common.plot import plot_rewards
+from common.utils import plot_rewards
 from common.utils import save_results,make_dir
-from PPO.agent import PPO
-from PPO.train import train
+from ppo2 import PPO

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

@@ -45,9 +44,9 @@ class PlotConfig:
 def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)  
    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
-    agent = PPO(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = PPO(n_states,n_actions,cfg)
    return env,agent


--- a/codes/PPO/train.ipynb
+++ b/codes/PPO/train.ipynb
--- a/codes/PPO/train.py
+++ b/codes/PPO/train.py
@@ -1,121 +0,0 @@
-def train(cfg,env,agent):
-    print('开始训练！')
-    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    steps = 0
-    for i_ep in range(cfg.train_eps):
-        state = env.reset()
-        done = False
-        ep_reward = 0
-        while not done:
-            action, prob, val = agent.choose_action(state)
-            state_, reward, done, _ = env.step(action)
-            steps += 1
-            ep_reward += reward
-            agent.memory.push(state, action, prob, val, reward, done)
-            if steps % cfg.update_fre == 0:
-                agent.update()
-            state = state_
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        if (i_ep+1)%10 == 0: 
-            print(f"回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward:.2f}")
-    print('完成训练！')
-    return rewards,ma_rewards
-
-def eval(cfg,env,agent):
-    print('开始测试!')
-    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.test_eps):
-        state = env.reset()
-        done = False
-        ep_reward = 0
-        while not done:
-            action, prob, val = agent.choose_action(state)
-            state_, reward, done, _ = env.step(action)
-            ep_reward += reward
-            state = state_
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.test_eps, ep_reward))
-    print('完成训练！')
-    return rewards,ma_rewards
-
-if __name__ == '__main__':
-    import sys,os
-    curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-    parent_path = os.path.dirname(curr_path) # 父路径
-    sys.path.append(parent_path) # 添加路径到系统路径
-
-    import gym
-    import torch
-    import datetime
-    from common.plot import plot_rewards
-    from common.utils import save_results,make_dir
-    from PPO.agent import PPO
-    from PPO.train import train
-
-    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-
-    class PPOConfig:
-        def __init__(self) -> None:
-            self.algo = "DQN"  # 算法名称
-            self.env_name = 'CartPole-v0' # 环境名称
-            self.continuous = False # 环境是否为连续动作
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
-            self.train_eps = 200 # 训练的回合数
-            self.test_eps = 20 # 测试的回合数
-            self.batch_size = 5
-            self.gamma=0.99
-            self.n_epochs = 4
-            self.actor_lr = 0.0003
-            self.critic_lr = 0.0003
-            self.gae_lambda=0.95
-            self.policy_clip=0.2
-            self.hidden_dim = 256
-            self.update_fre = 20 # frequency of agent update
-
-    class PlotConfig:
-        def __init__(self) -> None:
-            self.algo = "DQN"  # 算法名称
-            self.env_name = 'CartPole-v0' # 环境名称
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
-            self.result_path = curr_path+"/outputs/" + self.env_name + \
-                '/'+curr_time+'/results/'  # 保存结果的路径
-            self.model_path = curr_path+"/outputs/" + self.env_name + \
-                '/'+curr_time+'/models/'  # 保存模型的路径
-            self.save = True # 是否保存图片
-
-    def env_agent_config(cfg,seed=1):
-        env = gym.make(cfg.env_name)  
-        env.seed(seed)
-        state_dim = env.observation_space.shape[0]
-        action_dim = env.action_space.n
-        agent = PPO(state_dim,action_dim,cfg)
-        return env,agent
-
-    cfg  = PPOConfig()
-    plot_cfg = PlotConfig()
-    # 训练
-    env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
-    agent.save(path=plot_cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
-    # 测试
-    env,agent = env_agent_config(cfg,seed=10)
-    agent.load(path=plot_cfg.model_path)
-    rewards,ma_rewards = eval(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
-    plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
--- a/codes/PolicyGradient/model.py
+++ b/codes/PolicyGradient/model.py
@@ -1,31 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-23 16:35:58
-LastEditor: John
-LastEditTime: 2021-12-21 23:21:26
-Discription: 
-Environment: 
-'''
-import torch.nn as nn
-import torch.nn.functional as F
-class MLP(nn.Module):
-    
-    ''' 多层感知机
-        输入：state维度
-        输出：概率
-    '''
-    def __init__(self,input_dim,hidden_dim = 36):
-        super(MLP, self).__init__()
-        # 24和36为hidden layer的层数，可根据input_dim, action_dim的情况来改变
-        self.fc1 = nn.Linear(input_dim, hidden_dim)
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
-        self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
-
-    def forward(self, x):
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        x = F.sigmoid(self.fc3(x))
-        return x
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/models/pg_checkpoint.pt
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/models/pg_checkpoint.pt
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_ma_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_ma_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_rewards_curve.png
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/eval_rewards_curve.png
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_ma_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_ma_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_rewards_curve.png
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20210505-173524/results/train_rewards_curve.png
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/models/pg_checkpoint.pt
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/models/pg_checkpoint.pt
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_ma_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_ma_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_rewards_curve.png
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/test_rewards_curve.png
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_ma_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_ma_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_rewards.npy
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_rewards.npy
--- a/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_rewards_curve.png
+++ b/codes/PolicyGradient/outputs/CartPole-v0/20220210-061325/results/train_rewards_curve.png
--- a/codes/PolicyGradient/agent.py
+++ b/codes/PolicyGradient/agent.py
@@ -5,21 +5,41 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:27:44
 LastEditor: John
-LastEditTime: 2021-10-16 00:43:52
+LastEditTime: 2022-02-10 01:25:27
 Discription: 
 Environment: 
 '''
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from torch.distributions import Bernoulli
 from torch.autograd import Variable
 import numpy as np
-from PolicyGradient.model import MLP

+class MLP(nn.Module):
+    ''' Multi-layer perceptron used as the policy network.
+        Input: state features (input_dim values per state)
+        Output: a single sigmoid-squashed value, interpreted as a probability
+    '''
+
+    def __init__(self,input_dim,hidden_dim = 36):
+        super(MLP, self).__init__()
+        # hidden_dim is the number of units in each hidden layer (not the number of layers); tune it to the task
+        self.fc1 = nn.Linear(input_dim, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
+        self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
+
+    def forward(self, x):
+        ''' Two ReLU hidden layers, then a sigmoid output; torch.sigmoid replaces the deprecated F.sigmoid. '''
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return torch.sigmoid(self.fc3(x))
+        
 class PolicyGradient:
    
-    def __init__(self, state_dim,cfg):
+    def __init__(self, n_states,cfg):
        self.gamma = cfg.gamma
-        self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim)
+        self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
        self.batch_size = cfg.batch_size

--- a/codes/PolicyGradient/task0.py
+++ b/codes/PolicyGradient/task0.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-22 23:21:53
+LastEditor: John
+LastEditTime: 2022-02-10 06:13:21
+Description: 
+Environment: 
+'''
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+from itertools import count
+
+from pg import PolicyGradient
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+class Config:
+    '''Hyperparameters and output locations for the PolicyGradient task.
+    '''
+
+    def __init__(self):
+        ################################ environment settings ################################
+        self.algo_name = "PolicyGradient"  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
+        use_cuda = torch.cuda.is_available()
+        self.device = torch.device("cuda" if use_cuda else "cpu")  # use the GPU when one is available
+        self.seed = 10  # random seed (0 is meant to disable seeding)
+        self.train_eps = 300  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+        ################################################################################
+
+        ################################ algorithm hyperparameters ################################
+        self.batch_size = 8  # batch size; train() uses it as the episode interval between agent updates
+        self.lr = 0.01  # learning rate
+        self.gamma = 0.99  # discount factor
+        self.hidden_dim = 36  # hidden layer size of the network
+        ################################################################################
+
+        ################################ result saving ################################
+        # every run writes below <script dir>/outputs/<env_name>/<start time>/
+        run_dir = curr_path + "/outputs/" + self.env_name + '/' + curr_time
+        self.result_path = run_dir + '/results/'  # where results are saved
+        self.model_path = run_dir + '/models/'  # where the model is saved
+        self.save = True  # whether to save the figures
+        ################################################################################
+
+
+def env_agent_config(cfg,seed=1):
+    '''Create the gym environment named by cfg.env_name, seed it with `seed`, and build a PolicyGradient agent for it.'''
+    environment = gym.make(cfg.env_name)
+    environment.seed(seed)  # NOTE: the `seed` argument is what gets applied here, not cfg.seed
+    obs_dim = environment.observation_space.shape[0]  # length of the first observation axis
+    return environment, PolicyGradient(obs_dim, cfg)
+
+def train(cfg,env,agent):
+    '''Run cfg.train_eps training episodes and return (rewards, ma_rewards).
+
+    Transitions are buffered across episodes and handed to agent.update()
+    whenever the zero-based episode index is a positive multiple of
+    cfg.batch_size. rewards holds each episode's total reward; ma_rewards is
+    its running average with weights 0.9 (previous) / 0.1 (new).
+    cfg supplies env_name, algo_name, device, train_eps and batch_size;
+    env is used through reset()/step()/close(); agent through
+    choose_action() and update().
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    states, actions, step_rewards = [], [], []  # buffers shared by every episode since the last update
+    rewards, ma_rewards = [], []
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        ep_reward = 0
+        done = False
+        while not done:
+            action = agent.choose_action(state)  # pick an action for the current state
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            states.append(state)
+            actions.append(float(action))
+            # the terminal step is stored with reward 0 (presumably an episode delimiter for agent.update — confirm)
+            step_rewards.append(0 if done else reward)
+            state = next_state
+        print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
+        if i_ep > 0 and i_ep % cfg.batch_size == 0:
+            agent.update(step_rewards, states, actions)
+            states, actions, step_rewards = [], [], []  # rebind to fresh lists rather than clearing the ones just passed
+        rewards.append(ep_reward)
+        # running average, seeded with the first episode's reward
+        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward if ma_rewards else ep_reward)
+    print('完成训练！')
+    env.close()
+    return rewards, ma_rewards
+            
+
+def test(cfg,env,agent):
+    '''Run cfg.test_eps evaluation episodes and return (rewards, ma_rewards).
+
+    No agent.update() is performed here; actions still come from
+    agent.choose_action(). rewards holds each episode's total reward and
+    ma_rewards its running average with weights 0.9 (previous) / 0.1 (new).
+    cfg supplies env_name, algo_name, device and test_eps; env is used through
+    reset()/step()/close().
+    '''
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards, ma_rewards = [], []
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        ep_reward = 0
+        done = False
+        while not done:
+            action = agent.choose_action(state)  # pick an action for the current state
+            state, reward, done, _ = env.step(action)
+            ep_reward += reward
+        # progress is reported against the number of test episodes, not training episodes
+        print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.test_eps, ep_reward))
+        rewards.append(ep_reward)
+        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward if ma_rewards else ep_reward)
+    print('完成测试！')
+    env.close()
+    return rewards, ma_rewards
+    
+if __name__ == "__main__":
+    # Script entry point: train, persist the model and curves, then reload and test.
+    cfg = Config()
+    # --- training ---
+    env, agent = env_agent_config(cfg)
+    train_rewards, train_ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the folders for results and models
+    agent.save(path=cfg.model_path)  # save the model
+    save_results(train_rewards, train_ma_rewards, tag='train', path=cfg.result_path)  # save the results
+    plot_rewards(train_rewards, train_ma_rewards, cfg, tag="train")  # plot the results
+    # --- testing ---
+    # a fresh env/agent pair is built, then the agent loads from the path the model was saved to
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # load the model
+    test_rewards, test_ma_rewards = test(cfg, env, agent)
+    save_results(test_rewards, test_ma_rewards, tag='test', path=cfg.result_path)  # save the results
+    plot_rewards(test_rewards, test_ma_rewards, cfg, tag="test")  # plot the results
+
--- a/codes/PolicyGradient/task0_train.py
+++ b/codes/PolicyGradient/task0_train.py
@@ -1,136 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2020-11-22 23:21:53
-LastEditor: John
-LastEditTime: 2021-10-16 00:34:13
-Discription: 
-Environment: 
-'''
-import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加父路径到系统路径sys.path
-
-import gym
-import torch
-import datetime
-from itertools import count
-
-from PolicyGradient.agent import PolicyGradient
-from common.plot import plot_rewards
-from common.utils import save_results,make_dir
-
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-
-class PGConfig:
-    def __init__(self):
-        self.algo = "PolicyGradient"  # 算法名称
-        self.env = 'CartPole-v0' # 环境名称
-        self.result_path = curr_path+"/outputs/" + self.env + \
-            '/'+curr_time+'/results/'  # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" + self.env + \
-            '/'+curr_time+'/models/'  # 保存模型的路径
-        self.train_eps = 300 # 训练的回合数
-        self.test_eps = 30 # 测试的回合数
-        self.batch_size = 8
-        self.lr = 0.01 # 学习率
-        self.gamma = 0.99
-        self.hidden_dim = 36 # dimmension of hidden layer
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # check gpu
-
-
-def env_agent_config(cfg,seed=1):
-    env = gym.make(cfg.env)  
-    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    agent = PolicyGradient(state_dim,cfg)
-    return env,agent
-
-def train(cfg,env,agent):
-    print('Start to eval !')
-    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
-    state_pool = [] # 存放每batch_size个episode的state序列
-    action_pool = []
-    reward_pool = [] 
-    rewards = []
-    ma_rewards = []
-    for i_ep in range(cfg.train_eps):
-        state = env.reset()
-        ep_reward = 0
-        for _ in count():
-            action = agent.choose_action(state) # 根据当前环境state选择action
-            next_state, reward, done, _ = env.step(action)
-            ep_reward += reward
-            if done:
-                reward = 0
-            state_pool.append(state)
-            action_pool.append(float(action))
-            reward_pool.append(reward)
-            state = next_state
-            if done:
-                print('Episode:', i_ep, ' Reward:',  ep_reward)
-                break
-        if i_ep > 0 and i_ep % cfg.batch_size == 0:
-            agent.update(reward_pool,state_pool,action_pool)
-            state_pool = [] # 每个episode的state
-            action_pool = []
-            reward_pool = []
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-    print('complete training！')
-    return rewards, ma_rewards
-            
-
-def eval(cfg,env,agent):
-    print('Start to eval !')
-    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
-    rewards = []
-    ma_rewards = []
-    for i_ep in range(cfg.test_eps):
-        state = env.reset()
-        ep_reward = 0
-        for _ in count():
-            action = agent.choose_action(state) # 根据当前环境state选择action
-            next_state, reward, done, _ = env.step(action)
-            ep_reward += reward
-            if done:
-                reward = 0
-            state = next_state
-            if done:
-                print('Episode:', i_ep, ' Reward:',  ep_reward)
-                break
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-    print('complete evaling！')
-    return rewards, ma_rewards
-    
-if __name__ == "__main__":
-    cfg = PGConfig()
-
-     # train
-    env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path)
-    agent.save(path=cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
-    plot_rewards(rewards, ma_rewards, tag="train",
-                 algo=cfg.algo, path=cfg.result_path)
-    # eval
-    env,agent = env_agent_config(cfg,seed=10)
-    agent.load(path=cfg.model_path)
-    rewards,ma_rewards = eval(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
-    plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
-
--- a/codes/QLearning/env/gridworld_env.py
+++ b/codes/QLearning/env/gridworld_env.py
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/results/test_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/results/test_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/results/test_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20220210-005501/results/test_rewards.npy
--- a/Show More
+++ b/Show More