Merge branch 'master' of github.com:datawhalechina/easy-rl

Committed by qiwang067 on 2022-06-01 23:08:03 +08:00
149 changed files with 1867 additions and 1542 deletions


@@ -40,10 +40,10 @@ class ActorCritic(nn.Module):
 class A2C:
     ''' A2C算法
     '''
-    def __init__(self,state_dim,action_dim,cfg) -> None:
+    def __init__(self,n_states,n_actions,cfg) -> None:
         self.gamma = cfg.gamma
         self.device = cfg.device
-        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
+        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
         self.optimizer = optim.Adam(self.model.parameters())
     def compute_returns(self,next_value, rewards, masks):
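The hunk stops at the `compute_returns` signature. For readers skimming the diff, a minimal sketch of the bootstrapped-return computation such a method typically performs (assuming `rewards` and `masks` are per-step sequences and `next_value` is the critic's estimate for the state after the last step) is:

```python
def compute_returns(next_value, rewards, masks, gamma=0.99):
    # Walk the rollout backwards, bootstrapping from the critic's value of the
    # final state; masks[step] == 0 cuts the bootstrap at episode boundaries.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns
```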


@@ -10,7 +10,7 @@ import torch
 import torch.optim as optim
 import datetime
 from common.multiprocessing_env import SubprocVecEnv
-from A2C.agent import ActorCritic
+from a2c import ActorCritic
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards
@@ -74,9 +74,9 @@ def train(cfg,envs):
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
     env = gym.make(cfg.env_name)  # a single env
     env.seed(10)
-    state_dim = envs.observation_space.shape[0]
-    action_dim = envs.action_space.n
-    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+    n_states = envs.observation_space.shape[0]
+    n_actions = envs.action_space.n
+    model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
     optimizer = optim.Adam(model.parameters())
     frame_idx = 0
     test_rewards = []


@@ -39,11 +39,11 @@ class ReplayBuffer:
         '''
         return len(self.buffer)
 class Actor(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
         super(Actor, self).__init__()
-        self.linear1 = nn.Linear(state_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
-        self.linear3 = nn.Linear(hidden_dim, action_dim)
+        self.linear3 = nn.Linear(hidden_dim, n_actions)
         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -54,10 +54,10 @@ class Actor(nn.Module):
         x = torch.tanh(self.linear3(x))
         return x
 class Critic(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+    def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
         super(Critic, self).__init__()
-        self.linear1 = nn.Linear(state_dim + action_dim, hidden_dim)
+        self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
         self.linear2 = nn.Linear(hidden_dim, hidden_dim)
         self.linear3 = nn.Linear(hidden_dim, 1)
         # 随机初始化为较小的值
@@ -72,12 +72,12 @@ class Critic(nn.Module):
         x = self.linear3(x)
         return x
 class DDPG:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):
         self.device = cfg.device
-        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
         # 复制参数到目标网络
         for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
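The `__init__` shown above only hard-copies the online weights into the target networks; the `cfg.soft_tau` hyperparameter that appears later in this commit drives the per-update soft (Polyak) update. A minimal sketch of that step, assuming the usual DDPG formulation rather than quoting this repository's exact code, is:

```python
def soft_update(target_net, net, soft_tau=1e-2):
    # target <- soft_tau * online + (1 - soft_tau) * target, parameter-wise
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        target_param.data.copy_(soft_tau * param.data + (1.0 - soft_tau) * target_param.data)
```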


@@ -39,15 +39,15 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.action_dim = action_space.shape[0]
+        self.n_actions = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
         self.obs = x + dx
         return self.obs
     def get_action(self, action, t=0):
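For reference, `evolve_obs` is a discretised Ornstein-Uhlenbeck step with unit time step, which is what the code above implements:

$$x_{t+1} = x_t + \theta(\mu - x_t) + \sigma\,\varepsilon_t,\qquad \varepsilon_t \sim \mathcal{N}(0, I)$$

`get_action` then adds this temporally correlated noise to the actor's output and, in the usual implementation, decays $\sigma$ from `max_sigma` to `min_sigma` over `decay_period` steps.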


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2021-09-16 01:31:33
+LastEditTime: 2022-02-10 06:23:27
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -18,23 +18,29 @@ import datetime
 import gym
 import torch
-from DDPG.env import NormalizedActions
-from DDPG.agent import DDPG
+from env import NormalizedActions,OUNoise
+from ddpg import DDPG
 from DDPG.train import train,test
 from common.utils import save_results,make_dir
 from common.utils import plot_rewards
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-algo_name = 'DDPG'  # 算法名称
-env_name = 'Pendulum-v1'  # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
-class DDPGConfig:
+class Config:
+    '''超参数
+    '''
     def __init__(self):
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DDPG'  # 算法名称
+        self.env_name = 'Pendulum-v1'  # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.seed = 10  # 随机种子置0则不设置随机种子
         self.train_eps = 300  # 训练的回合数
         self.test_eps = 50  # 测试的回合数
+        ################################################################################
+        ################################## 算法超参数 ###################################
         self.gamma = 0.99  # 折扣因子
         self.critic_lr = 1e-3  # 评论家网络的学习率
         self.actor_lr = 1e-4  # 演员网络的学习率
@@ -43,39 +49,92 @@ class DDPGConfig:
         self.target_update = 2  # 目标网络的更新频率
         self.hidden_dim = 256  # 网络隐藏层维度
         self.soft_tau = 1e-2  # 软更新参数
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.result_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/results/'  # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" + self.env_name + \
-            '/'+curr_time+'/models/'  # 保存模型的路径
+        ################################################################################
+        ################################# 保存结果相关参数 ################################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
         self.save = True  # 是否保存图片
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################################################################
 def env_agent_config(cfg,seed=1):
     env = NormalizedActions(gym.make(cfg.env_name))  # 装饰action噪声
     env.seed(seed)  # 随机种子
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
-    agent = DDPG(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = DDPG(n_states,n_actions,cfg)
     return env,agent
+def train(cfg, env, agent):
+    print('开始训练!')
+    print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
+    ou_noise = OUNoise(env.action_space)  # 动作噪声
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        ou_noise.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            action = ou_noise.get_action(action, i_step)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            agent.update()
+            state = next_state
+        if (i_ep+1)%10 == 0:
+            print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练!')
+    return rewards, ma_rewards
-cfg = DDPGConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path)
-agent.save(path=plot_cfg.model_path)
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)
-rewards,ma_rewards = test(plot_cfg,env,agent)
-save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+def test(cfg, env, agent):
+    print('开始测试!')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = next_state
+        print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
+    print('完成测试!')
+    return rewards, ma_rewards
+if __name__ == "__main__":
+    cfg = Config()
+    # 训练
+    env,agent = env_agent_config(cfg,seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    # 测试
+    env,agent = env_agent_config(cfg,seed=10)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = test(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果


@@ -1,64 +0,0 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
from DDPG.env import OUNoise
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards, ma_rewards


@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F
 class FCN(nn.Module):
-    def __init__(self, state_dim=4, action_dim=18):
+    def __init__(self, n_states=4, n_actions=18):
         """ 初始化q网络为全连接网络
-            state_dim: 输入的feature即环境的state数目
-            action_dim: 输出的action总个数
+            n_states: 输入的feature即环境的state数目
+            n_actions: 输出的action总个数
         """
         super(FCN, self).__init__()
-        self.fc1 = nn.Linear(state_dim, 128)  # 输入层
+        self.fc1 = nn.Linear(n_states, 128)  # 输入层
         self.fc2 = nn.Linear(128, 128)  # 隐藏层
-        self.fc3 = nn.Linear(128, action_dim)  # 输出层
+        self.fc3 = nn.Linear(128, n_actions)  # 输出层
     def forward(self, x):
         # 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
         x = F.relu(self.fc2(x))
         return self.fc3(x)
 ```
-输入为state_dim输出为action_dim包含一个128维度的隐藏层这里根据需要可增加隐藏层维度和数量然后一般使用relu激活函数这里跟深度学习的网路设置是一样的。
+输入为n_states,输出为n_actions包含一个128维度的隐藏层这里根据需要可增加隐藏层维度和数量然后一般使用relu激活函数这里跟深度学习的网路设置是一样的。
 ### Replay Buffer
@@ -107,8 +107,8 @@ class ReplayBuffer:
 在类中建立两个网络以及optimizer和memory
 ```python
-self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # copy params from policy net
     target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
     if random.random() > self.epsilon(self.frame_idx):
         action = self.predict(state)
     else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
     return action
 ```
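The `self.epsilon(self.frame_idx)` call used above corresponds to the exponentially decaying schedule defined elsewhere in this commit:

$$\epsilon(t) = \epsilon_{\text{end}} + (\epsilon_{\text{start}} - \epsilon_{\text{end}})\, e^{-t/\text{epsilon\_decay}}$$

so exploration starts near `epsilon_start` and decays towards `epsilon_end` as the frame index $t$ grows.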


@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-12-22 14:01:37
+LastEditTime: 2022-03-02 11:05:11
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -20,22 +20,7 @@ import random
 import math
 import numpy as np
-class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
-        """ 初始化q网络为全连接网络
-            state_dim: 输入的特征数即环境的状态维度
-            action_dim: 输出的动作维度
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim)  # 输入层
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim)  # 输出层
-    def forward(self, x):
-        # 各层对应的激活函数
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
 class ReplayBuffer:
     def __init__(self, capacity):
@@ -62,9 +47,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
-        self.action_dim = action_dim  # 总的动作个数
+    def __init__(self, n_actions,model,cfg):
+        self.n_actions = n_actions  # 总的动作个数
         self.device = cfg.device  # 设备cpu或gpu等
         self.gamma = cfg.gamma  # 奖励的折扣因子
         # e-greedy策略相关参数
@@ -73,8 +58,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = model.to(self.device)
+        self.target_net = model.to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # 复制参数到目标网路targe_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # 优化器
@@ -86,23 +71,24 @@ class DQN:
         self.frame_idx += 1
         if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
-                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                 q_values = self.policy_net(state)
                 action = q_values.max(1)[1].item()  # 选择Q值最大的动作
         else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时不更新策略
             return
         # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        # print('updating')
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
             self.batch_size)
-        # 转为张量
-        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
         action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
         reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
-        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
         done_batch = torch.tensor(np.float32(done_batch), device=self.device)
         q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # 计算当前状态(s_t,a)对应的Q(s_t, a)
         next_q_values = self.target_net(next_state_batch).max(1)[0].detach()  # 计算下一时刻的状态(s_t_,a)对应的Q值
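For reference, the two lines at the end of this hunk feed the standard DQN target that the rest of `update` (shown in full in `dqn_cnn2.py` below) computes:

$$y = r + \gamma\,(1 - \text{done})\,\max_{a'} Q_{\text{target}}(s', a'),\qquad \mathcal{L} = \operatorname{MSE}\big(Q_{\text{policy}}(s, a),\, y\big)$$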


@@ -70,9 +70,9 @@ class ReplayBuffer:
         return len(self.buffer)
 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
-        self.action_dim = action_dim  # 总的动作个数
+    def __init__(self, n_states, n_actions, cfg):
+        self.n_actions = n_actions  # 总的动作个数
         self.device = cfg.device  # 设备cpu或gpu等
         self.gamma = cfg.gamma  # 奖励的折扣因子
         # e-greedy策略相关参数
@@ -81,8 +81,8 @@ class DQN:
             (cfg.epsilon_start - cfg.epsilon_end) * \
             math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = CNN(state_dim, action_dim).to(self.device)
-        self.target_net = CNN(state_dim, action_dim).to(self.device)
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
         for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()):  # 复制参数到目标网路targe_net
             target_param.data.copy_(param.data)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)  # 优化器
@@ -94,11 +94,12 @@ class DQN:
         self.frame_idx += 1
         if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
+                print(type(state))
                 state = torch.tensor([state], device=self.device, dtype=torch.float32)
                 q_values = self.policy_net(state)
                 action = q_values.max(1)[1].item()  # 选择Q值最大的动作
         else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
         return action
     def update(self):
         if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时不更新策略

codes/DQN/dqn_cnn2.py (new file, 142 lines)

@@ -0,0 +1,142 @@
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
import math
import numpy as np
class CNN(nn.Module):
def __init__(self, n_frames, n_actions):
super(CNN,self).__init__()
self.n_frames = n_frames
self.n_actions = n_actions
# Layers
self.conv1 = nn.Conv2d(
in_channels=n_frames,
out_channels=16,
kernel_size=8,
stride=4,
padding=2
)
self.conv2 = nn.Conv2d(
in_channels=16,
out_channels=32,
kernel_size=4,
stride=2,
padding=1
)
self.fc1 = nn.Linear(
in_features=3200,
out_features=256,
)
self.fc2 = nn.Linear(
in_features=256,
out_features=n_actions,
)
# Activation Functions
self.relu = nn.ReLU()
def flatten(self, x):
batch_size = x.size()[0]
x = x.view(batch_size, -1)
return x
def forward(self, x):
# Forward pass
x = self.relu(self.conv1(x)) # In: (80, 80, 4) Out: (20, 20, 16)
x = self.relu(self.conv2(x)) # In: (20, 20, 16) Out: (10, 10, 32)
x = self.flatten(x) # In: (10, 10, 32) Out: (3200,)
x = self.relu(self.fc1(x)) # In: (3200,) Out: (256,)
x = self.fc2(x) # In: (256,) Out: (4,)
return x
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class DQN:
def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma # 奖励的折扣因子
# e-greedy策略相关参数
self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + \
(cfg.epsilon_start - cfg.epsilon_end) * \
math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.batch_size = cfg.batch_size
self.policy_net = CNN(n_states, n_actions).to(self.device)
self.target_net = CNN(n_states, n_actions).to(self.device)
for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
target_param.data.copy_(param.data)
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
def choose_action(self, state):
''' 选择动作
'''
self.frame_idx += 1
if random.random() > self.epsilon(self.frame_idx):
with torch.no_grad():
state = torch.tensor([state], device=self.device, dtype=torch.float32)
q_values = self.policy_net(state)
action = q_values.max(1)[1].item() # 选择Q值最大的动作
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略
return
# 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
self.batch_size)
# 转为张量
state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch), device=self.device)
q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
# 计算期望的Q值对于终止状态此时done_batch[0]=1, 对应的expected_q_value等于reward
expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失
# 优化更新模型
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters(): # clip防止梯度爆炸
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def save(self, path):
torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
def load(self, path):
self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
param.data.copy_(target_param.data)
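The `in_features=3200` of `fc1` follows from the usual conv output-size formula $\lfloor (n + 2p - k)/s \rfloor + 1$: an $80\times80$ input with `kernel_size=8, stride=4, padding=2` gives $20\times20$, the second conv with `kernel_size=4, stride=2, padding=1` gives $10\times10$, and flattening the $32$ channels yields $32 \times 10 \times 10 = 3200$, matching the shape comments in `forward`.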

Binary file added (image, 28 KiB; not shown)

Binary file added (image, 41 KiB; not shown)


@@ -1,5 +1,7 @@
 import sys
 import os
+import torch.nn as nn
+import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
 import torch
 import datetime
 import numpy as np
-from common.utils import save_results, make_dir
+from common.utils import save_results_1, make_dir
 from common.utils import plot_rewards
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+class MLP(nn.Module):
+    def __init__(self, n_states,n_actions,hidden_dim=128):
+        """ 初始化q网络为全连接网络
+            n_states: 输入的特征数即环境的状态维度
+            n_actions: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # 输出层
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
 class Config:
     '''超参数
     '''
     def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DQN'  # 算法名称
-        self.env_name = 'CartPole-v0'  # 环境名称
+        ############################### hyperparameters ################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
         self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
         self.seed = 10  # 随机种子置0则不设置随机种子
         self.train_eps = 200  # 训练的回合数
-        self.test_eps = 30  # 测试的回合数
+        self.test_eps = 20  # 测试的回合数
         ################################################################################
         ################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
         self.target_update = 4  # 目标网络的更新频率
         self.hidden_dim = 256  # 网络隐藏层
         ################################################################################
-        ################################# 保存结果相关参数 ##############
+        ################################# 保存结果相关参数 ################################
         self.result_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/results/'  # 保存结果的路径
         self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
     ''' 创建环境和智能体
     '''
     env = gym.make(cfg.env_name)  # 创建环境
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)
+    agent = DQN(n_actions, model, cfg)  # 创建智能体
     if cfg.seed !=0:  # 设置随机种子
         torch.manual_seed(cfg.seed)
         env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
     rewards = []  # 记录所有回合的奖励
     ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
     for i_ep in range(cfg.train_eps):
         ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
         state = env.reset()  # 重置环境,返回初始状态
         while True:
+            ep_step += 1
             action = agent.choose_action(state)  # 选择动作
             next_state, reward, done, _ = env.step(action)  # 更新环境返回transition
             agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
                 break
         if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
             agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
         else:
             ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 10 == 0:
-            print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
-    print('完成训练!')
+        if (i_ep + 1) % 1 == 0:
+            print(f'Episode{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('Finish training!')
     env.close()
-    return rewards, ma_rewards
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic
 def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
     ################################################################################
     rewards = []  # 记录所有回合的奖励
     ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
     for i_ep in range(cfg.test_eps):
         ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
         state = env.reset()  # 重置环境,返回初始状态
         while True:
+            ep_step+=1
             action = agent.choose_action(state)  # 选择动作
             next_state, reward, done, _ = env.step(action)  # 更新环境返回transition
             state = next_state  # 更新下一个状态
             ep_reward += reward  # 累加奖励
             if done:
                 break
+        steps.append(ep_step)
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
         else:
             ma_rewards.append(ep_reward)
-        print(f"回合{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
+        print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
     print('完成测试!')
     env.close()
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
 if __name__ == "__main__":
     cfg = Config()
     # 训练
     env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
     agent.save(path=cfg.model_path)  # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
-                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    save_results_1(res_dic, tag='train',
+                   path=cfg.result_path)  # 保存结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # 画出结果
     # 测试
     env, agent = env_agent_config(cfg)
     agent.load(path=cfg.model_path)  # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
-                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
+    res_dic = test(cfg, env, agent)
+    save_results_1(res_dic, tag='test',
+                   path=cfg.result_path)  # 保存结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # 画出结果


@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 11:40:44
+LastEditTime: 2022-02-10 06:17:41
 Discription: 使用 Nature DQN 训练 CartPole-v1
 '''
 import sys
@@ -19,7 +19,7 @@ import torch
 import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = "DQN"  # 算法名称
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
     '''
     env = gym.make(cfg.env_name)  # 创建环境
     env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
     return env, agent
 def train(cfg, env, agent):


@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 15:27:48
+LastEditTime: 2022-02-10 06:17:46
 Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4
 '''
 import sys
@@ -20,7 +20,7 @@ import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
 from common.atari_wrappers import make_atari, wrap_deepmind
-from DQN.dqn import DQN
+from dqn import DQN
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = 'DQN-cnn'  # 算法名称
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
     # env = wrap_deepmind(env)
     # env = wrap_pytorch(env)
     env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
     return env, agent
 def train(cfg, env, agent):

codes/DQN/task4.py (new file, 180 lines)

@@ -0,0 +1,180 @@
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results_1, make_dir
from common.utils import plot_rewards
from dqn_1 import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class MLP(nn.Module):
def __init__(self, n_states,n_actions,hidden_dim=256):
""" 初始化q网络为全连接网络
n_states: 输入的特征数即环境的状态维度
n_actions: 输出的动作维度
"""
super(MLP, self).__init__()
self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc4 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x):
# 各层对应的激活函数
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
return self.fc4(x)
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
# self.env_name = 'Breakout-ram-v0' # 环境名称
self.env_name = 'ALE/Pong-ram-v5'
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 5 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 500000 # e-greedy策略中epsilon的衰减率
self.lr = 0.00025 # 学习率
self.memory_capacity = int(5e4) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
model = MLP(n_states,n_actions)
agent = DQN(n_states, n_actions, model, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
ep_step = 0
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
return res_dic
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = []
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
ep_step = 0
state = env.reset() # 重置环境,返回初始状态
while True:
ep_step+=1
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
steps.append(ep_step)
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results_1(res_dic, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
res_dic = test(cfg, env, agent)
save_results_1(res_dic, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果

codes/DQN/task5.py (new file, 149 lines)

@@ -0,0 +1,149 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
import numpy as np
from common.utils import save_results, make_dir
from common.utils import plot_rewards
from dqn import DQN
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = 'DQN' # 算法名称
self.env_name = 'SpaceInvaders-ram-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.gamma = 0.99 # 强化学习中的折扣因子
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
self.epsilon_decay = 20000 # e-greedy策略中epsilon的衰减率
self.lr = 2e-4 # 学习率
self.memory_capacity = int(1e5) # 经验回放的容量
self.batch_size = 32 # mini-batch SGD中的批量大小
self.target_update = 4 # 目标网络的更新频率
self.hidden_dim = 512 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
n_actions = env.action_space.n # 动作维度
print(f"n states: {n_states}, n actions: {n_actions}")
agent = DQN(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def train(cfg, env, agent):
''' 训练
'''
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
agent.memory.push(state, action, reward,
next_state, done) # 保存transition
state = next_state # 更新下一个状态
agent.update() # 更新智能体
ep_reward += reward # 累加奖励
if done:
break
if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新
agent.target_net.load_state_dict(agent.policy_net.state_dict())
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep + 1) % 1 == 0:
print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
############# 由于测试不需要使用epsilon-greedy策略所以相应的值设置为0 ###############
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
################################################################################
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
ep_reward = 0 # 记录一回合内的奖励
state = env.reset() # 重置环境,返回初始状态
while True:
action = agent.choose_action(state) # 选择动作
next_state, reward, done, _ = env.step(action) # 更新环境返回transition
state = next_state # 更新下一个状态
ep_reward += reward # 累加奖励
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

codes/DQN/test copy.py (new file, 184 lines)

@@ -0,0 +1,184 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
import matplotlib.pyplot as plt
class DQN:
def __init__(self, env):
self.env = env
self.memory = deque(maxlen=400000)
self.gamma = 0.99
self.epsilon = 1.0
self.epsilon_min = 0.01
self.epsilon_decay = self.epsilon_min / 500000
self.batch_size = 32
self.train_start = 1000
self.state_size = self.env.observation_space.shape[0]*4
self.action_size = self.env.action_space.n
self.learning_rate = 0.00025
self.evaluation_model = self.create_model()
self.target_model = self.create_model()
def create_model(self):
model = Sequential()
model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(128*2, activation='relu'))
model.add(Dense(self.env.action_space.n, activation='linear'))
model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))
return model
def choose_action(self, state, steps):
if steps > 50000:
if self.epsilon > self.epsilon_min:
self.epsilon -= self.epsilon_decay
if np.random.random() < self.epsilon:
return self.env.action_space.sample()
return np.argmax(self.evaluation_model.predict(state)[0])
def remember(self, cur_state, action, reward, new_state, done):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = (cur_state, action, reward, new_state, done)
self.memory.extend([transition])
self.memory_counter += 1
def replay(self):
if len(self.memory) < self.train_start:
return
mini_batch = random.sample(self.memory, self.batch_size)
update_input = np.zeros((self.batch_size, self.state_size))
update_target = np.zeros((self.batch_size, self.action_size))
for i in range(self.batch_size):
state, action, reward, new_state, done = mini_batch[i]
target = self.evaluation_model.predict(state)[0]
if done:
target[action] = reward
else:
target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])
update_input[i] = state
update_target[i] = target
self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
def target_train(self):
self.target_model.set_weights(self.evaluation_model.get_weights())
return
def visualize(self, reward, episode):
plt.plot(episode, reward, 'ob-')
plt.title('Average reward each 100 episode')
plt.ylabel('Reward')
plt.xlabel('Episodes')
plt.grid()
plt.show()
def transform(self,state):
if state.shape[1]==512:
return state
a=[np.binary_repr(x,width=8) for x in state[0]]
res=[]
for x in a:
res.extend([x[:2],x[2:4],x[4:6],x[6:]])
res=[int(x,2) for x in res]
return np.array(res)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def main():
# env = gym.make('Breakout-ram-v0')
env = gym.make('Breakout-ram-v0')
env = env.unwrapped
print(env.action_space)
print(env.observation_space.shape[0])
print(env.observation_space.high)
print(env.observation_space.low)
#print(env.observation_space.shape)
episodes = 5000
trial_len = 10000
tmp_reward=0
sum_rewards = 0
n_success = 0
total_steps = 0
graph_reward = []
graph_episodes = []
time_record = []
dqn_agent = DQN(env=env)
for i_episode in range(episodes):
start_time = time.time()
total_reward = 0
cur_state = env.reset().reshape(1,128)
cur_state=dqn_agent.transform(cur_state).reshape(1,128*4)/4
i_step=0
for step in range(trial_len):
#env.render()
i_step+=1
action = dqn_agent.choose_action(cur_state, total_steps)
new_state, reward, done, _ = env.step(action)
new_state = new_state.reshape(1, 128)
new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
total_reward += reward
sum_rewards += reward
tmp_reward += reward
if reward>0: #Testing whether it is good.
reward=1
dqn_agent.remember(cur_state, action, reward, new_state, done)
if total_steps > 10000:
if total_steps%4 == 0:
dqn_agent.replay()
if total_steps%5000 == 0:
dqn_agent.target_train()
cur_state = new_state
total_steps += 1
if done:
env.reset()
break
if (i_episode+1) % 100 == 0:
graph_reward.append(sum_rewards/100)
graph_episodes.append(i_episode+1)
sum_rewards = 0
print("Episode ",i_episode+1," Reward: ")
print(graph_reward[-1])
end_time = time.time()
time_record.append(end_time-start_time)
print("NOW in episode: " + str(i_episode))
print("Time cost: " + str(end_time-start_time))
print("Reward: ",tmp_reward)
print("Step:", i_step)
tmp_reward=0
print("Reward: ")
print(graph_reward)
print("Episode: ")
print(graph_episodes)
print("Average_time: ")
print(sum(time_record)/5000)
dqn_agent.visualize(graph_reward, graph_episodes)
if __name__ == '__main__':
main()


@@ -90,15 +90,15 @@ class OUNoise(object):
         self.max_sigma = max_sigma
         self.min_sigma = min_sigma
         self.decay_period = decay_period
-        self.action_dim = action_space.shape[0]
+        self.n_actions = action_space.shape[0]
         self.low = action_space.low
         self.high = action_space.high
         self.reset()
     def reset(self):
-        self.obs = np.ones(self.action_dim) * self.mu
+        self.obs = np.ones(self.n_actions) * self.mu
     def evolve_obs(self):
         x = self.obs
-        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
         self.obs = x + dx
         return self.obs
     def get_action(self, action, t=0):


@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境如下图它通过向左(动作=0
 import gym
 env = gym.make('CartPole-v0')  # 建立环境
 env.seed(1)  # 随机种子
-state_dim = env.observation_space.shape[0]  # 状态维度
-action_dim = env.action_space.n  # 动作维度
+n_states = env.observation_space.shape[0]  # 状态维度
+n_actions = env.action_space.n  # 动作维度
 state = env.reset()  # 初始化环境
-print(f"状态维度:{state_dim},动作维度:{action_dim}")
+print(f"状态维度:{n_states},动作维度:{n_actions}")
 print(f"初始状态:{state}")
 ```
@@ -157,7 +157,7 @@ def choose_action(self, state):
         q_values = self.policy_net(state)
         action = q_values.max(1)[1].item()  # 选择Q值最大的动作
     else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
 ```
 可以看到跟Q学习算法其实是一样的都是用的$\epsilon-greedy$策略只是使用神经网络的话我们需要通过Torch或者Tensorflow工具来处理相应的数据。


@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
这里我们在程序中使用了一个装饰器重新定义环境但不影响对环境的理解感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可然后我们可以查看环境的状态和动作维度目 这里我们在程序中使用了一个装饰器重新定义环境但不影响对环境的理解感兴趣的同学具体看相关代码。可以由于gym环境封装得比较好所以我们想要使用这个环境只需要使用gym.make命令输入函数名即可然后我们可以查看环境的状态和动作维度目
```python ```python
state_dim = env.observation_space.n # 状态维度 n_states = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度 n_actions = env.action_space.n # 动作维度
print(f"状态维度:{state_dim},动作维度:{action_dim}") print(f"状态维度:{n_states},动作维度:{n_actions}")
``` ```
打印出来的结果如下: 打印出来的结果如下:
@@ -72,9 +72,9 @@ print(state)
env = gym.make('CliffWalking-v0') # 定义环境 env = gym.make('CliffWalking-v0') # 定义环境
env = CliffWalkingWapper(env) # 装饰环境 env = CliffWalkingWapper(env) # 装饰环境
env.seed(1) # 设置随机种子 env.seed(1) # 设置随机种子
state_dim = env.observation_space.n # 状态维度 n_states = env.observation_space.n # 状态维度
action_dim = env.action_space.n # 动作维度 n_actions = env.action_space.n # 动作维度
agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数 agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数 for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
ep_reward = 0 # 记录每个回合的奖励 ep_reward = 0 # 记录每个回合的奖励
state = env.reset() # 重置环境 state = env.reset() # 重置环境
@@ -126,7 +126,7 @@ def choose_action(self, state):
if np.random.uniform(0, 1) > self.epsilon: if np.random.uniform(0, 1) > self.epsilon:
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作 action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
else: else:
action = np.random.choice(self.action_dim) # 随机选择动作 action = np.random.choice(self.n_actions) # 随机选择动作
return action return action
``` ```
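The hunks above only show action selection; the learning step of tabular Q-learning is the temporal-difference update sketched below (hedged; `self.lr` and `self.gamma` are assumed to come from cfg, and `Q_table` is keyed by `str(state)` as in the snippet).

```python
import numpy as np

def update(self, state, action, reward, next_state, done):
    q_predict = self.Q_table[str(state)][action]
    if done:  # terminal state: no bootstrapping
        q_target = reward
    else:
        q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
    # move Q(s, a) towards the TD target; self.lr is the learning rate (assumed)
    self.Q_table[str(state)][action] += self.lr * (q_target - q_predict)
```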

View File

@@ -46,15 +46,15 @@ class ReplayBuffer:
return len(self.buffer) return len(self.buffer)
class MLP(nn.Module): class MLP(nn.Module):
def __init__(self, state_dim,action_dim,hidden_dim=128): def __init__(self, n_states,n_actions,hidden_dim=128):
""" 初始化q网络为全连接网络 """ 初始化q网络为全连接网络
state_dim: 输入的特征数即环境的状态维度 n_states: 输入的特征数即环境的状态维度
action_dim: 输出的动作维度 n_actions: 输出的动作维度
""" """
super(MLP, self).__init__() super(MLP, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层 self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层
self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层 self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层
def forward(self, x): def forward(self, x):
# 各层对应的激活函数 # 各层对应的激活函数
@@ -63,8 +63,8 @@ class MLP(nn.Module):
return self.fc3(x) return self.fc3(x)
class DoubleDQN: class DoubleDQN:
def __init__(self, state_dim, action_dim, cfg): def __init__(self, n_states, n_actions, cfg):
self.action_dim = action_dim # 总的动作个数 self.n_actions = n_actions # 总的动作个数
self.device = cfg.device # 设备cpu或gpu等 self.device = cfg.device # 设备cpu或gpu等
self.gamma = cfg.gamma self.gamma = cfg.gamma
# e-greedy策略相关参数 # e-greedy策略相关参数
@@ -73,8 +73,8 @@ class DoubleDQN:
self.epsilon_end = cfg.epsilon_end self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay self.epsilon_decay = cfg.epsilon_decay
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device) self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
# target_net copy from policy_net # target_net copy from policy_net
for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()): for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
target_param.data.copy_(param.data) target_param.data.copy_(param.data)
@@ -103,7 +103,7 @@ class DoubleDQN:
# 所以tensor.max(1)[1]返回最大值对应的下标即action # 所以tensor.max(1)[1]返回最大值对应的下标即action
action = q_value.max(1)[1].item() action = q_value.max(1)[1].item()
else: else:
action = random.randrange(self.action_dim) action = random.randrange(self.n_actions)
return action return action
def update(self): def update(self):
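The body of `update` is cut off here; its defining step is the Double DQN target, where the policy network selects the next action and the target network evaluates it. A minimal sketch with hypothetical tensor names:

```python
import torch

def double_dqn_target(policy_net, target_net, rewards, next_states, dones, gamma):
    with torch.no_grad():
        next_actions = policy_net(next_states).max(1)[1].unsqueeze(1)        # argmax from the policy net
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)  # evaluated by the target net
    return rewards + gamma * next_q * (1 - dones)                            # no bootstrap at terminal states
```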

View File

@@ -59,9 +59,9 @@ class Config:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.n n_actions = env.action_space.n
agent = DoubleDQN(state_dim,action_dim,cfg) agent = DoubleDQN(n_states,n_actions,cfg)
return env,agent return env,agent
def train(cfg,env,agent): def train(cfg,env,agent):

View File

@@ -136,12 +136,12 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"class DuelingNet(nn.Module):\n", "class DuelingNet(nn.Module):\n",
" def __init__(self, state_dim, action_dim,hidden_size=128):\n", " def __init__(self, n_states, n_actions,hidden_size=128):\n",
" super(DuelingNet, self).__init__()\n", " super(DuelingNet, self).__init__()\n",
" \n", " \n",
" # 隐藏层\n", " # 隐藏层\n",
" self.hidden = nn.Sequential(\n", " self.hidden = nn.Sequential(\n",
" nn.Linear(state_dim, hidden_size),\n", " nn.Linear(n_states, hidden_size),\n",
" nn.ReLU()\n", " nn.ReLU()\n",
" )\n", " )\n",
" \n", " \n",
@@ -149,7 +149,7 @@
" self.advantage = nn.Sequential(\n", " self.advantage = nn.Sequential(\n",
" nn.Linear(hidden_size, hidden_size),\n", " nn.Linear(hidden_size, hidden_size),\n",
" nn.ReLU(),\n", " nn.ReLU(),\n",
" nn.Linear(hidden_size, action_dim)\n", " nn.Linear(hidden_size, n_actions)\n",
" )\n", " )\n",
" \n", " \n",
" # 价值函数\n", " # 价值函数\n",
@@ -192,7 +192,7 @@
], ],
"source": [ "source": [
"class DuelingDQN:\n", "class DuelingDQN:\n",
" def __init__(self,state_dim,action_dim,cfg) -> None:\n", " def __init__(self,n_states,n_actions,cfg) -> None:\n",
" self.batch_size = cfg.batch_size\n", " self.batch_size = cfg.batch_size\n",
" self.device = cfg.device\n", " self.device = cfg.device\n",
" self.loss_history = [] # 记录loss的变化\n", " self.loss_history = [] # 记录loss的变化\n",
@@ -200,8 +200,8 @@
" self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n", " self.epsilon = lambda frame_idx: cfg.epsilon_end + \\\n",
" (cfg.epsilon_start - cfg.epsilon_end) * \\\n", " (cfg.epsilon_start - cfg.epsilon_end) * \\\n",
" math.exp(-1. * frame_idx / cfg.epsilon_decay)\n", " math.exp(-1. * frame_idx / cfg.epsilon_decay)\n",
" self.policy_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " self.policy_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" self.target_net = DuelingNet(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)\n", " self.target_net = DuelingNet(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)\n",
" for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n", " for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网络targe_net\n",
" target_param.data.copy_(param.data)\n", " target_param.data.copy_(param.data)\n",
" self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n", " self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器\n",
@@ -214,7 +214,7 @@
" q_values = self.policy_net(state)\n", " q_values = self.policy_net(state)\n",
" action = q_values.max(1)[1].item() # 选择Q值最大的动作\n", " action = q_values.max(1)[1].item() # 选择Q值最大的动作\n",
" else:\n", " else:\n",
" action = random.randrange(self.action_dim)\n", " action = random.randrange(self.n_actions)\n",
" return action\n", " return action\n",
" def update(self):\n", " def update(self):\n",
" if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n", " if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时不更新策略\n",

View File

@@ -57,16 +57,16 @@ class MLP(nn.Module):
return self.fc3(x) return self.fc3(x)
class HierarchicalDQN: class HierarchicalDQN:
def __init__(self,state_dim,action_dim,cfg): def __init__(self,n_states,n_actions,cfg):
self.state_dim = state_dim self.n_states = n_states
self.action_dim = action_dim self.n_actions = n_actions
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.device = cfg.device self.device = cfg.device
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
self.frame_idx = 0 # 用于epsilon的衰减计数 self.frame_idx = 0 # 用于epsilon的衰减计数
self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay) self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device) self.policy_net = MLP(2*n_states, n_actions,cfg.hidden_dim).to(self.device)
self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device) self.meta_policy_net = MLP(n_states, n_states,cfg.hidden_dim).to(self.device)
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr) self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr) self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
self.memory = ReplayBuffer(cfg.memory_capacity) self.memory = ReplayBuffer(cfg.memory_capacity)
@@ -76,7 +76,7 @@ class HierarchicalDQN:
self.losses = [] self.losses = []
self.meta_losses = [] self.meta_losses = []
def to_onehot(self,x): def to_onehot(self,x):
oh = np.zeros(self.state_dim) oh = np.zeros(self.n_states)
oh[x - 1] = 1. oh[x - 1] = 1.
return oh return oh
def set_goal(self,state): def set_goal(self,state):
@@ -85,7 +85,7 @@ class HierarchicalDQN:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0) state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
goal = self.meta_policy_net(state).max(1)[1].item() goal = self.meta_policy_net(state).max(1)[1].item()
else: else:
goal = random.randrange(self.state_dim) goal = random.randrange(self.n_states)
return goal return goal
def choose_action(self,state): def choose_action(self,state):
self.frame_idx += 1 self.frame_idx += 1
@@ -95,7 +95,7 @@ class HierarchicalDQN:
q_value = self.policy_net(state) q_value = self.policy_net(state)
action = q_value.max(1)[1].item() action = q_value.max(1)[1].item()
else: else:
action = random.randrange(self.action_dim) action = random.randrange(self.n_actions)
return action return action
def update(self): def update(self):
self.update_policy() self.update_policy()
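The controller network takes `2*n_states` inputs because it acts on the environment state concatenated with a one-hot encoding of the goal chosen by the meta-controller. A hedged usage sketch (variable names are hypothetical):

```python
import numpy as np

goal = agent.set_goal(state)                        # goal index from the meta-controller
onehot_goal = agent.to_onehot(goal)                 # shape (n_states,)
joint_state = np.concatenate([state, onehot_goal])  # shape (2 * n_states,), fed to policy_net
action = agent.choose_action(joint_state)
```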

View File

@@ -63,9 +63,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.n n_actions = env.action_space.n
agent = HierarchicalDQN(state_dim,action_dim,cfg) agent = HierarchicalDQN(n_states,n_actions,cfg)
return env,agent return env,agent
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2020 John Jim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -17,11 +17,11 @@ import dill
class FisrtVisitMC: class FisrtVisitMC:
''' On-Policy First-Visit MC Control ''' On-Policy First-Visit MC Control
''' '''
def __init__(self,action_dim,cfg): def __init__(self,n_actions,cfg):
self.action_dim = action_dim self.n_actions = n_actions
self.epsilon = cfg.epsilon self.epsilon = cfg.epsilon
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.Q_table = defaultdict(lambda: np.zeros(action_dim)) self.Q_table = defaultdict(lambda: np.zeros(n_actions))
self.returns_sum = defaultdict(float) # sum of returns self.returns_sum = defaultdict(float) # sum of returns
self.returns_count = defaultdict(float) self.returns_count = defaultdict(float)
@@ -29,11 +29,11 @@ class FisrtVisitMC:
''' e-greedy policy ''' ''' e-greedy policy '''
if state in self.Q_table.keys(): if state in self.Q_table.keys():
best_action = np.argmax(self.Q_table[state]) best_action = np.argmax(self.Q_table[state])
action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon) action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs) action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
else: else:
action = np.random.randint(0,self.action_dim) action = np.random.randint(0,self.n_actions)
return action return action
def update(self,one_ep_transition): def update(self,one_ep_transition):
# Find all (state, action) pairs we've visited in this one_ep_transition # Find all (state, action) pairs we've visited in this one_ep_transition
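The body of `update` is cut off at the comment above; a first-visit MC control update typically looks like the hedged sketch below, assuming `one_ep_transition` is a list of `(state, action, reward)` tuples.

```python
def update(self, one_ep_transition):
    # every (state, action) pair visited in this episode
    sa_in_episode = set((s, a) for s, a, _ in one_ep_transition)
    for state, action in sa_in_episode:
        sa_pair = (state, action)
        # index of the first visit to this pair
        first_idx = next(i for i, (s, a, _) in enumerate(one_ep_transition)
                         if s == state and a == action)
        # discounted return following the first visit
        G = sum(r * (self.gamma ** k)
                for k, (_, _, r) in enumerate(one_ep_transition[first_idx:]))
        self.returns_sum[sa_pair] += G
        self.returns_count[sa_pair] += 1.0
        self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
```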

View File

@@ -43,8 +43,8 @@ class MCConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = RacetrackEnv() env = RacetrackEnv()
action_dim = 9 n_actions = 9
agent = FisrtVisitMC(action_dim, cfg) agent = FisrtVisitMC(n_actions, cfg)
return env,agent return env,agent
def train(cfg, env, agent): def train(cfg, env, agent):

View File

@@ -57,16 +57,16 @@ model就是actor和critic两个网络了
import torch.nn as nn import torch.nn as nn
from torch.distributions.categorical import Categorical from torch.distributions.categorical import Categorical
class Actor(nn.Module): class Actor(nn.Module):
def __init__(self,state_dim, action_dim, def __init__(self,n_states, n_actions,
hidden_dim=256): hidden_dim=256):
super(Actor, self).__init__() super(Actor, self).__init__()
self.actor = nn.Sequential( self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(n_states, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim), nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, action_dim), nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1) nn.Softmax(dim=-1)
) )
def forward(self, state): def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
return dist return dist
class Critic(nn.Module): class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim=256): def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__() super(Critic, self).__init__()
self.critic = nn.Sequential( self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim), nn.Linear(n_states, hidden_dim),
nn.ReLU(), nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim), nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(), nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
value = self.critic(state) value = self.critic(state)
return value return value
``` ```
这里Actor就是得到一个概率分布(Categorical,也可以是别的分布,可以搜索torch distributions),critic根据当前状态得到一个值,这里的输入维度可以是```state_dim+action_dim```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。 这里Actor就是得到一个概率分布(Categorical,也可以是别的分布,可以搜索torch distributions),critic根据当前状态得到一个值,这里的输入维度可以是```n_states+n_actions```,即将action信息也纳入critic网络中,这样会更好一些,感兴趣的小伙伴可以试试。
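For readers who want to try the state-plus-action critic mentioned above, a hedged sketch (the class name and the one-hot action encoding are assumptions, not the repo's code):

```python
import torch
import torch.nn as nn

class StateActionCritic(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super(StateActionCritic, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states + n_actions, hidden_dim),  # state and (one-hot) action concatenated
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    def forward(self, state, action_onehot):
        return self.critic(torch.cat([state, action_onehot], dim=-1))
```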
### PPO update ### PPO update
定义一个update函数主要实现伪代码中的第六步和第七步 定义一个update函数主要实现伪代码中的第六步和第七步
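Those two pseudocode steps boil down to recomputing advantages (for example via GAE) over the sampled batch and optimizing the clipped surrogate objective together with a value loss. A hedged sketch of the loss computation only, with hypothetical tensor names:

```python
import torch

def ppo_losses(new_log_probs, old_log_probs, advantages, returns, values, policy_clip=0.2):
    ratio = (new_log_probs - old_log_probs).exp()                # pi_theta(a|s) / pi_theta_old(a|s)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1 - policy_clip, 1 + policy_clip) * advantages
    actor_loss = -torch.min(surr1, surr2).mean()                 # clipped surrogate objective
    critic_loss = (returns - values).pow(2).mean()               # value regression
    return actor_loss + 0.5 * critic_loss
```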

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:30:46
LastEditor: John
LastEditTime: 2021-09-26 22:00:07
Discription:
Environment:
'''
import numpy as np
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []

View File

@@ -1,44 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 15:29:24
LastEditor: John
LastEditTime: 2021-04-08 22:36:43
Discription:
Environment:
'''
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,state_dim, action_dim,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, action_dim),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, state_dim,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(state_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value


View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-23 15:17:42 Date: 2021-03-23 15:17:42
LastEditor: John LastEditor: John
LastEditTime: 2021-09-26 22:02:00 LastEditTime: 2021-12-31 19:38:33
Discription: Discription:
Environment: Environment:
''' '''
@@ -13,25 +13,89 @@ import os
import numpy as np import numpy as np
import torch import torch
import torch.optim as optim import torch.optim as optim
from PPO.model import Actor,Critic import torch.nn as nn
from PPO.memory import PPOMemory from torch.distributions.categorical import Categorical
class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),np.array(self.actions),np.array(self.probs),\
np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []
class Actor(nn.Module):
def __init__(self,n_states, n_actions,
hidden_dim):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, n_states,hidden_dim):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value
class PPO: class PPO:
def __init__(self, state_dim, action_dim,cfg): def __init__(self, n_states, n_actions,cfg):
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.continuous = cfg.continuous self.continuous = cfg.continuous
self.policy_clip = cfg.policy_clip self.policy_clip = cfg.policy_clip
self.n_epochs = cfg.n_epochs self.n_epochs = cfg.n_epochs
self.gae_lambda = cfg.gae_lambda self.gae_lambda = cfg.gae_lambda
self.device = cfg.device self.device = cfg.device
self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device) self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device) self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
self.memory = PPOMemory(cfg.batch_size) self.memory = PPOMemory(cfg.batch_size)
self.loss = 0 self.loss = 0
def choose_action(self, state): def choose_action(self, state):
state = torch.tensor([state], dtype=torch.float).to(self.device) state = np.array([state]) # 先转成数组再转tensor更高效
state = torch.tensor(state, dtype=torch.float).to(self.device)
dist = self.actor(state) dist = self.actor(state)
value = self.critic(state) value = self.critic(state)
action = dist.sample() action = dist.sample()

View File

@@ -5,63 +5,127 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym import gym
import torch import torch
import numpy as np
import datetime import datetime
from common.plot import plot_rewards from common.utils import plot_rewards
from common.utils import save_results,make_dir from common.utils import save_results,make_dir
from PPO.agent import PPO from ppo2 import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig: class Config:
def __init__(self) -> None: def __init__(self) -> None:
self.algo = "DQN" # 算法名称 ################################## 环境超参数 ###################################
self.algo_name = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称 self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作 self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 200 # 训练的回合数 self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数 self.test_eps = 20 # 测试的回合数
self.batch_size = 5 ################################################################################
self.gamma=0.99
################################## 算法超参数 ####################################
self.batch_size = 5 # mini-batch SGD中的批量大小
self.gamma = 0.95 # 强化学习中的折扣因子
self.n_epochs = 4 self.n_epochs = 4
self.actor_lr = 0.0003 self.actor_lr = 0.0003 # actor的学习率
self.critic_lr = 0.0003 self.critic_lr = 0.0003 # critic的学习率
self.gae_lambda=0.95 self.gae_lambda = 0.95
self.policy_clip=0.2 self.policy_clip = 0.2
self.hidden_dim = 256 self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update self.update_fre = 20 # 策略更新频率
################################################################################
class PlotConfig:
def __init__(self) -> None: ################################# 保存结果相关参数 ################################
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \ self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径 '/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \ self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径 '/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片 self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg):
''' 创建环境和智能体
'''
env = gym.make(cfg.env_name) # 创建环境
n_states = env.observation_space.shape[0] # 状态维度
if cfg.continuous:
n_actions = env.action_space.shape[0] # 动作维度
else:
n_actions = env.action_space.n # 动作维度
agent = PPO(n_states, n_actions, cfg) # 创建智能体
if cfg.seed !=0: # 设置随机种子
torch.manual_seed(cfg.seed)
env.seed(cfg.seed)
np.random.seed(cfg.seed)
return env, agent
def env_agent_config(cfg,seed=1): def train(cfg,env,agent):
env = gym.make(cfg.env_name) print('开始训练!')
env.seed(seed) print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_dim = env.observation_space.shape[0] rewards = [] # 记录所有回合的奖励
action_dim = env.action_space.n ma_rewards = [] # 记录所有回合的滑动平均奖励
agent = PPO(state_dim,action_dim,cfg) steps = 0
return env,agent for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
cfg = PPOConfig() def test(cfg,env,agent):
plot_cfg = PlotConfig() print('开始测试!')
# 训练 print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
env,agent = env_agent_config(cfg,seed=1) rewards = [] # 记录所有回合的奖励
rewards, ma_rewards = train(cfg, env, agent) ma_rewards = [] # 记录所有回合的滑动平均奖励
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 for i_ep in range(cfg.test_eps):
agent.save(path=plot_cfg.model_path) state = env.reset()
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) done = False
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") ep_reward = 0
# 测试 while not done:
env,agent = env_agent_config(cfg,seed=10) action, prob, val = agent.choose_action(state)
agent.load(path=plot_cfg.model_path) state_, reward, done, _ = env.step(action)
rewards,ma_rewards = eval(cfg,env,agent) ep_reward += reward
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path) state = state_
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval") rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成训练!')
return rewards,ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env,agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg)
agent.load(path=cfg.model_path)
rewards,ma_rewards = test(cfg,env,agent)
save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,cfg,tag="test")

View File

@@ -6,10 +6,9 @@ sys.path.append(parent_path) # 添加路径到系统路径
import gym import gym
import torch import torch
import datetime import datetime
from common.plot import plot_rewards from common.utils import plot_rewards
from common.utils import save_results,make_dir from common.utils import save_results,make_dir
from PPO.agent import PPO from ppo2 import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
@@ -45,9 +44,9 @@ class PlotConfig:
def env_agent_config(cfg,seed=1): def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name) env = gym.make(cfg.env_name)
env.seed(seed) env.seed(seed)
state_dim = env.observation_space.shape[0] n_states = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] n_actions = env.action_space.shape[0]
agent = PPO(state_dim,action_dim,cfg) agent = PPO(n_states,n_actions,cfg)
return env,agent return env,agent

File diff suppressed because one or more lines are too long

View File

@@ -1,121 +0,0 @@
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
steps = 0
for i_ep in range(cfg.train_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
steps += 1
ep_reward += reward
agent.memory.push(state, action, prob, val, reward, done)
if steps % cfg.update_fre == 0:
agent.update()
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
if (i_ep+1)%10 == 0:
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
print('完成训练!')
return rewards,ma_rewards
def eval(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.test_eps):
state = env.reset()
done = False
ep_reward = 0
while not done:
action, prob, val = agent.choose_action(state)
state_, reward, done, _ = env.step(action)
ep_reward += reward
state = state_
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
print('完成训练!')
return rewards,ma_rewards
if __name__ == '__main__':
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from common.plot import plot_rewards
from common.utils import save_results,make_dir
from PPO.agent import PPO
from PPO.train import train
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PPOConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.continuous = False # 环境是否为连续动作
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 200 # 训练的回合数
self.test_eps = 20 # 测试的回合数
self.batch_size = 5
self.gamma=0.99
self.n_epochs = 4
self.actor_lr = 0.0003
self.critic_lr = 0.0003
self.gae_lambda=0.95
self.policy_clip=0.2
self.hidden_dim = 256
self.update_fre = 20 # frequency of agent update
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
agent = PPO(state_dim,action_dim,cfg)
return env,agent
cfg = PPOConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2021-03-23 16:35:58
LastEditor: John
LastEditTime: 2021-12-21 23:21:26
Discription:
Environment:
'''
import torch.nn as nn
import torch.nn.functional as F
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出:概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, action_dim的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x


View File

@@ -5,21 +5,41 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2020-11-22 23:27:44 Date: 2020-11-22 23:27:44
LastEditor: John LastEditor: John
LastEditTime: 2021-10-16 00:43:52 LastEditTime: 2022-02-10 01:25:27
Discription: Discription:
Environment: Environment:
''' '''
import torch import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Bernoulli from torch.distributions import Bernoulli
from torch.autograd import Variable from torch.autograd import Variable
import numpy as np import numpy as np
from PolicyGradient.model import MLP
class MLP(nn.Module):
''' 多层感知机
输入state维度
输出概率
'''
def __init__(self,input_dim,hidden_dim = 36):
super(MLP, self).__init__()
# 24和36为hidden layer的层数可根据input_dim, n_actions的情况来改变
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.sigmoid(self.fc3(x))
return x
class PolicyGradient: class PolicyGradient:
def __init__(self, state_dim,cfg): def __init__(self, n_states,cfg):
self.gamma = cfg.gamma self.gamma = cfg.gamma
self.policy_net = MLP(state_dim,hidden_dim=cfg.hidden_dim) self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
self.batch_size = cfg.batch_size self.batch_size = cfg.batch_size
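Since the MLP above outputs a single sigmoid probability, action selection samples from a Bernoulli distribution; a hedged sketch, not necessarily the repo's exact method:

```python
def choose_action(self, state):
    state = torch.from_numpy(state).float()
    prob = self.policy_net(state)   # probability of taking action 1
    m = Bernoulli(prob)
    action = m.sample()
    return int(action.item())       # 0 or 1 for CartPole
```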

View File

@@ -0,0 +1,152 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-02-10 06:13:21
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from itertools import count
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = "PolicyGradient" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.batch_size = 8 # mini-batch SGD中的批量大小
self.lr = 0.01 # 学习率
self.gamma = 0.99 # 强化学习中的折扣因子
self.hidden_dim = 36 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果

View File

@@ -1,136 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2021-10-16 00:34:13
Discription:
Environment:
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加父路径到系统路径sys.path
import gym
import torch
import datetime
from itertools import count
from PolicyGradient.agent import PolicyGradient
from common.plot import plot_rewards
from common.utils import save_results,make_dir
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class PGConfig:
def __init__(self):
self.algo = "PolicyGradient" # 算法名称
self.env = 'CartPole-v0' # 环境名称
self.result_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env + \
'/'+curr_time+'/models/' # 保存模型的路径
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
self.batch_size = 8
self.lr = 0.01 # 学习率
self.gamma = 0.99
self.hidden_dim = 36 # dimmension of hidden layer
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # check gpu
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env)
env.seed(seed)
state_dim = env.observation_space.shape[0]
agent = PolicyGradient(state_dim,cfg)
return env,agent
def train(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete training')
return rewards, ma_rewards
def eval(cfg,env,agent):
print('Start to eval !')
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('Episode:', i_ep, ' Reward:', ep_reward)
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('complete evaling')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = PGConfig()
# train
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
agent.save(path=cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
plot_rewards(rewards, ma_rewards, tag="train",
algo=cfg.algo, path=cfg.result_path)
# eval
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=cfg.model_path)
rewards,ma_rewards = eval(cfg,env,agent)
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)


Some files were not shown because too many files have changed in this diff