update

2021-12-22 11:19:13 +08:00
parent c257313d5b
commit 75df999258
55 changed files with 605 additions and 403 deletions
--- a/codes/DDPG/task0.py
+++ b/codes/DDPG/task0.py
@@ -34,7 +34,7 @@ class DDPGConfig:
        self.env_name = env_name # 环境名称
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
        self.train_eps = 300 # 训练的回合数
-        self.eval_eps = 50 # 测试的回合数
+        self.test_eps = 50 # 测试的回合数
        self.gamma = 0.99 # 折扣因子
        self.critic_lr = 1e-3 # 评论家网络的学习率
        self.actor_lr = 1e-4 # 演员网络的学习率
--- a/codes/DDPG/train.py
+++ b/codes/DDPG/train.py
@@ -42,7 +42,7 @@ def test(cfg, env, agent):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
    rewards = [] # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.eval_eps):
+    for i_ep in range(cfg.test_eps):
        state = env.reset() 
        done = False
        ep_reward = 0
@@ -59,6 +59,6 @@ def test(cfg, env, agent):
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.eval_eps}，奖励：{ep_reward:.1f}")
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
    print('完成测试！')
    return rewards, ma_rewards
--- a/codes/DQN/task0.py
+++ b/codes/DQN/task0.py
@@ -23,7 +23,7 @@ class DQNConfig:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
        self.train_eps = 200  # 训练的回合数
-        self.eval_eps = 30  # 测试的回合数
+        self.test_eps = 30  # 测试的回合数
        # 超参数
        self.gamma = 0.95  # 强化学习中的折扣因子
        self.epsilon_start = 0.90  # e-greedy策略中初始epsilon
--- a/codes/DQN/task1.py
+++ b/codes/DQN/task1.py
@@ -26,7 +26,7 @@ class DQNConfig:
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
        self.train_eps = 200  # 训练的回合数
-        self.eval_eps = 30  # 测试的回合数
+        self.test_eps = 30  # 测试的回合数
        # 超参数
        self.gamma = 0.95  # 强化学习中的折扣因子
        self.epsilon_start = 0.90  # e-greedy策略中初始epsilon
--- a/codes/DQN/train.ipynb
+++ b/codes/DQN/train.ipynb
@@ -180,7 +180,7 @@
    "        self.algo = \"DQN\"  # 算法名称\n",
    "        self.env = 'CartPole-v0' # 环境名称\n",
    "        self.train_eps = 200 # 训练的回合数\n",
-    "        self.eval_eps = 20 # 测试的回合数\n",
+    "        self.test_eps = 20 # 测试的回合数\n",
    "        self.gamma = 0.95 # 强化学习中的折扣因子\n",
    "        self.epsilon_start = 0.90 # e-greedy策略中初始epsilon\n",
    "        self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n",
@@ -365,7 +365,7 @@
    "    cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n",
    "    rewards = [] # 记录所有回合的奖励\n",
    "    ma_rewards = []  # 记录所有回合的滑动平均奖励\n",
-    "    for i_ep in range(cfg.eval_eps):\n",
+    "    for i_ep in range(cfg.test_eps):\n",
    "        ep_reward = 0 # 记录一回合内的奖励\n",
    "        state = env.reset() # 重置环境，返回初始状态\n",
    "        while True:\n",
@@ -381,7 +381,7 @@
    "        else:\n",
    "            ma_rewards.append(ep_reward)\n",
    "        if (i_ep+1)%3 == 0: \n",
-    "            print(f\"回合：{i_ep+1}/{cfg.eval_eps}, 奖励：{ep_reward:.1f}\")\n",
+    "            print(f\"回合：{i_ep+1}/{cfg.test_eps}, 奖励：{ep_reward:.1f}\")\n",
    "    print('完成测试！')\n",
    "    return rewards,ma_rewards\n",
    "\n",
--- a/codes/DQN/train.py
+++ b/codes/DQN/train.py
@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
-LastEditTime: 2021-09-15 15:34:13
+LastEditTime: 2021-12-22 11:08:04
@Discription: 
@Environment: python 3.7.7
 '''
@@ -30,13 +30,13 @@ def train(cfg, env, agent):
                break
        if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        if (i_ep+1)%10 == 0: 
            print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
        if (i_ep+1)%10 == 0: 
            print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
    print('完成训练！')
    return rewards, ma_rewards
@@ -48,7 +48,7 @@ def test(cfg,env,agent):
    cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
    rewards = [] # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.eval_eps):
+    for i_ep in range(cfg.test_eps):
        ep_reward = 0 # 记录一回合内的奖励
        state = env.reset() # 重置环境，返回初始状态
        while True:
@@ -63,7 +63,7 @@ def test(cfg,env,agent):
            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
        else:
            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.eval_eps}，奖励：{ep_reward:.1f}")
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
    print('完成测试！')
    return rewards,ma_rewards
@@ -89,7 +89,7 @@ if __name__ == "__main__":
            self.env_name = 'CartPole-v0' # 环境名称
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
            self.train_eps = 200 # 训练的回合数
-            self.eval_eps = 30 # 测试的回合数
+            self.test_eps = 30 # 测试的回合数
            # 超参数
            self.gamma = 0.95 # 强化学习中的折扣因子
            self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
--- a/codes/PPO/task0.py
+++ b/codes/PPO/task0.py
@@ -20,7 +20,7 @@ class PPOConfig:
        self.continuous = False # 环境是否为连续动作
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
        self.train_eps = 200 # 训练的回合数
-        self.eval_eps = 20 # 测试的回合数
+        self.test_eps = 20 # 测试的回合数
        self.batch_size = 5
        self.gamma=0.99
        self.n_epochs = 4
--- a/codes/PPO/task1.py
+++ b/codes/PPO/task1.py
@@ -20,7 +20,7 @@ class PPOConfig:
        self.continuous = True # 环境是否为连续动作
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
        self.train_eps = 200 # 训练的回合数
-        self.eval_eps = 20 # 测试的回合数
+        self.test_eps = 20 # 测试的回合数
        self.batch_size = 5
        self.gamma=0.99
        self.n_epochs = 4
--- a/codes/PPO/train.ipynb
+++ b/codes/PPO/train.ipynb
@@ -68,7 +68,7 @@
    "        self.result_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/results/'  # path to save results\n",
    "        self.model_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/models/'  # path to save models\n",
    "        self.train_eps = 200 # max training episodes\n",
-    "        self.eval_eps = 50\n",
+    "        self.test_eps = 50\n",
    "        self.batch_size = 5\n",
    "        self.gamma=0.99\n",
    "        self.n_epochs = 4\n",
@@ -144,7 +144,7 @@
    "    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n",
    "    rewards= []\n",
    "    ma_rewards = [] # moving average rewards\n",
-    "    for i_ep in range(cfg.eval_eps):\n",
+    "    for i_ep in range(cfg.test_eps):\n",
    "        state = env.reset()\n",
    "        done = False\n",
    "        ep_reward = 0\n",
--- a/codes/PPO/train.py
+++ b/codes/PPO/train.py
@@ -32,7 +32,7 @@ def eval(cfg,env,agent):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
    rewards = [] # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.eval_eps):
+    for i_ep in range(cfg.test_eps):
        state = env.reset()
        done = False
        ep_reward = 0
@@ -47,7 +47,7 @@ def eval(cfg,env,agent):
                0.9*ma_rewards[-1]+0.1*ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.eval_eps, ep_reward))
+        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.test_eps, ep_reward))
    print('完成训练！')
    return rewards,ma_rewards
@@ -74,7 +74,7 @@ if __name__ == '__main__':
            self.continuous = False # 环境是否为连续动作
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
            self.train_eps = 200 # 训练的回合数
-            self.eval_eps = 20 # 测试的回合数
+            self.test_eps = 20 # 测试的回合数
            self.batch_size = 5
            self.gamma=0.99
            self.n_epochs = 4
--- a/codes/PolicyGradient/model.py
+++ b/codes/PolicyGradient/model.py
@@ -5,21 +5,22 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-23 16:35:58
 LastEditor: John
-LastEditTime: 2021-03-23 16:36:20
+LastEditTime: 2021-12-21 23:21:26
 Discription: 
 Environment: 
 '''
 import torch.nn as nn
 import torch.nn.functional as F
 class MLP(nn.Module):
    ''' 多层感知机
        输入：state维度
        输出：概率
    '''
-    def __init__(self,state_dim,hidden_dim = 36):
+    def __init__(self,input_dim,hidden_dim = 36):
        super(MLP, self).__init__()
-        # 24和36为hidden layer的层数，可根据state_dim, action_dim的情况来改变
+        # 24和36为hidden layer的层数，可根据input_dim, action_dim的情况来改变
-        self.fc1 = nn.Linear(state_dim, hidden_dim)
+        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
--- a/codes/PolicyGradient/task0_train.py
+++ b/codes/PolicyGradient/task0_train.py
@@ -34,7 +34,7 @@ class PGConfig:
        self.model_path = curr_path+"/outputs/" + self.env + \
            '/'+curr_time+'/models/'  # 保存模型的路径
        self.train_eps = 300 # 训练的回合数
-        self.eval_eps = 30 # 测试的回合数
+        self.test_eps = 30 # 测试的回合数
        self.batch_size = 8
        self.lr = 0.01 # 学习率
        self.gamma = 0.99
@@ -94,7 +94,7 @@ def eval(cfg,env,agent):
    print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
    rewards = []
    ma_rewards = []
-    for i_ep in range(cfg.eval_eps):
+    for i_ep in range(cfg.test_eps):
        state = env.reset()
        ep_reward = 0
        for _ in count():
--- a/codes/QLearning/agent.py
+++ b/codes/QLearning/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-09-19 23:05:45
+LastEditTime: 2021-12-22 10:54:57
 Discription: use defaultdict to define Q table
 Environment: 
 '''
@@ -15,17 +15,17 @@ import torch
 from collections import defaultdict
 class QLearning(object):
-    def __init__(self,state_dim,
+    def __init__(self,n_states,
-                 action_dim,cfg):
+                 n_actions,cfg):
-        self.action_dim = action_dim  # dimension of acgtion
+        self.n_actions = n_actions 
-        self.lr = cfg.lr  # learning rate
+        self.lr = cfg.lr  # 学习率
        self.gamma = cfg.gamma  
        self.epsilon = 0 
        self.sample_count = 0  
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
-        self.Q_table  = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value)
+        self.Q_table  = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值（Q值）的映射，即Q表
    def choose_action(self, state):
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
@@ -34,7 +34,7 @@ class QLearning(object):
        if np.random.uniform(0, 1) > self.epsilon:
            action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
        else:
-            action = np.random.choice(self.action_dim) # 随机选择动作
+            action = np.random.choice(self.n_actions) # 随机选择动作
        return action
    def predict(self,state):
        action = np.argmax(self.Q_table[str(state)])
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards_curve_cn.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/eval_rewards_curve_cn.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards_curve_cn.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20210920-003309/results/train_rewards_curve_cn.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/test_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-110223/results/train_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/models/Qleaning_model.pkl
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/test_rewards_curve.png
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_ma_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards.npy
--- a/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png
+++ b/codes/QLearning/outputs/CliffWalking-v0/20211222-111747/results/train_rewards_curve.png
--- a/codes/QLearning/task0.ipynb
+++ b/codes/QLearning/task0.ipynb
--- a/codes/QLearning/task0.py
+++ b/codes/QLearning/task0.py
@@ -0,0 +1,93 @@
 #!/usr/bin/env python
 # coding=utf-8
 '''
 Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
 LastEditTime: 2021-12-22 11:13:23
 Discription: 
 Environment: 
 '''
 import sys
 import os
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
 import gym
 import torch
 import datetime
 from envs.gridworld_env import CliffWalkingWapper
 from QLearning.agent import QLearning
 from QLearning.train import train,test
 from common.utils import plot_rewards,plot_rewards_cn
 from common.utils import save_results,make_dir
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # current timestamp, used below to name the output folders
 algo_name = 'Q-learning'  # algorithm name
 env_name = 'CliffWalking-v0'  # environment name
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when CUDA is available, otherwise the CPU
 class QlearningConfig:
    '''Hyperparameters and run settings for Q-learning training and testing.

    The algorithm name, environment name and device are copied from the
    module-level `algo_name`, `env_name` and `device` values.
    '''

    def __init__(self):
        # --- run identification ---
        self.algo_name = algo_name  # name of the algorithm
        self.env_name = env_name  # name of the environment
        self.device = device  # torch device selected at module level

        # --- episode budget ---
        self.train_eps = 400  # number of training episodes
        self.test_eps = 30  # number of test episodes

        # --- learning hyperparameters ---
        self.gamma = 0.9  # reward discount factor
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
        self.epsilon_decay = 300  # decay rate of epsilon in the e-greedy policy
        self.lr = 0.1  # learning rate
 class PlotConfig:
    '''Settings used when saving and plotting results.

    Output paths are built as
    `<curr_path>/outputs/<env_name>/<curr_time>/{results,models}/`.
    '''

    def __init__(self) -> None:
        self.algo_name = algo_name  # name of the algorithm
        self.env_name = env_name  # name of the environment
        self.device = device  # torch device selected at module level
        # Both output folders share the same per-run root directory.
        run_root = curr_path + "/outputs/" + self.env_name + '/' + curr_time
        self.result_path = run_root + '/results/'  # where results are saved
        self.model_path = run_root + '/models/'  # where models are saved
        self.save = True  # whether to save the figures
 def env_agent_config(cfg,seed=1):
    '''Create the environment and the agent.

    Args:
        cfg: configuration object; `cfg.env_name` selects the gym
            environment and the whole object is passed to `QLearning`.
        seed (int, optional): random seed applied to the environment.
            Defaults to 1.

    Returns:
        env: the gym environment wrapped in `CliffWalkingWapper`.
        agent: a `QLearning` agent sized from the environment's spaces.
    '''
    env = CliffWalkingWapper(gym.make(cfg.env_name))
    env.seed(seed)  # seed the environment
    # `.n` of each space; presumably the number of discrete states / actions.
    agent = QLearning(env.observation_space.n, env.action_space.n, cfg)
    return env, agent
 # Build the training hyperparameters and the plotting/output settings.
 cfg = QlearningConfig()
 plot_cfg = PlotConfig()
 # Training phase
 env, agent = env_agent_config(cfg, seed=1)
 rewards, ma_rewards = train(cfg, env, agent)
 make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the folders for results and models
 agent.save(path=plot_cfg.model_path)  # save the model
 save_results(rewards, ma_rewards, tag='train',
            path=plot_cfg.result_path)  # save the training results
 plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot the training results
 # Testing phase: a new environment/agent pair is created with a different seed,
 # then the model saved above is loaded into the new agent.
 env, agent = env_agent_config(cfg, seed=10)
 agent.load(path=plot_cfg.model_path)  # load the model
 rewards, ma_rewards = test(cfg, env, agent)
 save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save the test results
 plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot the test results
--- a/codes/QLearning/task0_train.ipynb
+++ b/codes/QLearning/task0_train.ipynb
--- a/codes/QLearning/task0_train.py
+++ b/codes/QLearning/task0_train.py
@@ -1,126 +0,0 @@
 #!/usr/bin/env python
 # coding=utf-8
 '''
 Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
 LastEditTime: 2021-09-23 12:22:58
 Discription: 
 Environment: 
 '''
 import sys,os
 curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前路径
 parent_path=os.path.dirname(curr_path) # 父路径，这里就是我们的项目路径
 sys.path.append(parent_path) # 由于需要引用项目路径下的其他模块比如envs，所以需要添加路径到sys.path
 import gym
 import torch
 import datetime
 from envs.gridworld_env import CliffWalkingWapper
 from QLearning.agent import QLearning
 from common.plot import plot_rewards,plot_rewards_cn
 from common.utils import save_results,make_dir
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # current timestamp, used to name the output folders
 class QlearningConfig:
    '''Hyperparameters, output paths and device for Q-learning training.

    Output paths are built as
    `<curr_path>/outputs/<env>/<curr_time>/{results,models}/`.
    '''

    def __init__(self):
        self.algo = 'Q-learning'  # name of the algorithm
        self.env = 'CliffWalking-v0'  # name of the environment
        # Both output folders share the same per-run root directory.
        run_root = curr_path+"/outputs/" +self.env+'/'+curr_time
        self.result_path = run_root+'/results/'  # where results are saved
        self.model_path = run_root+'/models/'  # where models are saved
        self.train_eps = 400  # number of training episodes
        self.eval_eps = 30  # number of evaluation episodes
        self.gamma = 0.9  # reward discount factor
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
        self.epsilon_decay = 300  # decay rate of epsilon in the e-greedy policy
        self.lr = 0.1  # learning rate
        # Use the GPU when CUDA is available, otherwise the CPU.
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
 def env_agent_config(cfg,seed=1):
    '''Create the environment and the agent.

    Args:
        cfg: configuration object; `cfg.env` selects the gym environment
            and the whole object is passed to `QLearning`.
        seed (int, optional): random seed applied to the environment.
            Defaults to 1.

    Returns:
        env: the gym environment wrapped in `CliffWalkingWapper`.
        agent: a `QLearning` agent sized from the environment's spaces.
    '''
    env = CliffWalkingWapper(gym.make(cfg.env))
    env.seed(seed)  # seed the environment
    # `.n` of each space; presumably the number of discrete states / actions.
    agent = QLearning(env.observation_space.n, env.action_space.n, cfg)
    return env, agent
 def train(cfg,env,agent):
    '''Run the training loop and collect per-episode rewards.

    Args:
        cfg: configuration object providing `env`, `algo`, `device`
            (used only for logging) and `train_eps` (number of episodes).
        env: environment exposing `reset()` and
            `step(action) -> (next_state, reward, done, info)`.
        agent: agent exposing `choose_action(state)` and
            `update(state, action, reward, next_state, done)`.

    Returns:
        rewards: list with the total reward of every episode.
        ma_rewards: list with the moving average of the episode rewards
            (0.9 * previous average + 0.1 * current episode reward).
    '''
    print('开始训练！')
    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
    rewards = []  # total reward of each episode
    ma_rewards = []  # moving average of the episode rewards
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated in the current episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # pick an action with the agent's policy
            next_state, reward, done, _ = env.step(action)  # one interaction with the environment
            # Progress is reported once per episode below; nothing is printed per step.
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # move to the next state
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
        else:
            # The first episode seeds the moving average.
            ma_rewards.append(ep_reward)
        print("回合数：{}/{}，奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
    print('完成训练！')
    return rewards,ma_rewards
 def eval(cfg,env,agent):
    '''Run the evaluation loop and collect per-episode rewards.

    The agent's Q table entries are printed first, then `cfg.eval_eps`
    episodes are played with `agent.predict`; the agent is not updated.

    Args:
        cfg: configuration object providing `env`, `algo`, `device`
            (used only for logging) and `eval_eps` (number of episodes).
        env: environment exposing `reset()` and
            `step(action) -> (next_state, reward, done, info)`.
        agent: agent exposing `Q_table` (a mapping) and `predict(state)`.

    Returns:
        rewards: list with the total reward of every episode.
        ma_rewards: list with the moving average of the episode rewards.
    '''
    print('开始测试！')
    print(f'环境：{cfg.env}, 算法：{cfg.algo}, 设备：{cfg.device}')
    # Dump the Q table before evaluating.
    for entry in agent.Q_table.items():
        print(entry)
    rewards, ma_rewards = [], []
    for i_ep in range(cfg.eval_eps):
        ep_reward = 0  # reward accumulated in the current episode
        state = env.reset()  # start a new episode
        done = False
        while not done:
            action = agent.predict(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        # The first episode seeds the moving average.
        smoothed = ma_rewards[-1]*0.9+ep_reward*0.1 if ma_rewards else ep_reward
        rewards.append(ep_reward)
        ma_rewards.append(smoothed)
        print(f"回合数：{i_ep+1}/{cfg.eval_eps}, 奖励：{ep_reward:.1f}")
    print('完成测试！')
    return rewards,ma_rewards
 if __name__ == "__main__":
    cfg = QlearningConfig()
    # Training phase
    env,agent = env_agent_config(cfg,seed=0)
    rewards,ma_rewards = train(cfg,env,agent)
    make_dir(cfg.result_path,cfg.model_path) # create the output folders
    agent.save(path=cfg.model_path) # save the model
    # Print every Q table entry after training.
    for item in agent.Q_table.items():
        print(item)
    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) # save the training results
    plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
    # Testing phase: a new environment/agent pair with a different seed, then the saved model is loaded
    env,agent = env_agent_config(cfg,seed=10)
    agent.load(path=cfg.model_path) # load the model
    rewards,ma_rewards = eval(cfg,env,agent)
    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
    plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
--- a/codes/QLearning/train.py
+++ b/codes/QLearning/train.py
@@ -0,0 +1,51 @@
 def train(cfg,env,agent):
    '''Run the training loop and collect per-episode rewards.

    Args:
        cfg: configuration object providing `env_name`, `algo_name`,
            `device` (used only for logging) and `train_eps`
            (number of episodes).
        env: environment exposing `reset()` and
            `step(action) -> (next_state, reward, done, info)`.
        agent: agent exposing `choose_action(state)` and
            `update(state, action, reward, next_state, done)`.

    Returns:
        rewards: list with the total reward of every episode.
        ma_rewards: list with the moving average of the episode rewards
            (0.9 * previous average + 0.1 * current episode reward).
    '''
    print('开始训练！')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # total reward of each episode
    ma_rewards = []  # moving average of the episode rewards
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated in the current episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # pick an action with the agent's policy
            next_state, reward, done, _ = env.step(action)  # one interaction with the environment
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # move to the next state
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
        else:
            # The first episode seeds the moving average.
            ma_rewards.append(ep_reward)
        # Progress is reported for every episode, mirroring test().
        print("回合数：{}/{}，奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
    print('完成训练！')
    return rewards,ma_rewards
 def test(cfg,env,agent):
    print('开始测试！')
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
    for item in agent.Q_table.items():
        print(item)
    rewards = []  # 记录所有回合的奖励
    ma_rewards = [] # 滑动平均的奖励
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录每个episode的reward
        state = env.reset()  # 重置环境, 重新开一局（即开始新的一个回合）
        while True:
            action = agent.predict(state)  # 根据算法选择一个动作
            next_state, reward, done, _ = env.step(action)  # 与环境进行一个交互
            state = next_state  # 更新状态
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
        else:
            ma_rewards.append(ep_reward)
        print(f"回合数：{i_ep+1}/{cfg.test_eps}, 奖励：{ep_reward:.1f}")
    print('完成测试！')
    return rewards,ma_rewards
--- a/codes/README.md
+++ b/codes/README.md
@@ -13,6 +13,7 @@
 其中```model.py```,```memory.py```,```plot.py``` 由于不同算法都会用到，所以放入```common```文件夹中。
 **注意：新版本中将```model```,```memory```相关内容全部放到了```agent.py```里面，```plot```放到了```common.utils```中。**
 ## 运行环境
 python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
--- a/codes/SAC/task0_train.ipynb
+++ b/codes/SAC/task0_train.ipynb
@@ -45,7 +45,7 @@
    "        self.model_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/models/'  # path to save models\n",
    "        self.train_eps = 300\n",
    "        self.train_steps = 500\n",
-    "        self.eval_eps = 50\n",
+    "        self.test_eps = 50\n",
    "        self.eval_steps = 500\n",
    "        self.gamma = 0.99\n",
    "        self.mean_lambda=1e-3\n",
@@ -121,7 +121,7 @@
    "    print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')\n",
    "    rewards  = []\n",
    "    ma_rewards = [] # moveing average reward\n",
-    "    for i_ep in range(cfg.eval_eps):\n",
+    "    for i_ep in range(cfg.test_eps):\n",
    "        state = env.reset()\n",
    "        ep_reward = 0\n",
    "        for i_step in range(cfg.eval_steps):\n",
--- a/codes/SAC/task0_train.py
+++ b/codes/SAC/task0_train.py
@@ -33,7 +33,7 @@ class SACConfig:
        self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/'  # path to save models
        self.train_eps = 300
        self.train_steps = 500
-        self.eval_eps = 50
+        self.test_eps = 50
        self.eval_steps = 500
        self.gamma = 0.99
        self.mean_lambda=1e-3
@@ -96,7 +96,7 @@ def eval(cfg,env,agent):
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
    rewards  = []
    ma_rewards = [] # moveing average reward
-    for i_ep in range(cfg.eval_eps):
+    for i_ep in range(cfg.test_eps):
        state = env.reset()
        ep_reward = 0
        for i_step in range(cfg.eval_steps):
--- a/codes/Sarsa/task0_train.py
+++ b/codes/Sarsa/task0_train.py
@@ -31,7 +31,7 @@ class SarsaConfig:
        self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/'  # path to save results
        self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/'  # path to save models
        self.train_eps = 200
-        self.eval_eps = 50
+        self.test_eps = 50
        self.epsilon = 0.15 # epsilon: The probability to select a random action . 
        self.gamma = 0.9 # gamma: Gamma discount factor.
        self.lr = 0.2 # learning rate: step size parameter
@@ -74,7 +74,7 @@ def train(cfg,env,agent):
 def eval(cfg,env,agent):
    rewards = []
    ma_rewards = []
-    for i_episode in range(cfg.eval_eps):
+    for i_episode in range(cfg.test_eps):
        # Print out which episode we're on, useful for debugging.
        # Generate an episode.
        # An episode is an array of (state, action, reward) tuples
@@ -94,7 +94,7 @@ def eval(cfg,env,agent):
            ma_rewards.append(ep_reward)
        rewards.append(ep_reward)
        if (i_episode+1)%10==0:
-            print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.eval_eps,ep_reward))
+            print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.test_eps,ep_reward))
    print('Complete evaling！')
    return rewards,ma_rewards
--- a/codes/TD3/README.md
+++ b/codes/TD3/README.md
@@ -0,0 +1 @@
 这是对[Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)](https://arxiv.org/abs/1802.09477)的复现
--- a/codes/TD3/agent.py
+++ b/codes/TD3/agent.py
@@ -1,3 +1,13 @@
 #!/usr/bin/env python
 # coding=utf-8
 '''
 Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 10:40:05
 LastEditor: JiangJi
 LastEditTime: 2021-12-22 10:43:55
 Discription: 
 '''
 import copy
 import numpy as np
 import torch
@@ -5,40 +15,41 @@ import torch.nn as nn
 import torch.nn.functional as F
 from TD3.memory import ReplayBuffer
 # Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)
 # Paper: https://arxiv.org/abs/1802.09477
 class Actor(nn.Module):
-	def __init__(self, state_dim, action_dim, max_action):
+	
 	def __init__(self, input_dim, output_dim, max_action):
 		'''[summary]
 		Args:
 			input_dim (int): 输入维度，这里等于n_states
 			output_dim (int): 输出维度，这里等于n_actions
 			max_action (int): action的最大值
 		'''		
 		super(Actor, self).__init__()
-		self.l1 = nn.Linear(state_dim, 256)
+		self.l1 = nn.Linear(input_dim, 256)
 		self.l2 = nn.Linear(256, 256)
-		self.l3 = nn.Linear(256, action_dim)
+		self.l3 = nn.Linear(256, output_dim)
 		self.max_action = max_action
-		
+	
 	def forward(self, state):
 		a = F.relu(self.l1(state))
 		a = F.relu(self.l2(a))
 		return self.max_action * torch.tanh(self.l3(a))
 class Critic(nn.Module):
-	def __init__(self, state_dim, action_dim):
+	def __init__(self, input_dim, output_dim):
 		super(Critic, self).__init__()
 		# Q1 architecture
-		self.l1 = nn.Linear(state_dim + action_dim, 256)
+		self.l1 = nn.Linear(input_dim + output_dim, 256)
 		self.l2 = nn.Linear(256, 256)
 		self.l3 = nn.Linear(256, 1)
 		# Q2 architecture
-		self.l4 = nn.Linear(state_dim + action_dim, 256)
+		self.l4 = nn.Linear(input_dim + output_dim, 256)
 		self.l5 = nn.Linear(256, 256)
 		self.l6 = nn.Linear(256, 1)
@@ -68,8 +79,8 @@ class Critic(nn.Module):
 class TD3(object):
 	def __init__(
 		self,
-		state_dim,
+		input_dim,
-		action_dim,
+		output_dim,
 		max_action,
 		cfg,
 	):
@@ -83,14 +94,14 @@ class TD3(object):
 		self.device = cfg.device
 		self.total_it = 0
-		self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
+		self.actor = Actor(input_dim, output_dim, max_action).to(self.device)
 		self.actor_target = copy.deepcopy(self.actor)
 		self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)
-		self.critic = Critic(state_dim, action_dim).to(self.device)
+		self.critic = Critic(input_dim, output_dim).to(self.device)
 		self.critic_target = copy.deepcopy(self.critic)
 		self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
-		self.memory = ReplayBuffer(state_dim, action_dim)
+		self.memory = ReplayBuffer(input_dim, output_dim)
 	def choose_action(self, state):
 		state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
		`@@ -0,0 +1 @@`
							`这是对[Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)](https://arxiv.org/abs/1802.09477)的复现`