update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
--- a/codes/PPO/README.md
+++ b/codes/PPO/README.md
@@ -57,16 +57,16 @@ model就是actor和critic两个网络了：
 import torch.nn as nn
 from torch.distributions.categorical import Categorical
 class Actor(nn.Module):
-    def __init__(self,state_dim, action_dim,
+    def __init__(self,n_states, n_actions,
            hidden_dim=256):
        super(Actor, self).__init__()

        self.actor = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
+                nn.Linear(n_states, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
-                nn.Linear(hidden_dim, action_dim),
+                nn.Linear(hidden_dim, n_actions),
                nn.Softmax(dim=-1)
        )
    def forward(self, state):
@@ -75,10 +75,10 @@ class Actor(nn.Module):
        return dist

 class Critic(nn.Module):
-    def __init__(self, state_dim,hidden_dim=256):
+    def __init__(self, n_states,hidden_dim=256):
        super(Critic, self).__init__()
        self.critic = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
+                nn.Linear(n_states, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(),
@@ -88,7 +88,7 @@ class Critic(nn.Module):
        value = self.critic(state)
        return value
 ```
-这里Actor就是得到一个概率分布(Categorica，也可以是别的分布，可以搜索torch distributionsl)，critc根据当前状态得到一个值，这里的输入维度可以是```state_dim+action_dim```，即将action信息也纳入critic网络中，这样会更好一些，感兴趣的小伙伴可以试试。
+这里Actor就是得到一个概率分布(Categorical，也可以是别的分布，可以搜索torch distributions)，critic根据当前状态得到一个值，这里的输入维度可以是```n_states+n_actions```，即将action信息也纳入critic网络中，这样会更好一些，感兴趣的小伙伴可以试试。

 ### PPO update
 定义一个update函数主要实现伪代码中的第六步和第七步：
--- a/codes/PPO/memory.py
+++ b/codes/PPO/memory.py
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-23 15:30:46
-LastEditor: John
-LastEditTime: 2021-09-26 22:00:07
-Discription: 
-Environment: 
-'''
-import numpy as np
-class PPOMemory:
-    def __init__(self, batch_size):
-        self.states = []
-        self.probs = []
-        self.vals = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.batch_size = batch_size
-    def sample(self):
-        batch_step = np.arange(0, len(self.states), self.batch_size)
-        indices = np.arange(len(self.states), dtype=np.int64)
-        np.random.shuffle(indices)
-        batches = [indices[i:i+self.batch_size] for i in batch_step]
-        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
-                np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
-                
-    def push(self, state, action, probs, vals, reward, done):
-        self.states.append(state)
-        self.actions.append(action)
-        self.probs.append(probs)
-        self.vals.append(vals)
-        self.rewards.append(reward)
-        self.dones.append(done)
-
-    def clear(self):
-        self.states = []
-        self.probs = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.vals = []
--- a/codes/PPO/model.py
+++ b/codes/PPO/model.py
@@ -1,44 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-23 15:29:24
-LastEditor: John
-LastEditTime: 2021-04-08 22:36:43
-Discription: 
-Environment: 
-'''
-import torch.nn as nn
-from torch.distributions.categorical import Categorical
-class Actor(nn.Module):
-    def __init__(self,state_dim, action_dim,
-            hidden_dim):
-        super(Actor, self).__init__()
-
-        self.actor = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, action_dim),
-                nn.Softmax(dim=-1)
-        )
-    def forward(self, state):
-        dist = self.actor(state)
-        dist = Categorical(dist)
-        return dist
-
-class Critic(nn.Module):
-    def __init__(self, state_dim,hidden_dim):
-        super(Critic, self).__init__()
-        self.critic = nn.Sequential(
-                nn.Linear(state_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, 1)
-        )
-    def forward(self, state):
-        value = self.critic(state)
-        return value
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_actor.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_actor.pt
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_critic.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/models/ppo_critic.pt
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/eval_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/eval_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211117-184614/results/train_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_actor.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_actor.pt
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_critic.pt
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/models/ppo_critic.pt
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/test_rewards_curve.png
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_ma_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_ma_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards.npy
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards.npy
--- a/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards_curve.png
+++ b/codes/PPO/outputs/CartPole-v0/20211231-193837/results/train_rewards_curve.png
--- a/codes/PPO/agent.py
+++ b/codes/PPO/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-23 15:17:42
 LastEditor: John
-LastEditTime: 2021-09-26 22:02:00
+LastEditTime: 2021-12-31 19:38:33
 Discription: 
 Environment: 
 '''
@@ -13,25 +13,89 @@ import os
 import numpy as np
 import torch 
 import torch.optim as optim
-from PPO.model import Actor,Critic
-from PPO.memory import PPOMemory
+import torch.nn as nn
+from torch.distributions.categorical import Categorical
+class PPOMemory:
+    def __init__(self, batch_size):
+        self.states = []
+        self.probs = []
+        self.vals = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.batch_size = batch_size
+    def sample(self):
+        batch_step = np.arange(0, len(self.states), self.batch_size)
+        indices = np.arange(len(self.states), dtype=np.int64)
+        np.random.shuffle(indices)
+        batches = [indices[i:i+self.batch_size] for i in batch_step]
+        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
+                np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
+                
+    def push(self, state, action, probs, vals, reward, done):
+        self.states.append(state)
+        self.actions.append(action)
+        self.probs.append(probs)
+        self.vals.append(vals)
+        self.rewards.append(reward)
+        self.dones.append(done)
+
+    def clear(self):
+        self.states = []
+        self.probs = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.vals = []
+class Actor(nn.Module):
+    def __init__(self,n_states, n_actions,
+            hidden_dim):
+        super(Actor, self).__init__()
+
+        self.actor = nn.Sequential(
+                nn.Linear(n_states, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, n_actions),
+                nn.Softmax(dim=-1)
+        )
+    def forward(self, state):
+        dist = self.actor(state)
+        dist = Categorical(dist)
+        return dist
+
+class Critic(nn.Module):
+    def __init__(self, n_states,hidden_dim):
+        super(Critic, self).__init__()
+        self.critic = nn.Sequential(
+                nn.Linear(n_states, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, hidden_dim),
+                nn.ReLU(),
+                nn.Linear(hidden_dim, 1)
+        )
+    def forward(self, state):
+        value = self.critic(state)
+        return value
 class PPO:
-    def __init__(self, state_dim, action_dim,cfg):
+    def __init__(self, n_states, n_actions,cfg):
        self.gamma = cfg.gamma
        self.continuous = cfg.continuous 
        self.policy_clip = cfg.policy_clip
        self.n_epochs = cfg.n_epochs
        self.gae_lambda = cfg.gae_lambda
        self.device = cfg.device
-        self.actor = Actor(state_dim, action_dim,cfg.hidden_dim).to(self.device)
-        self.critic = Critic(state_dim,cfg.hidden_dim).to(self.device)
+        self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
+        self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
        self.memory = PPOMemory(cfg.batch_size)
        self.loss = 0

    def choose_action(self, state):
-        state = torch.tensor([state], dtype=torch.float).to(self.device)
+        state = np.array([state]) # 先转成数组再转tensor更高效
+        state = torch.tensor(state, dtype=torch.float).to(self.device)
        dist = self.actor(state)
        value = self.critic(state)
        action = dist.sample()
--- a/codes/PPO/task0.py
+++ b/codes/PPO/task0.py
@@ -5,63 +5,127 @@ sys.path.append(parent_path) # 添加路径到系统路径

 import gym
 import torch
+import numpy as np
 import datetime
-from common.plot import plot_rewards
+from common.utils import plot_rewards
 from common.utils import save_results,make_dir
-from PPO.agent import PPO
-from PPO.train import train
+from ppo2 import PPO

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

-class PPOConfig:
+class Config:
    def __init__(self) -> None:
-        self.algo = "DQN"  # 算法名称
+        ################################## 环境超参数 ###################################
+        self.algo_name = "DQN"  # 算法名称
        self.env_name = 'CartPole-v0' # 环境名称
        self.continuous = False # 环境是否为连续动作
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 200 # 训练的回合数
        self.test_eps = 20 # 测试的回合数
-        self.batch_size = 5
-        self.gamma=0.99
+        ################################################################################
+        
+        ################################## 算法超参数 ####################################
+        self.batch_size = 5  # mini-batch SGD中的批量大小
+        self.gamma = 0.95  # 强化学习中的折扣因子
        self.n_epochs = 4
-        self.actor_lr = 0.0003
-        self.critic_lr = 0.0003
-        self.gae_lambda=0.95
-        self.policy_clip=0.2
+        self.actor_lr = 0.0003 # actor的学习率
+        self.critic_lr = 0.0003 # critic的学习率
+        self.gae_lambda = 0.95
+        self.policy_clip = 0.2
        self.hidden_dim = 256
-        self.update_fre = 20 # frequency of agent update
-
-class PlotConfig:
-    def __init__(self) -> None:
-        self.algo = "DQN"  # 算法名称
-        self.env_name = 'CartPole-v0' # 环境名称
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.update_fre = 20 # 策略更新频率
+        ################################################################################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/results/'  # 保存结果的路径
        self.model_path = curr_path+"/outputs/" + self.env_name + \
            '/'+curr_time+'/models/'  # 保存模型的路径
        self.save = True # 是否保存图片
+        ################################################################################
+        
+def env_agent_config(cfg):
+    ''' 创建环境和智能体
+    '''
+    env = gym.make(cfg.env_name)  # 创建环境
+    n_states = env.observation_space.shape[0]  # 状态维度
+    if cfg.continuous:
+        n_actions = env.action_space.shape[0] # 动作维度
+    else:
+        n_actions = env.action_space.n  # 动作维度
+    agent = PPO(n_states, n_actions, cfg)  # 创建智能体
+    if cfg.seed !=0: # 设置随机种子
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    return env, agent

-def env_agent_config(cfg,seed=1):
-    env = gym.make(cfg.env_name)  
-    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.n
-    agent = PPO(state_dim,action_dim,cfg)
-    return env,agent
+def train(cfg,env,agent):
+    print('开始训练！')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = 0
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            action, prob, val = agent.choose_action(state)
+            state_, reward, done, _ = env.step(action)
+            steps += 1
+            ep_reward += reward
+            agent.memory.push(state, action, prob, val, reward, done)
+            if steps % cfg.update_fre == 0:
+                agent.update()
+            state = state_
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep+1)%10 == 0: 
+            print(f"回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward:.2f}")
+    print('完成训练！')
+    return rewards,ma_rewards

-cfg  = PPOConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
-agent.save(path=plot_cfg.model_path)
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)
-rewards,ma_rewards = eval(cfg,env,agent)
-save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
-plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            action, prob, val = agent.choose_action(state)
+            state_, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = state_
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.test_eps, ep_reward))
+    print('完成训练！')
+    return rewards,ma_rewards
+
+if __name__ == "__main__":
+    cfg  = Config()
+    # 训练
+    env,agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
+    agent.save(path=cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")
+    # 测试
+    env,agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)
+    rewards,ma_rewards = test(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag='test',path=cfg.result_path)
+    plot_rewards(rewards,ma_rewards,cfg,tag="test")
--- a/codes/PPO/task1.py
+++ b/codes/PPO/task1.py
@@ -6,10 +6,9 @@ sys.path.append(parent_path) # 添加路径到系统路径
 import gym
 import torch
 import datetime
-from common.plot import plot_rewards
+from common.utils import plot_rewards
 from common.utils import save_results,make_dir
-from PPO.agent import PPO
-from PPO.train import train
+from ppo2 import PPO

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

@@ -45,9 +44,9 @@ class PlotConfig:
 def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)  
    env.seed(seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
-    agent = PPO(state_dim,action_dim,cfg)
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = PPO(n_states,n_actions,cfg)
    return env,agent


--- a/codes/PPO/train.ipynb
+++ b/codes/PPO/train.ipynb
--- a/codes/PPO/train.py
+++ b/codes/PPO/train.py
@@ -1,121 +0,0 @@
-def train(cfg,env,agent):
-    print('开始训练！')
-    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    steps = 0
-    for i_ep in range(cfg.train_eps):
-        state = env.reset()
-        done = False
-        ep_reward = 0
-        while not done:
-            action, prob, val = agent.choose_action(state)
-            state_, reward, done, _ = env.step(action)
-            steps += 1
-            ep_reward += reward
-            agent.memory.push(state, action, prob, val, reward, done)
-            if steps % cfg.update_fre == 0:
-                agent.update()
-            state = state_
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        if (i_ep+1)%10 == 0: 
-            print(f"回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward:.2f}")
-    print('完成训练！')
-    return rewards,ma_rewards
-
-def eval(cfg,env,agent):
-    print('开始测试!')
-    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    rewards = [] # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.test_eps):
-        state = env.reset()
-        done = False
-        ep_reward = 0
-        while not done:
-            action, prob, val = agent.choose_action(state)
-            state_, reward, done, _ = env.step(action)
-            ep_reward += reward
-            state = state_
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.test_eps, ep_reward))
-    print('完成训练！')
-    return rewards,ma_rewards
-
-if __name__ == '__main__':
-    import sys,os
-    curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-    parent_path = os.path.dirname(curr_path) # 父路径
-    sys.path.append(parent_path) # 添加路径到系统路径
-
-    import gym
-    import torch
-    import datetime
-    from common.plot import plot_rewards
-    from common.utils import save_results,make_dir
-    from PPO.agent import PPO
-    from PPO.train import train
-
-    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-
-    class PPOConfig:
-        def __init__(self) -> None:
-            self.algo = "DQN"  # 算法名称
-            self.env_name = 'CartPole-v0' # 环境名称
-            self.continuous = False # 环境是否为连续动作
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
-            self.train_eps = 200 # 训练的回合数
-            self.test_eps = 20 # 测试的回合数
-            self.batch_size = 5
-            self.gamma=0.99
-            self.n_epochs = 4
-            self.actor_lr = 0.0003
-            self.critic_lr = 0.0003
-            self.gae_lambda=0.95
-            self.policy_clip=0.2
-            self.hidden_dim = 256
-            self.update_fre = 20 # frequency of agent update
-
-    class PlotConfig:
-        def __init__(self) -> None:
-            self.algo = "DQN"  # 算法名称
-            self.env_name = 'CartPole-v0' # 环境名称
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
-            self.result_path = curr_path+"/outputs/" + self.env_name + \
-                '/'+curr_time+'/results/'  # 保存结果的路径
-            self.model_path = curr_path+"/outputs/" + self.env_name + \
-                '/'+curr_time+'/models/'  # 保存模型的路径
-            self.save = True # 是否保存图片
-
-    def env_agent_config(cfg,seed=1):
-        env = gym.make(cfg.env_name)  
-        env.seed(seed)
-        state_dim = env.observation_space.shape[0]
-        action_dim = env.action_space.n
-        agent = PPO(state_dim,action_dim,cfg)
-        return env,agent
-
-    cfg  = PPOConfig()
-    plot_cfg = PlotConfig()
-    # 训练
-    env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹
-    agent.save(path=plot_cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
-    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
-    # 测试
-    env,agent = env_agent_config(cfg,seed=10)
-    agent.load(path=plot_cfg.model_path)
-    rewards,ma_rewards = eval(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag='eval',path=plot_cfg.result_path)
-    plot_rewards(rewards,ma_rewards,plot_cfg,tag="eval")