update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
--- a/codes/DQN/task0.py
+++ b/codes/DQN/task0.py
@@ -1,5 +1,7 @@
 import sys
 import os
+import torch.nn as nn
+import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
 import torch
 import datetime
 import numpy as np
-from common.utils import save_results, make_dir
+from common.utils import save_results_1, make_dir
 from common.utils import plot_rewards
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

+class MLP(nn.Module):
+    def __init__(self, n_states, n_actions, hidden_dim=128):
+        """Build a fully connected Q-network with two hidden layers.
+
+        n_states: number of input features (the environment's state dimension).
+        n_actions: size of the output (the action dimension).
+        """
+        super().__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+
+    def forward(self, x):
+        # ReLU follows each of the first two layers; the last layer is linear.
+        hidden = F.relu(self.fc2(F.relu(self.fc1(x))))
+        return self.fc3(hidden)

 class Config:
    '''超参数
    '''

    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DQN'  # 算法名称
-        self.env_name = 'CartPole-v0'  # 环境名称
+        ############################### hyperparameters ################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 200  # 训练的回合数
-        self.test_eps = 30  # 测试的回合数
+        self.test_eps = 20  # 测试的回合数
        ################################################################################
        
        ################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
        self.target_update = 4  # 目标网络的更新频率
        self.hidden_dim = 256  # 网络隐藏层
        ################################################################################
-
-        ################################# 保存结果相关参数 ##############################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # 保存结果的路径
        self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
    ''' 创建环境和智能体
    '''
    env = gym.make(cfg.env_name)  # 创建环境
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)
+    agent = DQN(n_actions, model, cfg)  # 创建智能体
    if cfg.seed !=0: # 设置随机种子
        torch.manual_seed(cfg.seed)
        env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step += 1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
                break
        if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 10 == 0:
-            print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
-    print('完成训练！')
+        if (i_ep + 1) % 1 == 0:  # log interval of 1: report after every training episode
+            print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('Finish training!')
    env.close()
-    return rewards, ma_rewards
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic


 def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
    ################################################################################
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step+=1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            state = next_state  # 更新下一个状态
            ep_reward += reward  # 累加奖励
            if done:
                break
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+        print(f'Episode：{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')  # total is the test-episode count
    print('完成测试！')
    env.close()
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}


 if __name__ == "__main__":
    cfg = Config()
    # 训练
    env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)  # dict with keys 'rewards', 'ma_rewards', 'steps'
    make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
    agent.save(path=cfg.model_path)  # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
+    save_results_1(res_dic, tag='train',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # plot the training results
    # 测试
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
+    res_dic = test(cfg, env, agent)  # dict with keys 'rewards', 'ma_rewards', 'steps'
+    save_results_1(res_dic, tag='test',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # plot the test results