update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
@@ -50,15 +50,15 @@ import torch.nn as nn
 import torch.nn.functional as F

 class FCN(nn.Module):
-    def __init__(self, state_dim=4, action_dim=18):
+    def __init__(self, n_states=4, n_actions=18):
        """ 初始化q网络，为全连接网络
-            state_dim: 输入的feature即环境的state数目
-            action_dim: 输出的action总个数
+            n_states: 输入的feature即环境的state数目
+            n_actions: 输出的action总个数
        """
        super(FCN, self).__init__()
-        self.fc1 = nn.Linear(state_dim, 128) # 输入层
+        self.fc1 = nn.Linear(n_states, 128) # 输入层
        self.fc2 = nn.Linear(128, 128) # 隐藏层
-        self.fc3 = nn.Linear(128, action_dim) # 输出层
+        self.fc3 = nn.Linear(128, n_actions) # 输出层
        
    def forward(self, x):
        # 各层对应的激活函数
@@ -66,7 +66,7 @@ class FCN(nn.Module):
        x = F.relu(self.fc2(x))
        return self.fc3(x)
 ```
-输入为state_dim，输出为action_dim，包含一个128维度的隐藏层，这里根据需要可增加隐藏层维度和数量，然后一般使用relu激活函数，这里跟深度学习的网路设置是一样的。
+输入为n_states，输出为n_actions，包含一个128维的隐藏层，可根据需要增加隐藏层的维度和数量；激活函数一般使用ReLU，这与深度学习中常见的网络设置是一样的。

 ### Replay Buffer

@@ -107,8 +107,8 @@ class ReplayBuffer:
 在类中建立两个网络，以及optimizer和memory，

 ```python
-self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.policy_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device)
 for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
    target_param.data.copy_(param.data)
 self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
@@ -124,7 +124,7 @@ def choose_action(self, state):
    if random.random() > self.epsilon(self.frame_idx):
        action = self.predict(state)
    else:
-        action = random.randrange(self.action_dim)
+        action = random.randrange(self.n_actions)
    return action
 ```

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
-LastEditTime: 2021-12-22 14:01:37
+LastEditTime: 2022-03-02 11:05:11
@Discription: 
@Environment: python 3.7.7
 '''
@@ -20,22 +20,7 @@ import random
 import math
 import numpy as np

-class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
-        """ 初始化q网络，为全连接网络
-            state_dim: 输入的特征数即环境的状态维度
-            action_dim: 输出的动作维度
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
-        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
-        
-    def forward(self, x):
-        # 各层对应的激活函数
-        x = F.relu(self.fc1(x)) 
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
+

 class ReplayBuffer:
    def __init__(self, capacity):
@@ -62,9 +47,9 @@ class ReplayBuffer:
        return len(self.buffer)

 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_actions,model,cfg):

-        self.action_dim = action_dim  # 总的动作个数
+        self.n_actions = n_actions  # 总的动作个数
        self.device = cfg.device  # 设备，cpu或gpu等
        self.gamma = cfg.gamma  # 奖励的折扣因子
        # e-greedy策略相关参数
@@ -73,8 +58,8 @@ class DQN:
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.policy_net = model.to(self.device)
+        self.target_net = model.to(self.device)
        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
            target_param.data.copy_(param.data)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -86,23 +71,24 @@ class DQN:
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            with torch.no_grad():
-                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # 选择Q值最大的动作
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):
        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
            return
        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        # print('updating')
+        
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
            self.batch_size)
-        # 转为张量
-        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  
        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  
-        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
        next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
@@ -70,9 +70,9 @@ class ReplayBuffer:
        return len(self.buffer)

 class DQN:
-    def __init__(self, state_dim, action_dim, cfg):
+    def __init__(self, n_states, n_actions, cfg):

-        self.action_dim = action_dim  # 总的动作个数
+        self.n_actions = n_actions  # 总的动作个数
        self.device = cfg.device  # 设备，cpu或gpu等
        self.gamma = cfg.gamma  # 奖励的折扣因子
        # e-greedy策略相关参数
@@ -81,8 +81,8 @@ class DQN:
            (cfg.epsilon_start - cfg.epsilon_end) * \
            math.exp(-1. * frame_idx / cfg.epsilon_decay)
        self.batch_size = cfg.batch_size
-        self.policy_net = CNN(state_dim, action_dim).to(self.device)
-        self.target_net = CNN(state_dim, action_dim).to(self.device)
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
            target_param.data.copy_(param.data)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
@@ -94,11 +94,12 @@ class DQN:
        self.frame_idx += 1
        if random.random() > self.epsilon(self.frame_idx):
            with torch.no_grad():
+                print(type(state))
                state = torch.tensor([state], device=self.device, dtype=torch.float32)
                q_values = self.policy_net(state)
                action = q_values.max(1)[1].item() # 选择Q值最大的动作
        else:
-            action = random.randrange(self.action_dim)
+            action = random.randrange(self.n_actions)
        return action
    def update(self):
        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
@@ -0,0 +1,142 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.autograd as autograd 
+import random
+import math
+import numpy as np
+class CNN(nn.Module):
+    """Convolutional Q-network: two conv layers followed by two linear layers.
+
+    Args:
+        n_frames: number of input channels given to the first convolution.
+        n_actions: size of the output vector (one value per action).
+
+    NOTE(review): ``fc1`` hard-codes 3200 input features, i.e. 32 channels
+    x 10 x 10 after the two convolutions, which corresponds to an 80x80
+    input -- confirm against the observation wrapper that feeds this net.
+    """
+
+    def __init__(self, n_frames, n_actions):
+        super().__init__()
+        self.n_frames, self.n_actions = n_frames, n_actions
+
+        # Layers are created in the same order as before so that seeded
+        # parameter initialisation and state_dict keys stay identical.
+        self.conv1 = nn.Conv2d(n_frames, 16, kernel_size=8, stride=4, padding=2)
+        self.conv2 = nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1)
+        self.fc1 = nn.Linear(3200, 256)
+        self.fc2 = nn.Linear(256, n_actions)
+
+        # Activation module shared by every hidden layer
+        self.relu = nn.ReLU()
+
+    def flatten(self, x):
+        """Collapse every dimension after the first (batch) one."""
+        return x.view(x.size(0), -1)
+
+    def forward(self, x):
+        """Return the network output for the batch ``x``."""
+        hidden = self.relu(self.conv1(x))
+        hidden = self.relu(self.conv2(hidden))
+        hidden = self.relu(self.fc1(self.flatten(hidden)))
+        return self.fc2(hidden)  # no activation on the output layer
+
+class ReplayBuffer:
+    """Fixed-capacity ring buffer of transitions used for experience replay."""
+
+    def __init__(self, capacity):
+        self.capacity = capacity  # maximum number of stored transitions
+        self.buffer = []  # storage; grows until it holds `capacity` items
+        self.position = 0  # slot the next push() writes to
+
+    def push(self, state, action, reward, next_state, done):
+        """Store one transition; once full, the oldest entry is overwritten."""
+        transition = (state, action, reward, next_state, done)
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)  # reserve the slot filled just below
+        self.buffer[self.position] = transition
+        self.position = (self.position + 1) % self.capacity
+
+    def sample(self, batch_size):
+        """Draw ``batch_size`` transitions at random and return one tuple per field."""
+        picked = random.sample(self.buffer, batch_size)
+        states, actions, rewards, next_states, dones = zip(*picked)
+        return states, actions, rewards, next_states, dones
+
+    def __len__(self):
+        """Number of transitions currently stored."""
+        return len(self.buffer)
+
+class DQN:
+    """DQN agent built on ``CNN`` Q-networks and a ``ReplayBuffer``.
+
+    Args:
+        n_states: passed to ``CNN`` as its number of input channels.
+        n_actions: number of discrete actions (size of the network output).
+        cfg: configuration object providing ``device``, ``gamma``,
+            ``epsilon_start``, ``epsilon_end``, ``epsilon_decay``,
+            ``batch_size``, ``lr`` and ``memory_capacity``.
+    """
+
+    def __init__(self, n_states, n_actions, cfg):
+        self.n_actions = n_actions  # total number of actions
+        self.device = cfg.device  # torch device (cpu / cuda)
+        self.gamma = cfg.gamma  # reward discount factor
+        # Epsilon-greedy schedule: exponential decay from epsilon_start towards
+        # epsilon_end.  cfg is read on every call, so changing cfg.epsilon_*
+        # afterwards changes the schedule.
+        self.frame_idx = 0  # number of choose_action() calls so far
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
+            (cfg.epsilon_start - cfg.epsilon_end) * \
+            math.exp(-1. * frame_idx / cfg.epsilon_decay)
+        self.batch_size = cfg.batch_size
+        self.policy_net = CNN(n_states, n_actions).to(self.device)
+        self.target_net = CNN(n_states, n_actions).to(self.device)
+        # The target network starts as an exact copy of the policy network.
+        self.target_net.load_state_dict(self.policy_net.state_dict())
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
+        self.memory = ReplayBuffer(cfg.memory_capacity)  # experience replay
+
+    def choose_action(self, state):
+        """Pick an action epsilon-greedily and advance the decay counter."""
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                # Go through numpy and add the batch axis with unsqueeze;
+                # torch.tensor([ndarray]) takes a slow list-conversion path.
+                state = torch.tensor(np.asarray(state), device=self.device,
+                                     dtype=torch.float32).unsqueeze(dim=0)
+                q_values = self.policy_net(state)
+                action = q_values.max(1)[1].item()  # index of the largest Q-value
+        else:
+            action = random.randrange(self.n_actions)
+        return action
+
+    def update(self):
+        """Run one optimisation step on a batch sampled from replay memory.
+
+        Returns without doing anything while the memory holds fewer than
+        ``batch_size`` transitions.
+        """
+        if len(self.memory) < self.batch_size:
+            return
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = \
+            self.memory.sample(self.batch_size)
+        # Stack each tuple of observations into one ndarray first: building a
+        # tensor directly from a tuple of arrays is very slow.
+        state_batch = torch.tensor(np.array(state_batch), device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
+        next_state_batch = torch.tensor(np.array(next_state_batch), device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
+        # Q(s_t, a_t) for the actions that were actually taken
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
+        # max_a Q_target(s_{t+1}, a), detached so no gradient reaches the target net
+        next_q_values = self.target_net(next_state_batch).max(1)[0].detach()
+        # For terminal transitions (done == 1) the target reduces to the reward.
+        expected_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error
+        self.optimizer.zero_grad()
+        loss.backward()
+        for param in self.policy_net.parameters():  # clip gradients to [-1, 1]
+            if param.grad is not None:  # parameters without a gradient are skipped
+                param.grad.data.clamp_(-1, 1)
+        self.optimizer.step()
+
+    def save(self, path):
+        """Write the target network weights to ``path + 'dqn_checkpoint.pth'``."""
+        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
+
+    def load(self, path):
+        """Load the weights written by ``save`` into both networks."""
+        # map_location lets a checkpoint saved on GPU be restored on a CPU host.
+        state_dict = torch.load(path+'dqn_checkpoint.pth', map_location=self.device)
+        self.target_net.load_state_dict(state_dict)
+        self.policy_net.load_state_dict(self.target_net.state_dict())
@@ -1,5 +1,7 @@
 import sys
 import os
+import torch.nn as nn
+import torch.nn.functional as F
 curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
 parent_path = os.path.dirname(curr_path)  # 父路径
 sys.path.append(parent_path)  # 添加路径到系统路径
@@ -8,26 +10,42 @@ import gym
 import torch
 import datetime
 import numpy as np
-from common.utils import save_results, make_dir
+from common.utils import save_results_1, make_dir
 from common.utils import plot_rewards
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间

+class MLP(nn.Module):
+    """Fully connected Q-network with two hidden layers.
+
+    Args:
+        n_states: number of input features.
+        n_actions: number of outputs.
+        hidden_dim: width of both hidden layers.
+    """
+
+    def __init__(self, n_states,n_actions,hidden_dim=128):
+        super().__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+
+    def forward(self, x):
+        """Apply ReLU after each of the first two layers; the last layer is linear."""
+        for layer in (self.fc1, self.fc2):
+            x = F.relu(layer(x))
+        return self.fc3(x)

 class Config:
    '''超参数
    '''

    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DQN'  # 算法名称
-        self.env_name = 'CartPole-v0'  # 环境名称
+        ############################### hyperparameters ################################
+        self.algo_name = 'DQN'  # algorithm name
+        self.env_name = 'CartPole-v0'  # environment name
        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
        self.seed = 10 # 随机种子，置0则不设置随机种子
        self.train_eps = 200  # 训练的回合数
-        self.test_eps = 30  # 测试的回合数
+        self.test_eps = 20  # 测试的回合数
        ################################################################################
        
        ################################## 算法超参数 ###################################
@@ -41,8 +59,8 @@ class Config:
        self.target_update = 4  # 目标网络的更新频率
        self.hidden_dim = 256  # 网络隐藏层
        ################################################################################
-
-        ################################# 保存结果相关参数 ##############################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # 保存结果的路径
        self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -55,9 +73,11 @@ def env_agent_config(cfg):
    ''' 创建环境和智能体
    '''
    env = gym.make(cfg.env_name)  # 创建环境
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    model = MLP(n_states,n_actions)
+    agent = DQN(n_actions, model, cfg)  # 创建智能体
    if cfg.seed !=0: # 设置随机种子
        torch.manual_seed(cfg.seed)
        env.seed(cfg.seed)
@@ -72,10 +92,13 @@ def train(cfg, env, agent):
    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step += 1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            agent.memory.push(state, action, reward,
@@ -87,16 +110,18 @@ def train(cfg, env, agent):
                break
        if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 10 == 0:
-            print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
-    print('完成训练！')
+        if (i_ep + 1) % 1 == 0:
+            print(f'Episode：{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('Finish training!')
    env.close()
-    return rewards, ma_rewards
+    res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
+    return res_dic


 def test(cfg, env, agent):
@@ -108,41 +133,45 @@ def test(cfg, env, agent):
    ################################################################################
    rewards = []  # 记录所有回合的奖励
    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    steps = []
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # 记录一回合内的奖励
+        ep_step = 0
        state = env.reset()  # 重置环境，返回初始状态
        while True:
+            ep_step+=1
            action = agent.choose_action(state)  # 选择动作
            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
            state = next_state  # 更新下一个状态
            ep_reward += reward  # 累加奖励
            if done:
                break
+        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
        else:
            ma_rewards.append(ep_reward)
-        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+        print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
    print('完成测试！')
    env.close()
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}


 if __name__ == "__main__":
    cfg = Config()
    # 训练
    env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
    agent.save(path=cfg.model_path)  # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
+    save_results_1(res_dic, tag='train',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")  # 画出结果
    # 测试
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)  # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
+    res_dic = test(cfg, env, agent)
+    save_results_1(res_dic, tag='test',
                 path=cfg.result_path)  # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test")  # 画出结果
@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 11:40:44
+LastEditTime: 2022-02-10 06:17:41
 Discription: 使用 Nature DQN 训练 CartPole-v1
 '''
 import sys
@@ -19,7 +19,7 @@ import torch
 import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = "DQN"  # 算法名称
@@ -66,9 +66,9 @@ def env_agent_config(cfg, seed=1):
    '''
    env = gym.make(cfg.env_name)  # 创建环境
    env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
    return env, agent

 def train(cfg, env, agent):
@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-12-22 11:14:17
 LastEditor: JiangJi
-LastEditTime: 2021-12-22 15:27:48
+LastEditTime: 2022-02-10 06:17:46
 Discription: 使用 DQN-cnn  训练 PongNoFrameskip-v4
 '''
 import sys
@@ -20,7 +20,7 @@ import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, plot_rewards_cn
 from common.atari_wrappers import make_atari, wrap_deepmind
-from DQN.dqn import DQN
+from dqn import DQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
 algo_name = 'DQN-cnn'  # 算法名称
@@ -68,9 +68,9 @@ def env_agent_config(cfg, seed=1):
    # env    = wrap_deepmind(env)
    # env    = wrap_pytorch(env) 
    env.seed(seed)  # 设置随机种子
-    state_dim = env.observation_space.shape[0]  # 状态维度
-    action_dim = env.action_space.n  # 动作维度
-    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
    return env, agent

 def train(cfg, env, agent):
@@ -0,0 +1,180 @@
+import sys
+import os
+import torch.nn as nn
+import torch.nn.functional as F
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+import numpy as np
+from common.utils import save_results_1, make_dir
+from common.utils import plot_rewards
+from dqn_1 import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+class MLP(nn.Module):
+    """Fully connected Q-network with three hidden layers.
+
+    Args:
+        n_states: number of input features.
+        n_actions: number of outputs.
+        hidden_dim: width of every hidden layer.
+    """
+
+    def __init__(self, n_states,n_actions,hidden_dim=256):
+        super().__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc4 = nn.Linear(hidden_dim, n_actions)  # output layer
+
+    def forward(self, x):
+        """Apply ReLU after each of the first three layers; the last layer is linear."""
+        for layer in (self.fc1, self.fc2, self.fc3):
+            x = F.relu(layer(x))
+        return self.fc4(x)
+
+class Config:
+    """Hyperparameters and output paths for the DQN experiment."""
+
+    def __init__(self):
+        # ----- environment / run settings -----
+        self.algo_name = 'DQN'  # algorithm name
+        # alternative environment name: 'Breakout-ram-v0'
+        self.env_name = 'ALE/Pong-ram-v5'  # environment name
+        device_name = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = torch.device(device_name)  # use the GPU when one is available
+        self.seed = 10  # random seed; 0 means no seed is set
+        self.train_eps = 5  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+
+        # ----- algorithm hyperparameters -----
+        self.gamma = 0.99  # discount factor
+        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
+        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_decay = 500000  # decay rate of epsilon
+        self.lr = 0.00025  # learning rate
+        self.memory_capacity = int(5e4)  # replay memory capacity
+        self.batch_size = 32  # mini-batch size
+        self.target_update = 4  # target-network update frequency
+        self.hidden_dim = 512  # hidden layer width
+
+        # ----- where results and models are written -----
+        run_dir = curr_path + "/outputs/" + self.env_name + '/' + curr_time
+        self.result_path = run_dir + '/results/'  # directory for results
+        self.model_path = run_dir + '/models/'  # directory for saved models
+        self.save = True  # whether to save figures
+
+
+def env_agent_config(cfg):
+    """Create the environment and the agent described by ``cfg``.
+
+    Args:
+        cfg: configuration object; reads ``env_name``, ``seed`` and, when
+            present, ``hidden_dim``, and is forwarded to ``DQN``.
+
+    Returns:
+        Tuple ``(env, agent)``.
+    """
+    seeded = cfg.seed != 0  # seed 0 means "leave everything unseeded"
+    if seeded:
+        # Seed before the network is built, otherwise its random weight
+        # initialisation is not covered by the seed.
+        torch.manual_seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    env = gym.make(cfg.env_name)  # create the environment
+    if seeded:
+        env.seed(cfg.seed)
+    n_states = env.observation_space.shape[0]  # first dimension of the observation shape
+    n_actions = env.action_space.n  # number of discrete actions
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    # Honour the configured hidden width; fall back to MLP's own default
+    # when the config does not define one.
+    mlp_kwargs = {'hidden_dim': cfg.hidden_dim} if hasattr(cfg, 'hidden_dim') else {}
+    model = MLP(n_states, n_actions, **mlp_kwargs)
+    agent = DQN(n_states, n_actions, model, cfg)  # create the agent
+    return env, agent
+
+
+def train(cfg, env, agent):
+    """Train ``agent`` on ``env`` for ``cfg.train_eps`` episodes.
+
+    Every step stores the transition in ``agent.memory`` and calls
+    ``agent.update()``; every ``cfg.target_update`` episodes the target
+    network is overwritten with the policy network's weights.
+
+    Returns:
+        dict with keys ``'rewards'``, ``'ma_rewards'`` (exponential moving
+        average, factor 0.9) and ``'steps'``, one list entry per episode.
+    """
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards, ma_rewards, steps = [], [], []
+    for i_ep in range(cfg.train_eps):
+        ep_reward, ep_step = 0, 0  # per-episode return and step count
+        state = env.reset()
+        done = False
+        while not done:
+            ep_step += 1
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            agent.memory.push(state, action, reward, next_state, done)
+            state = next_state
+            agent.update()
+            ep_reward += reward
+        if (i_ep + 1) % cfg.target_update == 0:  # sync the target network
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        steps.append(ep_step)
+        rewards.append(ep_reward)
+        # The first episode seeds the moving average with its own reward.
+        smoothed = 0.9 * ma_rewards[-1] + 0.1 * ep_reward if ma_rewards else ep_reward
+        ma_rewards.append(smoothed)
+        print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('完成训练！')
+    env.close()
+    return {'rewards': rewards, 'ma_rewards': ma_rewards, 'steps': steps}
+
+
+def test(cfg, env, agent):
+    """Run ``agent`` on ``env`` for ``cfg.test_eps`` episodes without learning.
+
+    Sets ``cfg.epsilon_start`` and ``cfg.epsilon_end`` to 0.0 before running,
+    because the epsilon-greedy exploration is not wanted during testing.
+
+    Returns:
+        dict with keys ``'rewards'``, ``'ma_rewards'`` (exponential moving
+        average, factor 0.9) and ``'steps'``, one list entry per episode.
+    """
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    # Switch exploration off on the shared config object.
+    cfg.epsilon_start = 0.0
+    cfg.epsilon_end = 0.0
+    rewards, ma_rewards, steps = [], [], []
+    for i_ep in range(cfg.test_eps):
+        ep_reward, ep_step = 0, 0  # per-episode return and step count
+        state = env.reset()
+        done = False
+        while not done:
+            ep_step += 1
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            state = next_state
+            ep_reward += reward
+        steps.append(ep_step)
+        rewards.append(ep_reward)
+        # The first episode seeds the moving average with its own reward.
+        smoothed = ma_rewards[-1] * 0.9 + ep_reward * 0.1 if ma_rewards else ep_reward
+        ma_rewards.append(smoothed)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return {'rewards': rewards, 'ma_rewards': ma_rewards, 'steps': steps}
+
+
+if __name__ == "__main__":
+    cfg = Config()
+
+    # --- training phase ---
+    env, agent = env_agent_config(cfg)
+    train_res = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the result and model folders
+    agent.save(path=cfg.model_path)  # save the trained model
+    save_results_1(train_res, tag='train', path=cfg.result_path)  # save the results
+    plot_rewards(train_res['rewards'], train_res['ma_rewards'], cfg, tag="train")  # plot the results
+
+    # --- test phase: new environment and agent, weights loaded from the saved model ---
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # load the saved model
+    test_res = test(cfg, env, agent)
+    save_results_1(test_res, tag='test', path=cfg.result_path)  # save the results
+    plot_rewards(test_res['rewards'], test_res['ma_rewards'], cfg, tag="test")  # plot the results
@@ -0,0 +1,149 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+import numpy as np
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+from dqn import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+
+class Config:
+    '''超参数
+    '''
+
+    def __init__(self):
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DQN'  # 算法名称
+        self.env_name = 'SpaceInvaders-ram-v0'  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+        self.seed = 10 # 随机种子，置0则不设置随机种子
+        self.train_eps = 200  # 训练的回合数
+        self.test_eps = 30  # 测试的回合数
+        ################################################################################
+        
+        ################################## 算法超参数 ###################################
+        self.gamma = 0.99  # 强化学习中的折扣因子
+        self.epsilon_start = 0.95  # e-greedy策略中初始epsilon
+        self.epsilon_end = 0.01  # e-greedy策略中的终止epsilon
+        self.epsilon_decay = 20000  # e-greedy策略中epsilon的衰减率
+        self.lr = 2e-4  # 学习率
+        self.memory_capacity = int(1e5)  # 经验回放的容量
+        self.batch_size = 32  # mini-batch SGD中的批量大小
+        self.target_update = 4  # 目标网络的更新频率
+        self.hidden_dim = 512  # 网络隐藏层
+        ################################################################################
+        
+        ################################# 保存结果相关参数 ################################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
+        self.save = True # 是否保存图片
+        ################################################################################
+
+
+def env_agent_config(cfg):
+    ''' 创建环境和智能体
+    '''
+    env = gym.make(cfg.env_name)  # 创建环境
+    n_states = env.observation_space.shape[0]  # 状态维度
+    n_actions = env.action_space.n  # 动作维度
+    print(f"n states: {n_states}, n actions: {n_actions}")
+    agent = DQN(n_states, n_actions, cfg)  # 创建智能体
+    if cfg.seed !=0: # 设置随机种子
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    return env, agent
+
+
+def train(cfg, env, agent):
+    ''' 训练
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # 记录一回合内的奖励
+        state = env.reset()  # 重置环境，返回初始状态
+        while True:
+            action = agent.choose_action(state)  # 选择动作
+            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
+            agent.memory.push(state, action, reward,
+                              next_state, done)  # 保存transition
+            state = next_state  # 更新下一个状态
+            agent.update()  # 更新智能体
+            ep_reward += reward  # 累加奖励
+            if done:
+                break
+        if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep + 1) % 1 == 0:
+            print(f'Episode：{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}')
+    print('完成训练！')
+    env.close()
+    return rewards, ma_rewards
+
+
+def test(cfg, env, agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    ############# 由于测试不需要使用epsilon-greedy策略，所以相应的值设置为0 ###############
+    cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
+    cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
+    ################################################################################
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0  # 记录一回合内的奖励
+        state = env.reset()  # 重置环境，返回初始状态
+        while True:
+            action = agent.choose_action(state)  # 选择动作
+            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
+            state = next_state  # 更新下一个状态
+            ep_reward += reward  # 累加奖励
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return rewards, ma_rewards
+
+
+if __name__ == "__main__":
+    # Script entry point: train an agent, persist it, then evaluate a freshly
+    # built agent that loads the saved model.
+    cfg = Config()
+    # ---- training ----
+    env, agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the output folders for results and models
+    agent.save(path=cfg.model_path)  # save the trained model
+    save_results(rewards, ma_rewards, tag='train',
+                 path=cfg.result_path)  # save the training results
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the training curves
+    # ---- testing ----
+    env, agent = env_agent_config(cfg)  # new environment and agent for evaluation
+    agent.load(path=cfg.model_path)  # load the model saved above
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # save the test results
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the test curves
@@ -0,0 +1,184 @@
+import random
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+import os
+import gym
+import time
+from collections import deque
+from tensorflow.keras import optimizers
+from keras.models import Sequential
+from keras.layers import Dense, Dropout
+from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
+import matplotlib.pyplot as plt
+
+class DQN:
+    def __init__(self, env):
+        self.env = env
+        self.memory = deque(maxlen=400000)
+        self.gamma = 0.99
+        self.epsilon = 1.0
+        self.epsilon_min = 0.01
+        self.epsilon_decay =  self.epsilon_min / 500000
+        
+        self.batch_size = 32
+        self.train_start = 1000
+        self.state_size = self.env.observation_space.shape[0]*4
+        self.action_size = self.env.action_space.n
+        self.learning_rate = 0.00025
+        
+        self.evaluation_model = self.create_model()
+        self.target_model = self.create_model()
+        
+    def create_model(self):
+        model = Sequential()
+        model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
+        model.add(Dense(128*2, activation='relu'))
+        model.add(Dense(128*2, activation='relu'))
+        model.add(Dense(self.env.action_space.n, activation='linear'))
+        model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))
+        return model
+    
+    def choose_action(self, state, steps):
+        if steps > 50000:
+            if self.epsilon > self.epsilon_min:
+                self.epsilon -= self.epsilon_decay
+        if np.random.random() < self.epsilon:
+            return self.env.action_space.sample()
+        return np.argmax(self.evaluation_model.predict(state)[0])
+        
+    def remember(self, cur_state, action, reward, new_state, done):
+        if not hasattr(self, 'memory_counter'):
+            self.memory_counter = 0
+        
+        transition = (cur_state, action, reward, new_state, done)
+        self.memory.extend([transition])
+        
+        self.memory_counter += 1
+    
+    def replay(self):
+        if len(self.memory) < self.train_start:
+            return
+        
+        mini_batch = random.sample(self.memory, self.batch_size)
+        
+        update_input = np.zeros((self.batch_size, self.state_size))
+        update_target = np.zeros((self.batch_size, self.action_size))
+        
+        for i in range(self.batch_size):
+            state, action, reward, new_state, done = mini_batch[i]
+            target = self.evaluation_model.predict(state)[0]
+        
+            if done:
+                target[action] = reward
+            else:
+                target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])
+            
+            update_input[i] = state
+            update_target[i] = target
+    
+        self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
+    
+    def target_train(self):
+        self.target_model.set_weights(self.evaluation_model.get_weights())
+        return
+    
+    def visualize(self, reward, episode):
+        plt.plot(episode, reward, 'ob-')
+        plt.title('Average reward each 100 episode')
+        plt.ylabel('Reward')
+        plt.xlabel('Episodes')
+        plt.grid()
+        plt.show()
+    
+    def transform(self,state):
+        if state.shape[1]==512:
+            return state
+        a=[np.binary_repr(x,width=8) for x in state[0]]
+        res=[]
+        for x in a:
+            res.extend([x[:2],x[2:4],x[4:6],x[6:]])
+        res=[int(x,2) for x in res]
+        return np.array(res)
+        
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+def main():
+    # env = gym.make('Breakout-ram-v0')
+    env = gym.make('Breakout-ram-v0')
+    env = env.unwrapped
+    
+    print(env.action_space)
+    print(env.observation_space.shape[0])
+    print(env.observation_space.high)
+    print(env.observation_space.low)
+    
+    #print(env.observation_space.shape)
+    
+    
+    episodes = 5000
+    trial_len = 10000
+    
+    tmp_reward=0
+    sum_rewards = 0
+    n_success = 0
+    total_steps = 0
+    
+    graph_reward = []
+    graph_episodes = []
+    time_record = []
+    
+    dqn_agent = DQN(env=env)
+    for i_episode in range(episodes):
+        start_time = time.time()
+        total_reward = 0
+        cur_state = env.reset().reshape(1,128)
+        cur_state=dqn_agent.transform(cur_state).reshape(1,128*4)/4
+        i_step=0
+        for step in range(trial_len):
+            #env.render()
+            i_step+=1
+            action = dqn_agent.choose_action(cur_state, total_steps)
+            new_state, reward, done, _ = env.step(action)
+            new_state = new_state.reshape(1, 128)
+            new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
+            total_reward += reward
+            sum_rewards += reward
+            tmp_reward += reward
+            if reward>0:    #Testing whether it is good.
+                reward=1
+            
+            dqn_agent.remember(cur_state, action, reward, new_state, done)
+            if total_steps > 10000:
+                if total_steps%4 == 0:
+                    dqn_agent.replay()
+                if total_steps%5000 == 0:
+                    dqn_agent.target_train()
+            
+            cur_state = new_state
+            total_steps += 1
+            if done:
+                env.reset()
+                break
+        if (i_episode+1) % 100 == 0:
+            graph_reward.append(sum_rewards/100)
+            graph_episodes.append(i_episode+1)
+            sum_rewards = 0
+            print("Episode ",i_episode+1," Reward: ")
+            print(graph_reward[-1])
+        end_time = time.time()
+        time_record.append(end_time-start_time)
+        print("NOW in episode: " + str(i_episode))
+        print("Time cost: " + str(end_time-start_time))
+        print("Reward: ",tmp_reward)
+        print("Step:", i_step)
+        tmp_reward=0
+    print("Reward: ")
+    print(graph_reward)
+    print("Episode: ")
+    print(graph_episodes)
+    print("Average_time: ")
+    print(sum(time_record)/5000)
+    dqn_agent.visualize(graph_reward, graph_episodes)
+    
+if __name__ == '__main__':
+    # Run the training loop only when executed as a script, not on import.
+    main()