update projects
This commit is contained in:
215
projects/codes/RainbowDQN/rainbow_dqn.py
Normal file
215
projects/codes/RainbowDQN/rainbow_dqn.py
Normal file
@@ -0,0 +1,215 @@
|
||||
import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
|
||||
class ReplayBuffer:
    """Fixed-capacity cyclic buffer of transitions for experience replay."""

    def __init__(self, capacity):
        self.capacity = capacity  # maximum number of stored transitions
        self.buffer = []          # underlying storage (grows up to capacity)
        self.position = 0         # next write index; wraps around when full

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest one once full."""
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) == self.capacity:
            self.buffer[self.position] = transition
        else:
            self.buffer.append(transition)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw a random mini-batch and unzip it into per-field tuples."""
        picked = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*picked)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
|
||||
class NoisyLinear(nn.Module):
    """Linear layer with learnable Gaussian noise on weights and biases
    (NoisyNet, factorised-noise variant).

    In training mode the effective parameters are ``mu + sigma * epsilon``;
    in eval mode only the ``mu`` parameters are used (deterministic layer).
    """

    def __init__(self, input_dim, output_dim, device, std_init=0.4):
        super(NoisyLinear, self).__init__()

        self.device = device      # kept for backward compatibility; no longer used
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.std_init = std_init  # initial scale of the sigma parameters

        # learnable mean/std of the weight plus a non-learnable noise buffer
        self.weight_mu = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
        self.weight_sigma = nn.Parameter(torch.FloatTensor(output_dim, input_dim))
        self.register_buffer('weight_epsilon', torch.FloatTensor(output_dim, input_dim))

        # same triple for the bias
        self.bias_mu = nn.Parameter(torch.FloatTensor(output_dim))
        self.bias_sigma = nn.Parameter(torch.FloatTensor(output_dim))
        self.register_buffer('bias_epsilon', torch.FloatTensor(output_dim))

        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        # Registered buffers travel with the module on .to()/.cuda(), so no
        # manual device transfer is needed here.  (The original
        # `if self.device:` test was truthy even for torch.device('cpu') and
        # then crashed calling .cuda() on CUDA-less hosts.)
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:
            weight = self.weight_mu
            bias = self.bias_mu
        return F.linear(x, weight, bias)

    def reset_parameters(self):
        """Initialise mu uniformly in ±1/sqrt(fan_in), sigma to std_init/sqrt(fan)."""
        mu_range = 1 / math.sqrt(self.weight_mu.size(1))

        self.weight_mu.data.uniform_(-mu_range, mu_range)
        self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1)))

        self.bias_mu.data.uniform_(-mu_range, mu_range)
        self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0)))

    def reset_noise(self):
        """Resample factorised noise: eps_weight = f(eps_out) outer f(eps_in)."""
        epsilon_in = self._scale_noise(self.input_dim)
        epsilon_out = self._scale_noise(self.output_dim)

        self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in))  # outer product
        self.bias_epsilon.copy_(self._scale_noise(self.output_dim))

    def _scale_noise(self, size):
        """Return f(x) = sign(x) * sqrt(|x|) for a standard-normal sample."""
        x = torch.randn(size)
        return x.sign().mul(x.abs().sqrt())
|
||||
|
||||
class RainbowModel(nn.Module):
    """Dueling, distributional (C51) Q-network with NoisyNet layers.

    forward() returns, for each action, a probability distribution over
    n_atoms fixed support values in [Vmin, Vmax].
    """

    def __init__(self, n_states, n_actions, n_atoms, Vmin, Vmax, device=None):
        super(RainbowModel, self).__init__()

        self.n_states = n_states
        self.n_actions = n_actions
        self.n_atoms = n_atoms  # number of support atoms of the value distribution
        self.Vmin = Vmin        # smallest support value
        self.Vmax = Vmax        # largest support value

        # shared feature trunk
        self.linear1 = nn.Linear(n_states, 32)
        self.linear2 = nn.Linear(32, 64)

        # value stream (a single distribution).  `device` is now an explicit
        # keyword argument — the original referenced an undefined global
        # `device`, which raised NameError at construction time.
        self.noisy_value1 = NoisyLinear(64, 64, device=device)
        self.noisy_value2 = NoisyLinear(64, self.n_atoms, device=device)

        # advantage stream (one distribution per action)
        self.noisy_advantage1 = NoisyLinear(64, 64, device=device)
        self.noisy_advantage2 = NoisyLinear(64, self.n_atoms * self.n_actions, device=device)

    def forward(self, x):
        """Return action-value distributions of shape (batch, n_actions, n_atoms)."""
        batch_size = x.size(0)

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))

        value = F.relu(self.noisy_value1(x))
        value = self.noisy_value2(value)

        advantage = F.relu(self.noisy_advantage1(x))
        advantage = self.noisy_advantage2(advantage)

        value = value.view(batch_size, 1, self.n_atoms)
        advantage = advantage.view(batch_size, self.n_actions, self.n_atoms)

        # dueling aggregation, then a softmax over the atom axis so each
        # action's vector is a proper probability distribution (explicit dim:
        # implicit-dim softmax is deprecated)
        x = value + advantage - advantage.mean(1, keepdim=True)
        x = F.softmax(x.view(-1, self.n_atoms), dim=1).view(-1, self.n_actions, self.n_atoms)

        return x

    def reset_noise(self):
        """Resample the noise of every NoisyLinear layer."""
        self.noisy_value1.reset_noise()
        self.noisy_value2.reset_noise()
        self.noisy_advantage1.reset_noise()
        self.noisy_advantage2.reset_noise()

    def act(self, state):
        """Greedy action for a single (unbatched) state."""
        with torch.no_grad():  # replaces the removed Variable(..., volatile=True)
            state = torch.FloatTensor(state).unsqueeze(0)
            dist = self.forward(state).cpu()
        # expected value per action = sum over atoms of p(z) * z
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.n_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action
|
||||
|
||||
class RainbowDQN(nn.Module):
    """Rainbow DQN agent: distributional (C51) learning, NoisyNet exploration,
    dueling architecture, a target network and experience replay."""

    def __init__(self, n_states, n_actions, n_atoms, Vmin, Vmax, cfg):
        super(RainbowDQN, self).__init__()
        self.n_states = n_states
        self.n_actions = n_actions
        # NOTE(review): the positional n_atoms/Vmin/Vmax are used to build the
        # models while self.* is read from cfg — callers should pass matching
        # values; confirm cfg mirrors the positional arguments.
        self.n_atoms = cfg.n_atoms
        self.Vmin = cfg.Vmin
        self.Vmax = cfg.Vmax
        # discount factor from config when present (was hard-coded 0.99 inside
        # projection_distribution)
        self.gamma = getattr(cfg, 'gamma', 0.99)
        self.policy_model = RainbowModel(n_states, n_actions, n_atoms, Vmin, Vmax)
        self.target_model = RainbowModel(n_states, n_actions, n_atoms, Vmin, Vmax)
        self.batch_size = cfg.batch_size
        self.memory = ReplayBuffer(cfg.memory_capacity)  # experience replay
        # use the configured learning rate when available (0.001 was hard-coded)
        self.optimizer = optim.Adam(self.policy_model.parameters(),
                                    lr=getattr(cfg, 'lr', 0.001))

    def choose_action(self, state):
        """Greedy action w.r.t. the expected value of the policy distribution."""
        with torch.no_grad():  # replaces the removed Variable(..., volatile=True)
            state = torch.FloatTensor(state).unsqueeze(0)
            dist = self.policy_model(state).cpu()
        dist = dist * torch.linspace(self.Vmin, self.Vmax, self.n_atoms)
        action = dist.sum(2).max(1)[1].numpy()[0]
        return action

    def projection_distribution(self, next_state, rewards, dones):
        """Project the target distribution T z onto the fixed support (C51).

        Returns a (batch, n_atoms) tensor of projected probabilities for the
        greedy next action of the target network.
        """
        delta_z = float(self.Vmax - self.Vmin) / (self.n_atoms - 1)
        support = torch.linspace(self.Vmin, self.Vmax, self.n_atoms)

        next_dist = self.target_model(next_state).data.cpu()
        # pick a* = argmax_a E[Z(s', a)] from the support-weighted values ...
        next_action = (next_dist * support).sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2))
        # ... but project the RAW probabilities of that action.  (The original
        # gathered from the support-weighted tensor, which is not a
        # probability distribution and skews the projection.)
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        # Bellman-backed support, clipped to [Vmin, Vmax]
        Tz = rewards + (1 - dones) * self.gamma * support
        Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)
        b = (Tz - self.Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()
        # when b lands exactly on an atom, l == u and both index_add_ terms
        # below would contribute zero; nudge one bound so the full mass is kept
        l[(u > 0) & (l == u)] -= 1
        u[(l < (self.n_atoms - 1)) & (l == u)] += 1

        # flat-index offsets so index_add_ can scatter the whole batch at once
        offset = torch.linspace(0, (self.batch_size - 1) * self.n_atoms, self.batch_size).long()\
            .unsqueeze(1).expand(self.batch_size, self.n_atoms)

        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1))

        return proj_dist

    def update(self):
        """One gradient step of the distributional loss on a sampled batch."""
        if len(self.memory) < self.batch_size:  # wait until a full batch exists
            return
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)

        state = torch.FloatTensor(np.float32(state))
        action = torch.LongTensor(action)
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        # the target distribution carries no gradient
        with torch.no_grad():
            next_state = torch.FloatTensor(np.float32(next_state))
            proj_dist = self.projection_distribution(next_state, reward, done)

        dist = self.policy_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(self.batch_size, 1, self.n_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)  # keep log() away from 0
        loss = -(proj_dist * dist.log()).sum(1).mean()  # cross-entropy vs projection

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # resample the factorised noise after each update (NoisyNet)
        self.policy_model.reset_noise()
        self.target_model.reset_noise()
|
||||
|
||||
177
projects/codes/RainbowDQN/task0.py
Normal file
177
projects/codes/RainbowDQN/task0.py
Normal file
@@ -0,0 +1,177 @@
|
||||
import sys
|
||||
import os
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||
parent_path = os.path.dirname(curr_path) # 父路径
|
||||
sys.path.append(parent_path) # 添加路径到系统路径
|
||||
|
||||
import gym
|
||||
import torch
|
||||
import datetime
|
||||
import numpy as np
|
||||
from common.utils import save_results_1, make_dir
|
||||
from common.utils import plot_rewards
|
||||
from dqn import DQN
|
||||
|
||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||
|
||||
class MLP(nn.Module):
    """Fully connected Q-network with two hidden layers.

    n_states: size of the observation vector (input features).
    n_actions: number of discrete actions (output size).
    hidden_dim: width of both hidden layers.
    """

    def __init__(self, n_states, n_actions, hidden_dim=128):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)    # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions)   # output layer (Q-values)

    def forward(self, x):
        """Map a batch of states to per-action Q-values (no final activation)."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)
|
||||
|
||||
class Config:
    """Hyperparameters and output paths for training/testing the agent."""

    def __init__(self):
        # ------------------------- general -------------------------
        self.algo_name = 'DQN'  # algorithm name
        self.env_name = 'CartPole-v0'  # environment name
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
        self.seed = 10        # random seed; 0 disables seeding
        self.train_eps = 200  # number of training episodes
        self.test_eps = 20    # number of test episodes

        # ------------------------- algorithm -------------------------
        self.gamma = 0.95              # discount factor
        self.epsilon_start = 0.90      # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01        # final epsilon of the e-greedy policy
        self.epsilon_decay = 500       # epsilon decay rate
        self.lr = 0.0001               # learning rate
        self.memory_capacity = 100000  # replay buffer capacity
        self.batch_size = 64           # mini-batch size for SGD
        self.target_update = 4         # target network update frequency
        self.hidden_dim = 256          # hidden layer width

        # ------------------------- outputs -------------------------
        base = f"{curr_path}/outputs/{self.env_name}/{curr_time}"
        self.result_path = f"{base}/results/"  # where result files are saved
        self.model_path = f"{base}/models/"    # where model checkpoints are saved
        self.save = True  # whether to save plots
|
||||
|
||||
|
||||
def env_agent_config(cfg):
    """Create the environment and the agent.

    Seeds torch / env / numpy BEFORE constructing the model so that weight
    initialisation is reproducible (the original seeded after creation,
    leaving the network's initial weights unseeded).
    """
    env = gym.make(cfg.env_name)  # create the environment
    if cfg.seed != 0:  # 0 means "do not seed"
        torch.manual_seed(cfg.seed)
        env.seed(cfg.seed)
        np.random.seed(cfg.seed)
    n_states = env.observation_space.shape[0]  # state dimension
    n_actions = env.action_space.n  # number of discrete actions
    print(f"n states: {n_states}, n actions: {n_actions}")
    model = MLP(n_states, n_actions)
    agent = DQN(n_actions, model, cfg)  # create the agent
    return env, agent
|
||||
|
||||
|
||||
def train(cfg, env, agent):
    """Run cfg.train_eps training episodes and return the recorded metrics.

    Returns a dict with 'rewards', 'ma_rewards' (exponential moving average,
    factor 0.9) and 'steps' per episode.
    """
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []     # total reward of each episode
    ma_rewards = []  # moving-average reward of each episode
    steps = []       # number of environment steps of each episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # cumulative reward within this episode
        ep_step = 0
        state = env.reset()  # reset the environment, get initial state
        while True:
            ep_step += 1
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.memory.push(state, action, reward,
                              next_state, done)  # store the transition
            state = next_state
            agent.update()  # one learning step
            ep_reward += reward
            if done:
                break
        if (i_ep + 1) % cfg.target_update == 0:  # sync target network
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        steps.append(ep_step)
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
        # BUG FIX: the progress line previously reported cfg.test_eps as the
        # episode total during training
        print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}')
    print('Finish training!')
    env.close()
    res_dic = {'rewards': rewards, 'ma_rewards': ma_rewards, 'steps': steps}
    return res_dic
|
||||
|
||||
|
||||
def test(cfg, env, agent):
|
||||
print('开始测试!')
|
||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||
############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
|
||||
cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
|
||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||
################################################################################
|
||||
rewards = [] # 记录所有回合的奖励
|
||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||
steps = []
|
||||
for i_ep in range(cfg.test_eps):
|
||||
ep_reward = 0 # 记录一回合内的奖励
|
||||
ep_step = 0
|
||||
state = env.reset() # 重置环境,返回初始状态
|
||||
while True:
|
||||
ep_step+=1
|
||||
action = agent.choose_action(state) # 选择动作
|
||||
next_state, reward, done, _ = env.step(action) # 更新环境,返回transition
|
||||
state = next_state # 更新下一个状态
|
||||
ep_reward += reward # 累加奖励
|
||||
if done:
|
||||
break
|
||||
steps.append(ep_step)
|
||||
rewards.append(ep_reward)
|
||||
if ma_rewards:
|
||||
ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
|
||||
else:
|
||||
ma_rewards.append(ep_reward)
|
||||
print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
|
||||
print('完成测试!')
|
||||
env.close()
|
||||
return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
|
||||
|
||||
|
||||
def main():
    """Train the agent, save results/model, then reload it and evaluate."""
    cfg = Config()
    # ---------------- training ----------------
    env, agent = env_agent_config(cfg)
    train_res = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # create output folders
    agent.save(path=cfg.model_path)            # persist the trained model
    save_results_1(train_res, tag='train',
                   path=cfg.result_path)       # save metrics
    plot_rewards(train_res['rewards'], train_res['ma_rewards'], cfg, tag="train")
    # ---------------- testing ----------------
    env, agent = env_agent_config(cfg)
    agent.load(path=cfg.model_path)            # restore the trained model
    test_res = test(cfg, env, agent)
    save_results_1(test_res, tag='test',
                   path=cfg.result_path)       # save metrics
    plot_rewards(test_res['rewards'], test_res['ma_rewards'], cfg, tag="test")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user