update
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-09 20:25:52
 @LastEditor: John
-LastEditTime: 2021-03-17 20:43:25
+LastEditTime: 2021-03-31 00:56:32
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -58,9 +58,7 @@ class DDPG:
         done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
         # note that the critic takes (s_t, a) as input
         policy_loss = self.critic(state, self.actor(state))
-
         policy_loss = -policy_loss.mean()
-
         next_action = self.target_actor(next_state)
         target_value = self.target_critic(next_state, next_action.detach())
         expected_value = reward + (1.0 - done) * self.gamma * target_value
@@ -87,7 +85,7 @@ class DDPG:
                 param.data * self.soft_tau
             )
     def save(self,path):
-        torch.save(self.target_net.state_dict(), path+'DDPG_checkpoint.pth')
+        torch.save(self.actor.state_dict(), path+'checkpoint.pt')

     def load(self,path):
-        self.actor.load_state_dict(torch.load(path+'DDPG_checkpoint.pth'))
+        self.actor.load_state_dict(torch.load(path+'checkpoint.pt'))
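The `param.data * self.soft_tau` context line above is the tail of DDPG's soft (Polyak) target update. A minimal standalone sketch of that update, with an illustrative `soft_tau` value rather than this repo's config:

```python
import torch.nn as nn

def soft_update(target_net: nn.Module, source_net: nn.Module, soft_tau: float = 1e-2):
    """Polyak averaging: theta_target <- soft_tau * theta_source + (1 - soft_tau) * theta_target."""
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

# usage sketch: soft_update(agent.target_critic, agent.critic); soft_update(agent.target_actor, agent.actor)
```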
@@ -5,12 +5,17 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2021-03-19 19:57:00
+LastEditTime: 2021-03-31 01:04:48
 @Discription:
 @Environment: python 3.7.7
 '''
 import sys,os
-sys.path.append(os.getcwd()) # add current terminal path to sys.path
+from pathlib import Path
+import sys,os
+curr_path = os.path.dirname(__file__)
+parent_path=os.path.dirname(curr_path)
+sys.path.append(parent_path) # add current terminal path to sys.path

 import torch
 import gym
 import numpy as np
@@ -20,27 +25,23 @@ from DDPG.env import NormalizedActions,OUNoise
 from common.plot import plot_rewards
 from common.utils import save_results

 SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # check whether the folder exists
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
-if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
-    os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # check whether the folder exists
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
-if not os.path.exists(RESULT_PATH): # check whether the folder exists
-    os.mkdir(RESULT_PATH)
+SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
+if not os.path.exists(curr_path+"/saved_model/"): os.mkdir(curr_path+"/saved_model/")
+if not os.path.exists(SAVED_MODEL_PATH): os.mkdir(SAVED_MODEL_PATH)
+RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
+if not os.path.exists(curr_path+"/results/"): os.mkdir(curr_path+"/results/")
+if not os.path.exists(RESULT_PATH): os.mkdir(RESULT_PATH)

 class DDPGConfig:
     def __init__(self):
+        self.algo = 'DDPG'
         self.gamma = 0.99
         self.critic_lr = 1e-3
         self.actor_lr = 1e-4
         self.memory_capacity = 10000
         self.batch_size = 128
         self.train_eps =300
-        self.train_steps = 200
         self.eval_eps = 200
         self.eval_steps = 200
         self.target_update = 4
@@ -56,19 +57,19 @@ def train(cfg,env,agent):
     for i_episode in range(cfg.train_eps):
         state = env.reset()
         ou_noise.reset()
+        done = False
         ep_reward = 0
-        for i_step in range(cfg.train_steps):
+        i_step = 0
+        while not done:
+            i_step += 1
             action = agent.choose_action(state)
-            action = ou_noise.get_action(
-                action, i_step)  # i.e. the random process in the paper
+            action = ou_noise.get_action(action, i_step)  # i.e. the random process in the paper
             next_state, reward, done, _ = env.step(action)
             ep_reward += reward
             agent.memory.push(state, action, reward, next_state, done)
             agent.update()
             state = next_state
-            if done:
-                break
-        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
+        print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
         ep_steps.append(i_step)
         rewards.append(ep_reward)
         if ma_rewards:
BIN  codes/DDPG/results/20210331-010047/ma_rewards_train.npy  (new file)
BIN  codes/DDPG/results/20210331-010047/rewards_curve_train.png  (new file)
BIN  codes/DDPG/results/20210331-010047/rewards_train.npy  (new file)
BIN  codes/DDPG/saved_model/20210331-010047/checkpoint.pt  (new file)
@@ -1,7 +1,7 @@
 # DQN

 ## Overview
-DQN is an optimization and extension of the Q-learning algorithm: where Q-learning stores value information in a finite Q-table, DQN replaces the Q-table with a neural network, which suits high-dimensional problems better. For background, see the [Datawhale notes on Q-learning (Hung-yi Lee course)](https://datawhalechina.github.io/leedeeprl-notes/#/chapter6/chapter6).
+DQN is an optimization and extension of the Q-learning algorithm: where Q-learning stores value information in a finite Q-table, DQN replaces the Q-table with a neural network, which suits high-dimensional problems better. For background, see the [Datawhale notes on Q-learning (Hung-yi Lee course)](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).

 Two papers are the main references: the 2013 Google DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the team's later Nature paper [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf). The latter adds a target Q-network on the algorithmic side and is also called Nature DQN.
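A rough sketch of the idea summarized above — a neural network in place of the Q-table, plus the frozen target network from the Nature paper — using illustrative names rather than this repo's exact API:

```python
import torch
import torch.nn as nn

def dqn_td_target(target_net: nn.Module, reward, next_state, done, gamma=0.99):
    """Nature-DQN target: r + gamma * max_a' Q_target(s', a'), zeroed out for terminal states."""
    with torch.no_grad():
        next_q = target_net(next_state).max(1)[0]  # greedy value under the frozen target network
    return reward + gamma * next_q * (1 - done)

# the online policy network is then regressed toward this target with an MSE or Huber loss
```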
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-03-13 14:56:23
+LastEditTime: 2021-03-30 17:01:26
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -13,6 +13,8 @@ LastEditTime: 2021-03-13 14:56:23
 '''

+
+
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -26,58 +28,41 @@ class DQN:

         self.action_dim = action_dim  # total number of actions
         self.device = cfg.device  # device, cpu or gpu
         self.gamma = cfg.gamma  # discount factor for rewards
         # parameters of the e-greedy policy
-        self.sample_count = 0  # counter used for epsilon decay
-        self.epsilon = 0
-        self.epsilon_start = cfg.epsilon_start
-        self.epsilon_end = cfg.epsilon_end
-        self.epsilon_decay = cfg.epsilon_decay
+        self.frame_idx = 0  # counter used for epsilon decay
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
+            (cfg.epsilon_start - cfg.epsilon_end) * \
+            math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.batch_size = cfg.batch_size
-        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
-        # target_net starts as an exact copy of policy_net's parameters
-        self.target_net.load_state_dict(self.policy_net.state_dict())
-        self.target_net.eval()  # disable BatchNormalization and Dropout
-        # see the difference between parameters() and state_dict(): the former has require_grad=True
+        self.policy_net = MLP(state_dim, action_dim,
+                              hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,
+                              hidden_dim=cfg.hidden_dim).to(self.device)
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
         self.loss = 0
         self.memory = ReplayBuffer(cfg.memory_capacity)

-    def choose_action(self, state, train=True):
+    def choose_action(self, state):
         '''choose an action
         '''
-        if train:
-            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
-                math.exp(-1. * self.sample_count / self.epsilon_decay)
-            self.sample_count += 1
-            if random.random() > self.epsilon:
-                with torch.no_grad():
-                    # convert to a tensor before feeding the network; the state entries are originally float64
-                    # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
-                    state = torch.tensor(
-                        [state], device=self.device, dtype=torch.float32)
-                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
-                    q_value = self.policy_net(state)
-                    # tensor.max(1) returns the max of each row and the corresponding index,
-                    # e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
-                    # so tensor.max(1)[1] returns the index of the max value, i.e. the action
-                    action = q_value.max(1)[1].item()
-            else:
-                action = random.randrange(self.action_dim)
-            return action
-        else:
-            with torch.no_grad():  # no gradients tracked
-                # convert to a tensor before feeding the network; the state entries are originally float64
-                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
-                state = torch.tensor(
-                    [state], device='cpu', dtype=torch.float32)  # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
-                q_value = self.target_net(state)
-                # tensor.max(1) returns the max of each row and the corresponding index,
-                # e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
-                # so tensor.max(1)[1] returns the index of the max value, i.e. the action
-                action = q_value.max(1)[1].item()
-            return action
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                # convert to a tensor before feeding the network; the state entries are originally float64
+                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
+                state = torch.tensor(
+                    [state], device=self.device, dtype=torch.float32)
+                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
+                q_value = self.policy_net(state)
+                # tensor.max(1) returns the max of each row and the corresponding index,
+                # e.g. torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
+                # so tensor.max(1)[1] returns the index of the max value, i.e. the action
+                action = q_value.max(1)[1].item()
+        else:
+            action = random.randrange(self.action_dim)
+        return action

     def update(self):

         if len(self.memory) < self.batch_size:
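The refactor above folds the epsilon-greedy schedule into a single function of the frame index. A standalone sketch of that exponential decay with illustrative constants:

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 1.0, 0.01, 500

def epsilon_by_frame(frame_idx: int) -> float:
    """Anneal epsilon exponentially from epsilon_start toward epsilon_end."""
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1.0 * frame_idx / epsilon_decay)

print(epsilon_by_frame(0))     # 1.0 at the first frame
print(epsilon_by_frame(500))   # ~0.37 after one decay constant
print(epsilon_by_frame(5000))  # ~0.01 once fully annealed
```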
@@ -96,32 +81,31 @@ class DQN:
         next_state_batch = torch.tensor(
             next_state_batch, device=self.device, dtype=torch.float)
         done_batch = torch.tensor(np.float32(
-            done_batch), device=self.device).unsqueeze(1)  # convert bool to float, then to a tensor
+            done_batch), device=self.device)

         '''compute Q(s_t, a) for the current (s_t, a)'''
         '''torch.gather: for a=torch.Tensor([[1,2],[3,4]]), a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])'''
         q_values = self.policy_net(state_batch).gather(
             dim=1, index=action_batch)  # equivalent to self.forward
         # compute V(s_{t+1}) for all next states, i.e. the maximal value given by target_net
-        next_state_values = self.target_net(
-            next_state_batch).max(1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
+        next_q_values = self.target_net(next_state_batch).max(
+            1)[0].detach()  # e.g. tensor([ 0.0060, -0.0171,...,])
         # compute the expected_q_value
         # for a terminal state done_batch[0]=1, so the expected_q_value equals the reward
-        expected_q_values = reward_batch + self.gamma * \
-            next_state_values * (1-done_batch[0])
+        expected_q_values = reward_batch + \
+            self.gamma * next_q_values * (1-done_batch)
         # self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1))  # Huber loss
         self.loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # MSE loss
         # optimize the model
         self.optimizer.zero_grad()  # zero_grad clears all the old gradients from the last step
         # loss.backward() uses backpropagation to compute the gradients of the loss w.r.t. all parameters (that require gradients)
         self.loss.backward()
-        for param in self.policy_net.parameters():  # clip to avoid exploding gradients
-            param.grad.data.clamp_(-1, 1)
+        # for param in self.policy_net.parameters():  # clip to avoid exploding gradients
+        #     param.grad.data.clamp_(-1, 1)

         self.optimizer.step()  # update the model

-    def save(self,path):
+    def save(self, path):
         torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')

-    def load(self,path):
+    def load(self, path):
         self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
codes/DQN/main.ipynb — new file (467 lines added; notebook content not shown in this diff)
@@ -5,12 +5,17 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2021-03-26 17:17:17
+LastEditTime: 2021-03-30 16:59:19
 @Discription:
 @Environment: python 3.7.7
 '''
 import sys,os
-sys.path.append(os.getcwd()) # add current terminal path to sys.path
+from pathlib import Path
+import sys,os
+curr_path = os.path.dirname(__file__)
+parent_path=os.path.dirname(curr_path)
+sys.path.append(parent_path) # add current terminal path to sys.path

 import gym
 import torch
 import datetime
@@ -18,58 +23,52 @@ from DQN.agent import DQN
 from common.plot import plot_rewards
 from common.utils import save_results

 SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save the model
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # check whether the folder exists
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
-if not os.path.exists(SAVED_MODEL_PATH): # check whether the folder exists
-    os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # check whether the folder exists
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
-if not os.path.exists(RESULT_PATH): # check whether the folder exists
-    os.mkdir(RESULT_PATH)
+SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
+if not os.path.exists(curr_path+"/saved_model/"):
+    os.mkdir(curr_path+"/saved_model/")
+if not os.path.exists(SAVED_MODEL_PATH):
+    os.mkdir(SAVED_MODEL_PATH)
+RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
+if not os.path.exists(curr_path+"/results/"):
+    os.mkdir(curr_path+"/results/")
+if not os.path.exists(RESULT_PATH):
+    os.mkdir(RESULT_PATH)

 class DQNConfig:
     def __init__(self):
         self.algo = "DQN"  # name of algo
-        self.gamma = 0.99
+        self.gamma = 0.95
-        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
+        self.epsilon_start = 1  # initial epsilon of the e-greedy policy
         self.epsilon_end = 0.01
-        self.epsilon_decay = 200
+        self.epsilon_decay = 500
-        self.lr = 0.01  # learning rate
+        self.lr = 0.0001  # learning rate
-        self.memory_capacity = 800  # capacity of the Replay Memory
+        self.memory_capacity = 10000  # capacity of the Replay Memory
-        self.batch_size = 64
+        self.batch_size = 32
         self.train_eps = 300  # number of training episodes
-        self.train_steps = 200  # max steps per training episode
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
-        self.eval_steps = 200  # max steps per evaluation episode
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # check gpu
-        self.hidden_dim = 128  # hidden layer dimension of the network
+        self.hidden_dim = 256  # hidden layer dimension of the network

 def train(cfg,env,agent):
     print('Start to train !')
     rewards = []
     ma_rewards = []  # moving average reward
-    ep_steps = []
     for i_episode in range(cfg.train_eps):
-        state = env.reset()  # reset the environment state
+        state = env.reset()
+        done = False
         ep_reward = 0
-        for i_step in range(cfg.train_steps):
-            action = agent.choose_action(state)  # choose an action given the current state
-            next_state, reward, done, _ = env.step(action)  # step the environment
+        while not done:
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
             ep_reward += reward
-            agent.memory.push(state, action, reward, next_state, done)  # store the transition in memory
-            state = next_state  # move to the next state
-            agent.update()  # update the network every step
-            if done:
-                break
-        # update the target network, copying all weights and biases from the DQN
+            agent.memory.push(state, action, reward, next_state, done)
+            state = next_state
+            agent.update()
         if i_episode % cfg.target_update == 0:
             agent.target_net.load_state_dict(agent.policy_net.state_dict())
-        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
-        ep_steps.append(i_step)
+        print('Episode:{}/{}, Reward:{}'.format(i_episode+1,cfg.train_eps,ep_reward))
         rewards.append(ep_reward)
         # compute the moving-average reward
         if ma_rewards:
@@ -82,8 +81,8 @@ def train(cfg,env,agent):

 if __name__ == "__main__":
     cfg = DQNConfig()
-    env = gym.make('CartPole-v0').unwrapped  # you can look up why gym envs are unwrapped; it is generally not needed here
-    env.seed(1)  # set the env random seed
+    env = gym.make('CartPole-v0')
+    env.seed(1)
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.n
     agent = DQN(state_dim,action_dim,cfg)
(three previous result images removed; file names not shown in this diff)
BIN  codes/DQN/results/20210330-150205/ma_rewards_train.npy  (new file)
BIN  codes/DQN/results/20210330-150205/rewards_curve_train.png  (new file)
BIN  codes/DQN/results/20210330-150205/rewards_train.npy  (new file)
BIN  codes/DQN/results/20210330-165925/ma_rewards_train.npy  (new file)
BIN  codes/DQN/results/20210330-165925/rewards_curve_train.png  (new file)
BIN  codes/DQN/results/20210330-165925/rewards_train.npy  (new file)
BIN  codes/DQN/saved_model/20210330-150205/dqn_checkpoint.pth  (new file)
BIN  codes/DQN/saved_model/20210330-165925/dqn_checkpoint.pth  (new file)
@@ -5,12 +5,17 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 10:01:09
 @LastEditor: John
-LastEditTime: 2021-03-23 20:43:28
+LastEditTime: 2021-03-29 20:23:48
 @Discription:
 @Environment: python 3.7.7
 '''
 import sys,os
-sys.path.append(os.getcwd()) # add current terminal path to sys.path
+from pathlib import Path
+import sys,os
+curr_path = os.path.dirname(__file__)
+parent_path=os.path.dirname(curr_path)
+sys.path.append(parent_path) # add current terminal path to sys.path

 import gym
 import torch
 import datetime
@@ -19,17 +24,15 @@ from DQN_cnn.agent import DQNcnn
 from common.plot import plot_rewards
 from common.utils import save_results

-sys.path.append(os.getcwd()) # add current terminal path to sys.path

 SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
+SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/' # path to save model
+if not os.path.exists(curr_path+"/saved_model/"):
+    os.mkdir(curr_path+"/saved_model/")
 if not os.path.exists(SAVED_MODEL_PATH):
     os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
+RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/' # path to save rewards
+if not os.path.exists(curr_path+"/results/"):
+    os.mkdir(curr_path+"/results/")
 if not os.path.exists(RESULT_PATH):
     os.mkdir(RESULT_PATH)
@@ -1,40 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-10 15:27:16
-@LastEditor: John
-LastEditTime: 2021-01-20 18:58:37
-@Discription:
-@Environment: python 3.7.7
-'''
-import random
-
-class ReplayBuffer:
-
-    def __init__(self, capacity):
-        self.capacity = capacity  # maximum capacity of the buffer
-        self.buffer = []
-        self.position = 0
-
-    def push(self, state, action, reward, next_state, done):
-        '''push a sample into the buffer, queue style
-        '''
-        if len(self.buffer) < self.capacity:
-            self.buffer.append(None)
-        self.buffer[self.position] = (state, action, reward, next_state, done)
-        self.position = (self.position + 1) % self.capacity
-
-    def sample(self, batch_size):
-        '''randomly sample batch_size samples
-        '''
-        batch = random.sample(self.buffer, batch_size)
-        state, action, reward, next_state, done = zip(*batch)
-        return state, action, reward, next_state, done
-
-    def __len__(self):
-        '''return the length of the buffer
-        '''
-        return len(self.buffer)
@@ -1,30 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-12 00:47:02
-@LastEditor: John
-LastEditTime: 2020-08-19 16:55:54
-@Discription:
-@Environment: python 3.7.7
-'''
-import torch.nn as nn
-import torch.nn.functional as F
-
-class MLP(nn.Module):
-    def __init__(self, n_states=4, n_actions=18):
-        """ initialize the Q network as a fully connected network
-            n_states: number of input features, i.e. the environment state dimension
-            n_actions: total number of output actions
-        """
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(n_states, 128)  # input layer
-        self.fc2 = nn.Linear(128, 128)  # hidden layer
-        self.fc3 = nn.Linear(128, n_actions)  # output layer
-
-    def forward(self, x):
-        # activation function of each layer
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2020-12-22 15:22:17
-LastEditor: John
-LastEditTime: 2021-01-21 14:30:38
-Discription:
-Environment:
-'''
-import datetime
-import os
-import argparse
-
-ALGO_NAME = 'Double DQN'
-SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/'
-
-TRAIN_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
-EVAL_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
-
-def get_args():
-    '''model parameters
-    '''
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means eval only
-    parser.add_argument("--gamma", default=0.99,
-                        type=float)  # gamma in q-learning
-    parser.add_argument("--epsilon_start", default=0.95,
-                        type=float)  # epsilon for the greedy action selection
-    parser.add_argument("--epsilon_end", default=0.01, type=float)
-    parser.add_argument("--epsilon_decay", default=500, type=float)
-    parser.add_argument("--policy_lr", default=0.01, type=float)
-    parser.add_argument("--memory_capacity", default=1000,
-                        type=int, help="capacity of Replay Memory")
-
-    parser.add_argument("--batch_size", default=32, type=int,
-                        help="batch size of memory sampling")
-    parser.add_argument("--train_eps", default=200, type=int)  # max number of training episodes
-    parser.add_argument("--train_steps", default=200, type=int)
-    parser.add_argument("--target_update", default=2, type=int,
-                        help="when(every default 2 eisodes) to update target net ")  # update frequency
-
-    parser.add_argument("--eval_eps", default=100, type=int)  # max number of evaluation episodes
-    parser.add_argument("--eval_steps", default=200,
-                        type=int)  # length of each evaluation episode
-    config = parser.parse_args()
-
-    return config
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-@Author: John
-@Email: johnjim0816@gmail.com
-@Date: 2020-06-11 16:30:09
-@LastEditor: John
-LastEditTime: 2020-12-22 15:24:31
-@Discription:
-@Environment: python 3.7.7
-'''
-import matplotlib.pyplot as plt
-import seaborn as sns
-import numpy as np
-import os
-from params import ALGO_NAME
-def plot(item,ylabel='rewards_train', save_fig = True):
-    '''plot using seaborn
-    '''
-    sns.set()
-    plt.figure()
-    plt.plot(np.arange(len(item)), item)
-    plt.title(ylabel+' of '+ALGO_NAME)
-    plt.ylabel(ylabel)
-    plt.xlabel('episodes')
-    if save_fig:
-        plt.savefig(os.path.dirname(__file__)+"/results/"+ylabel+".png")
-    plt.show()
-
-    # plt.show()
-if __name__ == "__main__":
-
-    output_path = os.path.split(os.path.abspath(__file__))[0]+"/results/"
-    tag = 'train'
-    rewards=np.load(output_path+"rewards_"+tag+".npy", )
-    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
-    steps=np.load(output_path+"steps_"+tag+".npy")
-    plot(rewards)
-    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
-    plot(steps,ylabel='steps_'+tag)
-    tag = 'eval'
-    rewards=np.load(output_path+"rewards_"+tag+".npy", )
-    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
-    steps=np.load(output_path+"steps_"+tag+".npy")
-    plot(rewards,ylabel='rewards_'+tag)
-    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
-    plot(steps,ylabel='steps_'+tag)
13
codes/HierarchicalDQN/README.md
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Hierarchical DQN
|
||||||
|
|
||||||
|
## 原理简介
|
||||||
|
|
||||||
|
Hierarchical DQN是一种分层强化学习方法,与DQN相比增加了一个meta controller,
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
即学习时,meta controller每次会生成一个goal,然后controller或者说下面的actor就会达到这个goal,直到done为止。这就相当于给agent增加了一个队长,队长擅长制定局部目标,指导agent前行,这样应对一些每回合步数较长或者稀疏奖励的问题会有所帮助。
|
||||||
|
|
||||||
|
## 伪代码
|
||||||
|
|
||||||
|

|
||||||
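A minimal sketch of the two-level loop described above, mirroring the training loop in `codes/HierarchicalDQN/main.py` (the `agent` interface shown is illustrative):

```python
import numpy as np

def run_episode(env, agent):
    """Meta controller proposes a goal; the controller acts until the goal (or done) is reached."""
    state, done, ep_reward = env.reset(), False, 0.0
    while not done:
        goal = agent.set_goal(state)                  # meta controller picks a sub-goal index
        meta_state, extrinsic_reward = state, 0.0
        while not done and goal != np.argmax(state):  # controller works toward the current goal
            goal_state = np.concatenate([state, agent.to_onehot(goal)])
            action = agent.choose_action(goal_state)
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            extrinsic_reward += reward
            # intrinsic reward: 1 only when the reached state matches the goal
            intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
            agent.memory.push(goal_state, action, intrinsic_reward,
                              np.concatenate([next_state, agent.to_onehot(goal)]), done)
            state = next_state
            agent.update()
        # the meta controller learns from the accumulated extrinsic reward
        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
    return ep_reward
```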
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-24 22:18:18
 LastEditor: John
-LastEditTime: 2021-03-27 04:24:30
+LastEditTime: 2021-03-31 14:51:09
 Discription:
 Environment:
 '''
@@ -13,90 +13,103 @@ import torch
 import torch.nn as nn
 import numpy as np
 import random,math
-from HierarchicalDQN.model import MLP
-from common.memory import ReplayBuffer
 import torch.optim as optim
+from common.model import MLP
+from common.memory import ReplayBuffer

 class HierarchicalDQN:
     def __init__(self,state_dim,action_dim,cfg):
+        self.state_dim = state_dim
         self.action_dim = action_dim
+        self.gamma = cfg.gamma
         self.device = cfg.device
         self.batch_size = cfg.batch_size
-        self.sample_count = 0
-        self.epsilon = 0
-        self.epsilon_start = cfg.epsilon_start
-        self.epsilon_end = cfg.epsilon_end
-        self.epsilon_decay = cfg.epsilon_decay
-        self.batch_size = cfg.batch_size
+        self.frame_idx = 0
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
         self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
-        self.target_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
         self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
-        self.meta_target_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
         self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
         self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
         self.memory = ReplayBuffer(cfg.memory_capacity)
         self.meta_memory = ReplayBuffer(cfg.memory_capacity)
-    def to_onehot(x):
-        oh = np.zeros(6)
+        self.loss_numpy = 0
+        self.meta_loss_numpy = 0
+        self.losses = []
+        self.meta_losses = []
+    def to_onehot(self,x):
+        oh = np.zeros(self.state_dim)
         oh[x - 1] = 1.
         return oh
-    def set_goal(self,meta_state):
-        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
-        self.sample_count += 1
-        if random.random() > self.epsilon:
+    def set_goal(self,state):
+        if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
-                meta_state = torch.tensor([meta_state], device=self.device, dtype=torch.float32)
-                q_value = self.policy_net(meta_state)
-                goal = q_value.max(1)[1].item()
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
+                goal = self.meta_policy_net(state).max(1)[1].item()
         else:
-            goal = random.randrange(self.action_dim)
-        goal = self.meta_policy_net(meta_state)
-        onehot_goal = self.to_onehot(goal)
-        return onehot_goal
+            goal = random.randrange(self.state_dim)
+        return goal
     def choose_action(self,state):
-        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.sample_count / self.epsilon_decay)
-        self.sample_count += 1
-        if random.random() > self.epsilon:
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
             with torch.no_grad():
-                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
                 q_value = self.policy_net(state)
                 action = q_value.max(1)[1].item()
         else:
             action = random.randrange(self.action_dim)
         return action
     def update(self):
+        self.update_policy()
+        self.update_meta()
+    def update_policy(self):
         if self.batch_size > len(self.memory):
-            state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
-        state_batch = torch.tensor(
-            state_batch, device=self.device, dtype=torch.float)
-        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
-        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)
-        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
-        done_batch = torch.tensor(np.float32(done_batch), device=self.device).unsqueeze(1)
-        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)
-        next_state_values = self.target_net(next_state_batch).max(1)[0].detach()
-        expected_q_values = reward_batch + self.gamma * next_state_values * (1-done_batch[0])
-        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))
+            return
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
+        state_batch = torch.tensor(state_batch,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch,dtype=torch.float)
+        next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch))
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
+        next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
+        expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values)
         self.optimizer.zero_grad()
         loss.backward()
-        for param in self.policy_net.parameters():
+        for param in self.policy_net.parameters():  # clip to avoid exploding gradients
             param.grad.data.clamp_(-1, 1)
         self.optimizer.step()
+        self.loss_numpy = loss.detach().numpy()
+        self.losses.append(self.loss_numpy)
+    def update_meta(self):
         if self.batch_size > len(self.meta_memory):
-            meta_state_batch, meta_action_batch, meta_reward_batch, next_meta_state_batch, meta_done_batch = self.memory.sample(self.batch_size)
-        meta_state_batch = torch.tensor(meta_state_batch, device=self.device, dtype=torch.float)
-        meta_action_batch = torch.tensor(meta_action_batch, device=self.device).unsqueeze(1)
-        meta_reward_batch = torch.tensor(meta_reward_batch, device=self.device, dtype=torch.float)
-        next_meta_state_batch = torch.tensor(next_meta_state_batch, device=self.device, dtype=torch.float)
-        meta_done_batch = torch.tensor(np.float32(meta_done_batch), device=self.device).unsqueeze(1)
-        meta_q_values = self.meta_policy_net(meta_state_batch).gather(dim=1, index=meta_action_batch)
-        next_state_values = self.target_net(next_meta_state_batch).max(1)[0].detach()
-        expected_meta_q_values = meta_reward_batch + self.gamma * next_state_values * (1-meta_done_batch[0])
-        meta_loss = nn.MSEmeta_loss()(meta_q_values, expected_meta_q_values.unsqueeze(1))
+            return
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
+        state_batch = torch.tensor(state_batch,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,dtype=torch.int64).unsqueeze(1)
+        reward_batch = torch.tensor(reward_batch,dtype=torch.float)
+        next_state_batch = torch.tensor(next_state_batch, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch))
+        q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
+        next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
+        expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
+        meta_loss = nn.MSELoss()(q_values, expected_q_values)
         self.meta_optimizer.zero_grad()
         meta_loss.backward()
-        for param in self.meta_policy_net.parameters():
+        for param in self.meta_policy_net.parameters():  # clip to avoid exploding gradients
             param.grad.data.clamp_(-1, 1)
         self.meta_optimizer.step()
+        self.meta_loss_numpy = meta_loss.detach().numpy()
+        self.meta_losses.append(self.meta_loss_numpy)

+    def save(self, path):
+        torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
+        torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
+
+    def load(self, path):
+        self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
+        self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))
BIN  codes/HierarchicalDQN/assets/image-20210331153115575.png  (new file)
BIN  codes/HierarchicalDQN/assets/image-20210331153542314.png  (new file)
codes/HierarchicalDQN/main.ipynb — new file (477 lines added; notebook content not shown in this diff)
@@ -3,95 +3,108 @@
 '''
 Author: John
 Email: johnjim0816@gmail.com
-Date: 2021-03-24 22:14:04
+Date: 2021-03-29 10:37:32
 LastEditor: John
-LastEditTime: 2021-03-27 04:23:43
+LastEditTime: 2021-03-31 14:58:49
 Discription:
 Environment:
 '''

 import sys,os
-sys.path.append(os.getcwd()) # add current terminal path to sys.path
-import gym
+curr_path = os.path.dirname(__file__)
+parent_path = os.path.dirname(curr_path)
+sys.path.append(parent_path) # add current terminal path to sys.path
+
+import datetime
 import numpy as np
 import torch
-import datetime
-from HierarchicalDQN.agent import HierarchicalDQN
-from common.plot import plot_rewards
-from common.utils import save_results
+import gym

-SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # path to save model
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"):
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
+from common.utils import save_results
+from common.plot import plot_rewards,plot_losses
+from HierarchicalDQN.agent import HierarchicalDQN
+
+SEQUENCE = datetime.datetime.now().strftime(
+    "%Y%m%d-%H%M%S")  # obtain current time
+SAVED_MODEL_PATH = curr_path+"/saved_model/"+SEQUENCE+'/'  # path to save model
+if not os.path.exists(curr_path+"/saved_model/"):
+    os.mkdir(curr_path+"/saved_model/")
 if not os.path.exists(SAVED_MODEL_PATH):
     os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"):
-    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
+RESULT_PATH = curr_path+"/results/"+SEQUENCE+'/'  # path to save rewards
+if not os.path.exists(curr_path+"/results/"):
+    os.mkdir(curr_path+"/results/")
 if not os.path.exists(RESULT_PATH):
     os.mkdir(RESULT_PATH)

 class HierarchicalDQNConfig:
     def __init__(self):
-        self.algo = "DQN"  # name of algo
+        self.algo = "H-DQN"  # name of algo
         self.gamma = 0.99
-        self.epsilon_start = 0.95  # start epsilon of e-greedy policy
+        self.epsilon_start = 1  # start epsilon of e-greedy policy
         self.epsilon_end = 0.01
         self.epsilon_decay = 200
-        self.lr = 0.01  # learning rate
-        self.memory_capacity = 800  # Replay Memory capacity
-        self.batch_size = 64
-        self.train_eps = 250  # number of training episodes
-        self.train_steps = 200  # max steps per training episode
+        self.lr = 0.0001  # learning rate
+        self.memory_capacity = 10000  # Replay Memory capacity
+        self.batch_size = 32
+        self.train_eps = 300  # number of training episodes
         self.target_update = 2  # update frequency of the target net
         self.eval_eps = 20  # number of evaluation episodes
-        self.eval_steps = 200  # max steps per evaluation episode
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # check gpu
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # check gpu
         self.hidden_dim = 256  # dimension of hidden layer

-def train(cfg,env,agent):
+def train(cfg, env, agent):
     print('Start to train !')
     rewards = []
     ma_rewards = []  # moving average reward
-    ep_steps = []
     for i_episode in range(cfg.train_eps):
         state = env.reset()
-        extrinsic_reward = 0
-        for i_step in range(cfg.train_steps):
-            goal= agent.set_goal(state)
-            meta_state = state
-            goal_state = np.concatenate([state, goal])
-            action = agent.choose_action(state)
-            next_state, reward, done, _ = env.step(action)
-            extrinsic_reward += reward
-            intrinsic_reward = 1.0 if goal == np.argmax(next_state) else 0.0
-            agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate([next_state, goal]), done)
-            state = next_state
-            agent.update()
-            if done:
-                break
-        if i_episode % cfg.target_update == 0:
-            agent.target_net.load_state_dict(agent.policy_net.state_dict())
-        print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,extrinsic_reward,i_step+1,done))
-        ep_steps.append(i_step)
-        rewards.append(extrinsic_reward)
+        done = False
+        ep_reward = 0
+        while not done:
+            goal = agent.set_goal(state)
+            onehot_goal = agent.to_onehot(goal)
+            meta_state = state
+            extrinsic_reward = 0
+            while not done and goal != np.argmax(state):
+                goal_state = np.concatenate([state, onehot_goal])
+                action = agent.choose_action(goal_state)
+                next_state, reward, done, _ = env.step(action)
+                ep_reward += reward
+                extrinsic_reward += reward
+                intrinsic_reward = 1.0 if goal == np.argmax(
+                    next_state) else 0.0
+                agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
+                    [next_state, onehot_goal]), done)
+                state = next_state
+                agent.update()
+            agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
+        print('Episode:{}/{}, Reward:{}, Loss:{:.2f}, Meta_Loss:{:.2f}'.format(i_episode+1, cfg.train_eps, ep_reward,agent.loss_numpy ,agent.meta_loss_numpy ))
+        rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*extrinsic_reward)
+                0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
-            ma_rewards.append(extrinsic_reward)
-        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
+            ma_rewards.append(ep_reward)
     print('Complete training!')
-    return rewards,ma_rewards
+    return rewards, ma_rewards

 if __name__ == "__main__":
-    cfg = HierarchicalDQNConfig()
     env = gym.make('CartPole-v0')
     env.seed(1)
+    cfg = HierarchicalDQNConfig()
     state_dim = env.observation_space.shape[0]
     action_dim = env.action_space.n
-    agent = HierarchicalDQN(state_dim,action_dim,cfg)
-    rewards,ma_rewards = train(cfg,env,agent)
+    agent = HierarchicalDQN(state_dim, action_dim, cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
     agent.save(path=SAVED_MODEL_PATH)
-    save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
-    plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)
+    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
+    plot_rewards(rewards, ma_rewards, tag="train",
+                 algo=cfg.algo, path=RESULT_PATH)
+    plot_losses(agent.losses,algo=cfg.algo, path=RESULT_PATH)
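The termination test `goal != np.argmax(state)` in the loop above treats the goal as an index into the state vector, with `to_onehot` encoding it; a tiny worked example of that convention (values are made up):

```python
import numpy as np

state_dim = 4
goal = 2                              # meta controller's target component index
onehot_goal = np.zeros(state_dim)
onehot_goal[goal - 1] = 1.0           # matches the agent's to_onehot convention (oh[x - 1] = 1.)

state = np.array([0.1, 0.3, 2.5, -0.2])
reached = (goal == np.argmax(state))  # True: component 2 currently dominates the state
print(onehot_goal, reached)
```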
@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2021-03-24 22:14:12
-LastEditor: John
-LastEditTime: 2021-03-24 22:17:09
-Discription:
-Environment:
-'''
-import torch.nn as nn
-import torch.nn.functional as F
-class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
-        super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim)
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
-        self.fc3 = nn.Linear(hidden_dim, action_dim)
-
-    def forward(self, x):
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        return self.fc3(x)
BIN
codes/HierarchicalDQN/results/20210331-134559/rewards_train.npy
Normal file
BIN
codes/HierarchicalDQN/results/20210331-145852/losses_curve.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
|
After Width: | Height: | Size: 62 KiB |
BIN
codes/HierarchicalDQN/results/20210331-145852/rewards_train.npy
Normal file
|
Before Width: | Height: | Size: 39 KiB |
@@ -19,9 +19,10 @@
 ## Requirements

 python 3.7, pytorch 1.6.0-1.7.1, gym 0.17.0-0.18.0

 ## Usage

-Run ```main.py``` in the corresponding algorithm folder
+Run ```main.py``` or ```main.ipynb```
 ## Progress

 | Algorithm | Reference papers/materials | Environment | Notes |
@@ -29,17 +30,17 @@ python 3.7, pytorch 1.6.0-1.7.1, gym 0.17.0-0.18.0
 | [On-Policy First-Visit MC](./MonteCarlo) | | [Racetrack](./envs/racetrack_env.md) | |
 | [Q-Learning](./QLearning) | | [CliffWalking-v0](./envs/gym_info.md) | |
 | [Sarsa](./Sarsa) | | [Racetrack](./envs/racetrack_env.md) | |
-| [DQN](./DQN) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
+| [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | |
-| [DQN-cnn](./DQN_cnn) | [DQN-paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | uses a CNN instead of a fully connected network, compared with DQN |
+| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | uses a CNN instead of a fully connected network, compared with DQN |
 | [DoubleDQN](./DoubleDQN) | | [CartPole-v0](./envs/gym_info.md) | results are poor, to be improved |
-| Hierarchical DQN | [Hierarchical DQN](https://arxiv.org/abs/1604.06057) | | |
+| Hierarchical DQN | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | | |
 | [PolicyGradient](./PolicyGradient) | | [CartPole-v0](./envs/gym_info.md) | |
 | A2C | | [CartPole-v0](./envs/gym_info.md) | |
 | A3C | | | |
 | SAC | | | |
 | [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
 | DDPG | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
-| TD3 | [Twin Dueling DDPG Paper](https://arxiv.org/abs/1802.09477) | | |
+| TD3 | [TD3 Paper](https://arxiv.org/abs/1802.09477) | | |
 | GAIL | | | |
@@ -24,7 +24,7 @@ Note that ```model.py```,```memory.py```,```plot.py``` shall be utilized in diff
 python 3.7.9, pytorch 1.6.0, gym 0.18.0
 ## Usage

-Environment information: see [the environment notes](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)
+run ```main.py``` or ```main.ipynb```

 ## Schedule
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 21:14:12
 LastEditor: John
-LastEditTime: 2021-03-24 22:15:00
+LastEditTime: 2021-03-31 13:49:06
 Discription:
 Environment:
 '''
@@ -15,15 +15,15 @@ import torch.nn.functional as F
 from torch.distributions import Categorical

 class MLP(nn.Module):
-    def __init__(self, state_dim,action_dim,hidden_dim=128):
+    def __init__(self, input_dim,output_dim,hidden_dim=128):
         """ initialize the Q network as a fully connected network
-            state_dim: number of input features, i.e. the environment state dimension
-            action_dim: total number of output actions
+            input_dim: number of input features, i.e. the environment state dimension
+            output_dim: total number of output actions
         """
         super(MLP, self).__init__()
-        self.fc1 = nn.Linear(state_dim, hidden_dim)  # input layer
+        self.fc1 = nn.Linear(input_dim, hidden_dim)  # input layer
         self.fc2 = nn.Linear(hidden_dim,hidden_dim)  # hidden layer
-        self.fc3 = nn.Linear(hidden_dim, action_dim)  # output layer
+        self.fc3 = nn.Linear(hidden_dim, output_dim)  # output layer

     def forward(self, x):
         # activation function of each layer
@@ -32,10 +32,10 @@ class MLP(nn.Module):
         return self.fc3(x)

 class Critic(nn.Module):
-    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
         super(Critic, self).__init__()

-        self.linear1 = nn.Linear(n_obs + action_dim, hidden_size)
+        self.linear1 = nn.Linear(n_obs + output_dim, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
         self.linear3 = nn.Linear(hidden_size, 1)
         # randomly initialize to small values
@@ -51,11 +51,11 @@ class Critic(nn.Module):
         return x

 class Actor(nn.Module):
-    def __init__(self, n_obs, action_dim, hidden_size, init_w=3e-3):
+    def __init__(self, n_obs, output_dim, hidden_size, init_w=3e-3):
         super(Actor, self).__init__()
         self.linear1 = nn.Linear(n_obs, hidden_size)
         self.linear2 = nn.Linear(hidden_size, hidden_size)
-        self.linear3 = nn.Linear(hidden_size, action_dim)
+        self.linear3 = nn.Linear(hidden_size, output_dim)

         self.linear3.weight.data.uniform_(-init_w, init_w)
         self.linear3.bias.data.uniform_(-init_w, init_w)
@@ -67,18 +67,18 @@ class Actor(nn.Module):
         return x

 class ActorCritic(nn.Module):
-    def __init__(self, state_dim, action_dim, hidden_dim=256):
+    def __init__(self, input_dim, output_dim, hidden_dim=256):
         super(ActorCritic, self).__init__()
         self.critic = nn.Sequential(
-            nn.Linear(state_dim, hidden_dim),
+            nn.Linear(input_dim, hidden_dim),
             nn.ReLU(),
             nn.Linear(hidden_dim, 1)
         )

         self.actor = nn.Sequential(
-            nn.Linear(state_dim, hidden_dim),
+            nn.Linear(input_dim, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_dim, action_dim),
+            nn.Linear(hidden_dim, output_dim),
             nn.Softmax(dim=1),
         )
@@ -5,13 +5,13 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-10-07 20:57:11
 LastEditor: John
-LastEditTime: 2021-03-13 11:31:49
+LastEditTime: 2021-03-31 14:05:52
 Discription:
 Environment:
 '''
 import matplotlib.pyplot as plt
 import seaborn as sns
-def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path='./'):
+def plot_rewards(rewards,ma_rewards,tag="train",algo = "DQN",path='./'):
     sns.set()
     plt.title("average learning curve of {}".format(algo))
     plt.xlabel('epsiodes')
@@ -21,3 +21,12 @@ def plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC
     plt.savefig(path+"rewards_curve_{}".format(tag))
     plt.show()

+def plot_losses(losses,algo = "DQN",path='./'):
+    sns.set()
+    plt.title("loss curve of {}".format(algo))
+    plt.xlabel('epsiodes')
+    plt.plot(losses,label='rewards')
+    plt.legend()
+    plt.savefig(path+"losses_curve")
+    plt.show()
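A minimal usage sketch of the new `plot_losses` helper added above (the loss values and output path are placeholders):

```python
from common.plot import plot_losses

# after training, agent.losses holds one loss value per update step (see HierarchicalDQN.update_policy)
losses = [1.2, 0.9, 0.7, 0.55, 0.5]  # placeholder values
plot_losses(losses, algo="H-DQN", path="./results/")
```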