Merge branch 'master' of github.com:datawhalechina/easy-rl

2022-03-30 18:36:54 +08:00
parent d9ef6fc482 66d38128d9
commit e6be50d320
620 changed files with 15500 additions and 0 deletions
@@ -0,0 +1,4 @@
+.DS_STORE
+__pycache__
+.vscode
+test.py
@@ -0,0 +1,437 @@
+Attribution-NonCommercial-ShareAlike 4.0 International
+
+=======================================================================
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright
+and certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+
+     Considerations for licensors: Our public licenses are
+     intended for use by those authorized to give the public
+     permission to use material in ways otherwise restricted by
+     copyright and certain other rights. Our licenses are
+     irrevocable. Licensors should read and understand the terms
+     and conditions of the license they choose before applying it.
+     Licensors should also secure all rights necessary before
+     applying our licenses so that the public can reuse the
+     material as expected. Licensors should clearly mark any
+     material not subject to the license. This includes other CC-
+     licensed material, or material used under an exception or
+     limitation to copyright. More considerations for licensors:
+    wiki.creativecommons.org/Considerations_for_licensors
+
+     Considerations for the public: By using one of our public
+     licenses, a licensor grants the public permission to use the
+     licensed material under specified terms and conditions. If
+     the licensor's permission is not necessary for any reason--for
+     example, because of any applicable exception or limitation to
+     copyright--then that use is not regulated by the license. Our
+     licenses grant only permissions under copyright and certain
+     other rights that a licensor has authority to grant. Use of
+     the licensed material may still be restricted for other
+     reasons, including because others have copyright or other
+     rights in the material. A licensor may make special requests,
+     such as asking that all changes be marked or described.
+     Although not required by our licenses, you are encouraged to
+     respect those requests where reasonable. More considerations
+     for the public:
+    wiki.creativecommons.org/Considerations_for_licensees
+
+=======================================================================
+
+Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
+Public License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial-ShareAlike 4.0 International Public License
+("Public License"). To the extent this Public License may be
+interpreted as a contract, You are granted the Licensed Rights in
+consideration of Your acceptance of these terms and conditions, and the
+Licensor grants You such rights in consideration of benefits the
+Licensor receives from making the Licensed Material available under
+these terms and conditions.
+
+
+Section 1 -- Definitions.
+
+  a. Adapted Material means material subject to Copyright and Similar
+     Rights that is derived from or based upon the Licensed Material
+     and in which the Licensed Material is translated, altered,
+     arranged, transformed, or otherwise modified in a manner requiring
+     permission under the Copyright and Similar Rights held by the
+     Licensor. For purposes of this Public License, where the Licensed
+     Material is a musical work, performance, or sound recording,
+     Adapted Material is always produced where the Licensed Material is
+     synched in timed relation with a moving image.
+
+  b. Adapter's License means the license You apply to Your Copyright
+     and Similar Rights in Your contributions to Adapted Material in
+     accordance with the terms and conditions of this Public License.
+
+  c. BY-NC-SA Compatible License means a license listed at
+     creativecommons.org/compatiblelicenses, approved by Creative
+     Commons as essentially the equivalent of this Public License.
+
+  d. Copyright and Similar Rights means copyright and/or similar rights
+     closely related to copyright including, without limitation,
+     performance, broadcast, sound recording, and Sui Generis Database
+     Rights, without regard to how the rights are labeled or
+     categorized. For purposes of this Public License, the rights
+     specified in Section 2(b)(1)-(2) are not Copyright and Similar
+     Rights.
+
+  e. Effective Technological Measures means those measures that, in the
+     absence of proper authority, may not be circumvented under laws
+     fulfilling obligations under Article 11 of the WIPO Copyright
+     Treaty adopted on December 20, 1996, and/or similar international
+     agreements.
+
+  f. Exceptions and Limitations means fair use, fair dealing, and/or
+     any other exception or limitation to Copyright and Similar Rights
+     that applies to Your use of the Licensed Material.
+
+  g. License Elements means the license attributes listed in the name
+     of a Creative Commons Public License. The License Elements of this
+     Public License are Attribution, NonCommercial, and ShareAlike.
+
+  h. Licensed Material means the artistic or literary work, database,
+     or other material to which the Licensor applied this Public
+     License.
+
+  i. Licensed Rights means the rights granted to You subject to the
+     terms and conditions of this Public License, which are limited to
+     all Copyright and Similar Rights that apply to Your use of the
+     Licensed Material and that the Licensor has authority to license.
+
+  j. Licensor means the individual(s) or entity(ies) granting rights
+     under this Public License.
+
+  k. NonCommercial means not primarily intended for or directed towards
+     commercial advantage or monetary compensation. For purposes of
+     this Public License, the exchange of the Licensed Material for
+     other material subject to Copyright and Similar Rights by digital
+     file-sharing or similar means is NonCommercial provided there is
+     no payment of monetary compensation in connection with the
+     exchange.
+
+  l. Share means to provide material to the public by any means or
+     process that requires permission under the Licensed Rights, such
+     as reproduction, public display, public performance, distribution,
+     dissemination, communication, or importation, and to make material
+     available to the public including in ways that members of the
+     public may access the material from a place and at a time
+     individually chosen by them.
+
+  m. Sui Generis Database Rights means rights other than copyright
+     resulting from Directive 96/9/EC of the European Parliament and of
+     the Council of 11 March 1996 on the legal protection of databases,
+     as amended and/or succeeded, as well as other essentially
+     equivalent rights anywhere in the world.
+
+  n. You means the individual or entity exercising the Licensed Rights
+     under this Public License. Your has a corresponding meaning.
+
+
+Section 2 -- Scope.
+
+  a. License grant.
+
+       1. Subject to the terms and conditions of this Public License,
+          the Licensor hereby grants You a worldwide, royalty-free,
+          non-sublicensable, non-exclusive, irrevocable license to
+          exercise the Licensed Rights in the Licensed Material to:
+
+            a. reproduce and Share the Licensed Material, in whole or
+               in part, for NonCommercial purposes only; and
+
+            b. produce, reproduce, and Share Adapted Material for
+               NonCommercial purposes only.
+
+       2. Exceptions and Limitations. For the avoidance of doubt, where
+          Exceptions and Limitations apply to Your use, this Public
+          License does not apply, and You do not need to comply with
+          its terms and conditions.
+
+       3. Term. The term of this Public License is specified in Section
+          6(a).
+
+       4. Media and formats; technical modifications allowed. The
+          Licensor authorizes You to exercise the Licensed Rights in
+          all media and formats whether now known or hereafter created,
+          and to make technical modifications necessary to do so. The
+          Licensor waives and/or agrees not to assert any right or
+          authority to forbid You from making technical modifications
+          necessary to exercise the Licensed Rights, including
+          technical modifications necessary to circumvent Effective
+          Technological Measures. For purposes of this Public License,
+          simply making modifications authorized by this Section 2(a)
+          (4) never produces Adapted Material.
+
+       5. Downstream recipients.
+
+            a. Offer from the Licensor -- Licensed Material. Every
+               recipient of the Licensed Material automatically
+               receives an offer from the Licensor to exercise the
+               Licensed Rights under the terms and conditions of this
+               Public License.
+
+            b. Additional offer from the Licensor -- Adapted Material.
+               Every recipient of Adapted Material from You
+               automatically receives an offer from the Licensor to
+               exercise the Licensed Rights in the Adapted Material
+               under the conditions of the Adapter's License You apply.
+
+            c. No downstream restrictions. You may not offer or impose
+               any additional or different terms or conditions on, or
+               apply any Effective Technological Measures to, the
+               Licensed Material if doing so restricts exercise of the
+               Licensed Rights by any recipient of the Licensed
+               Material.
+
+       6. No endorsement. Nothing in this Public License constitutes or
+          may be construed as permission to assert or imply that You
+          are, or that Your use of the Licensed Material is, connected
+          with, or sponsored, endorsed, or granted official status by,
+          the Licensor or others designated to receive attribution as
+          provided in Section 3(a)(1)(A)(i).
+
+  b. Other rights.
+
+       1. Moral rights, such as the right of integrity, are not
+          licensed under this Public License, nor are publicity,
+          privacy, and/or other similar personality rights; however, to
+          the extent possible, the Licensor waives and/or agrees not to
+          assert any such rights held by the Licensor to the limited
+          extent necessary to allow You to exercise the Licensed
+          Rights, but not otherwise.
+
+       2. Patent and trademark rights are not licensed under this
+          Public License.
+
+       3. To the extent possible, the Licensor waives any right to
+          collect royalties from You for the exercise of the Licensed
+          Rights, whether directly or through a collecting society
+          under any voluntary or waivable statutory or compulsory
+          licensing scheme. In all other cases the Licensor expressly
+          reserves any right to collect such royalties, including when
+          the Licensed Material is used other than for NonCommercial
+          purposes.
+
+
+Section 3 -- License Conditions.
+
+Your exercise of the Licensed Rights is expressly made subject to the
+following conditions.
+
+  a. Attribution.
+
+       1. If You Share the Licensed Material (including in modified
+          form), You must:
+
+            a. retain the following if it is supplied by the Licensor
+               with the Licensed Material:
+
+                 i. identification of the creator(s) of the Licensed
+                    Material and any others designated to receive
+                    attribution, in any reasonable manner requested by
+                    the Licensor (including by pseudonym if
+                    designated);
+
+                ii. a copyright notice;
+
+               iii. a notice that refers to this Public License;
+
+                iv. a notice that refers to the disclaimer of
+                    warranties;
+
+                 v. a URI or hyperlink to the Licensed Material to the
+                    extent reasonably practicable;
+
+            b. indicate if You modified the Licensed Material and
+               retain an indication of any previous modifications; and
+
+            c. indicate the Licensed Material is licensed under this
+               Public License, and include the text of, or the URI or
+               hyperlink to, this Public License.
+
+       2. You may satisfy the conditions in Section 3(a)(1) in any
+          reasonable manner based on the medium, means, and context in
+          which You Share the Licensed Material. For example, it may be
+          reasonable to satisfy the conditions by providing a URI or
+          hyperlink to a resource that includes the required
+          information.
+       3. If requested by the Licensor, You must remove any of the
+          information required by Section 3(a)(1)(A) to the extent
+          reasonably practicable.
+
+  b. ShareAlike.
+
+     In addition to the conditions in Section 3(a), if You Share
+     Adapted Material You produce, the following conditions also apply.
+
+       1. The Adapter's License You apply must be a Creative Commons
+          license with the same License Elements, this version or
+          later, or a BY-NC-SA Compatible License.
+
+       2. You must include the text of, or the URI or hyperlink to, the
+          Adapter's License You apply. You may satisfy this condition
+          in any reasonable manner based on the medium, means, and
+          context in which You Share Adapted Material.
+
+       3. You may not offer or impose any additional or different terms
+          or conditions on, or apply any Effective Technological
+          Measures to, Adapted Material that restrict exercise of the
+          rights granted under the Adapter's License You apply.
+
+
+Section 4 -- Sui Generis Database Rights.
+
+Where the Licensed Rights include Sui Generis Database Rights that
+apply to Your use of the Licensed Material:
+
+  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
+     to extract, reuse, reproduce, and Share all or a substantial
+     portion of the contents of the database for NonCommercial purposes
+     only;
+
+  b. if You include all or a substantial portion of the database
+     contents in a database in which You have Sui Generis Database
+     Rights, then the database in which You have Sui Generis Database
+     Rights (but not its individual contents) is Adapted Material,
+     including for purposes of Section 3(b); and
+
+  c. You must comply with the conditions in Section 3(a) if You Share
+     all or a substantial portion of the contents of the database.
+
+For the avoidance of doubt, this Section 4 supplements and does not
+replace Your obligations under this Public License where the Licensed
+Rights include other Copyright and Similar Rights.
+
+
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
+
+  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
+     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
+     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
+     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
+     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
+     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
+     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
+     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
+     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
+     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
+
+  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
+     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
+     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
+     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
+     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
+     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
+     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
+     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
+     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
+
+  c. The disclaimer of warranties and limitation of liability provided
+     above shall be interpreted in a manner that, to the extent
+     possible, most closely approximates an absolute disclaimer and
+     waiver of all liability.
+
+
+Section 6 -- Term and Termination.
+
+  a. This Public License applies for the term of the Copyright and
+     Similar Rights licensed here. However, if You fail to comply with
+     this Public License, then Your rights under this Public License
+     terminate automatically.
+
+  b. Where Your right to use the Licensed Material has terminated under
+     Section 6(a), it reinstates:
+
+       1. automatically as of the date the violation is cured, provided
+          it is cured within 30 days of Your discovery of the
+          violation; or
+
+       2. upon express reinstatement by the Licensor.
+
+     For the avoidance of doubt, this Section 6(b) does not affect any
+     right the Licensor may have to seek remedies for Your violations
+     of this Public License.
+
+  c. For the avoidance of doubt, the Licensor may also offer the
+     Licensed Material under separate terms or conditions or stop
+     distributing the Licensed Material at any time; however, doing so
+     will not terminate this Public License.
+
+  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+     License.
+
+
+Section 7 -- Other Terms and Conditions.
+
+  a. The Licensor shall not be bound by any additional or different
+     terms or conditions communicated by You unless expressly agreed.
+
+  b. Any arrangements, understandings, or agreements regarding the
+     Licensed Material not stated herein are separate from and
+     independent of the terms and conditions of this Public License.
+
+
+Section 8 -- Interpretation.
+
+  a. For the avoidance of doubt, this Public License does not, and
+     shall not be interpreted to, reduce, limit, restrict, or impose
+     conditions on any use of the Licensed Material that could lawfully
+     be made without permission under this Public License.
+
+  b. To the extent possible, if any provision of this Public License is
+     deemed unenforceable, it shall be automatically reformed to the
+     minimum extent necessary to make it enforceable. If the provision
+     cannot be reformed, it shall be severed from this Public License
+     without affecting the enforceability of the remaining terms and
+     conditions.
+
+  c. No term or condition of this Public License will be waived and no
+     failure to comply consented to unless expressly agreed to by the
+     Licensor.
+
+  d. Nothing in this Public License constitutes or may be interpreted
+     as a limitation upon, or waiver of, any privileges and immunities
+     that apply to the Licensor or You, including from the legal
+     processes of any jurisdiction or authority.
+
+=======================================================================
+
+Creative Commons is not a party to its public
+licenses. Notwithstanding, Creative Commons may elect to apply one of
+its public licenses to material it publishes and in those instances
+will be considered the “Licensor.” The text of the Creative Commons
+public licenses is dedicated to the public domain under the CC0 Public
+Domain Dedication. Except for the limited purpose of indicating that
+material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the
+public licenses.
+
+Creative Commons may be contacted at creativecommons.org.
@@ -0,0 +1,5 @@
+## A2C
+
+
+
+https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: JiangJi
+Email: johnjim0816@gmail.com
+Date: 2021-05-03 22:16:08
+LastEditor: JiangJi
+LastEditTime: 2021-05-03 22:23:48
+Description: 
+Environment: 
+'''
+import torch.optim as optim
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Categorical
+
+class ActorCritic(nn.Module):
+    ''' Actor-critic network for A2C: a policy head (actor) and a value head
+    (critic), each its own two-layer MLP over the same input.
+    '''
+    def __init__(self, input_dim, output_dim, hidden_dim):
+        super().__init__()
+        # Value head: input -> hidden -> one scalar per sample.
+        value_layers = [
+            nn.Linear(input_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, 1),
+        ]
+        self.critic = nn.Sequential(*value_layers)
+
+        # Policy head: input -> hidden -> probabilities over output_dim actions.
+        # Softmax normalizes over dim 1, so x is expected to be batched as
+        # (batch, input_dim).
+        policy_layers = [
+            nn.Linear(input_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, output_dim),
+            nn.Softmax(dim=1),
+        ]
+        self.actor = nn.Sequential(*policy_layers)
+
+    def forward(self, x):
+        ''' Return (Categorical distribution over actions, critic output) for x.
+        '''
+        state_value = self.critic(x)
+        action_probs = self.actor(x)
+        return Categorical(action_probs), state_value
+class A2C:
+    ''' A2C agent: owns the actor-critic model, its optimizer and the
+    discounted-return computation.
+    '''
+    def __init__(self,state_dim,action_dim,cfg) -> None:
+        ''' Build the model and optimizer from a config object.
+
+        Args:
+            state_dim: size of the state vector fed to the network.
+            action_dim: number of discrete actions.
+            cfg: config providing `gamma`, `device`, the hidden width
+                (`hidden_dim`, or the legacy name `hidden_size`) and
+                optionally `lr`.
+        '''
+        self.gamma = cfg.gamma
+        self.device = cfg.device
+        # The training config names the width `hidden_dim`; older configs used
+        # `hidden_size`, so accept either instead of failing on the missing one.
+        hidden_dim = getattr(cfg, 'hidden_dim', None)
+        if hidden_dim is None:
+            hidden_dim = cfg.hidden_size
+        self.model = ActorCritic(state_dim, action_dim, hidden_dim).to(self.device)
+        # Honor the configured learning rate; 1e-3 is Adam's own default.
+        self.optimizer = optim.Adam(self.model.parameters(), lr=getattr(cfg, 'lr', 1e-3))
+
+    def compute_returns(self,next_value, rewards, masks):
+        ''' Compute discounted returns for a rollout, bootstrapped from next_value.
+
+        Args:
+            next_value: value estimate for the state after the last step.
+            rewards: per-step rewards, oldest first.
+            masks: per-step continuation masks; a 0 stops the bootstrap at
+                that step.
+
+        Returns:
+            List of returns, one per step, in the same order as `rewards`.
+        '''
+        R = next_value
+        returns = []
+        # Walk backwards, then flip once: avoids the quadratic insert(0, ...).
+        for step in reversed(range(len(rewards))):
+            R = rewards[step] + self.gamma * R * masks[step]
+            returns.append(R)
+        returns.reverse()
+        return returns
@@ -0,0 +1,138 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import numpy as np
+import torch
+import torch.optim as optim
+import datetime
+from common.multiprocessing_env import SubprocVecEnv
+from A2C.agent import ActorCritic
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
+algo_name = 'A2C'  # 算法名称
+env_name = 'CartPole-v0'  # 环境名称
+
+class A2CConfig:
+    ''' Hyperparameters and run settings for A2C training.
+    '''
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.n_envs = 8  # number of environments stepped side by side
+        self.gamma = 0.99  # discount factor
+        self.hidden_dim = 256  # hidden layer width of the network
+        self.lr = 1e-3  # learning rate
+        self.max_frames = 30000  # stop training once this many steps were taken
+        self.n_steps = 5  # rollout length between updates
+        # Prefer the GPU when one is available
+        use_cuda = torch.cuda.is_available()
+        self.device = torch.device("cuda" if use_cuda else "cpu")
+class PlotConfig:
+    ''' Settings for saving and plotting results: names, device and output paths.
+    '''
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        # Detect the GPU
+        device_name = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = torch.device(device_name)
+        # Every run writes under <this dir>/outputs/<env name>/<timestamp>/
+        run_dir = f"{curr_path}/outputs/{self.env_name}/{curr_time}"
+        self.result_path = f"{run_dir}/results/"  # where results are saved
+        self.model_path = f"{run_dir}/models/"  # where models are saved
+        self.save = True  # whether to save the figures
+        
+
+def make_envs(env_name, seed=2):
+    ''' Return a zero-argument factory that builds one seeded gym environment.
+
+    Args:
+        env_name: id passed to `gym.make`.
+        seed: seed applied to the new environment; defaults to 2, the value
+            that used to be hard-coded. Pass None to skip seeding.
+
+    Returns:
+        A callable that creates the environment each time it is invoked.
+    '''
+    def _thunk():
+        env = gym.make(env_name)
+        if seed is not None:
+            env.seed(seed)
+        return env
+    return _thunk
+def test_env(env,model,vis=False):
+    ''' Run one episode with actions sampled from the model's policy.
+
+    Args:
+        env: a single environment exposing reset(), step(action) and render().
+        model: callable returning (action distribution, value) for a batched state.
+        vis: render the environment after reset and after every step.
+
+    Returns:
+        The sum of the rewards collected over the episode.
+    '''
+    # Take the device from the model itself rather than from a module-level
+    # `cfg`, which only exists when this file is executed as a script.
+    first_param = next(model.parameters(), None)
+    device = first_param.device if first_param is not None else torch.device("cpu")
+    state = env.reset()
+    if vis:
+        env.render()
+    done = False
+    total_reward = 0
+    while not done:
+        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
+        with torch.no_grad():  # evaluation only, no gradients needed
+            dist, _ = model(state_tensor)
+        action = dist.sample().cpu().numpy()[0]
+        state, reward, done, _ = env.step(action)
+        if vis:
+            env.render()
+        total_reward += reward
+    return total_reward
+def compute_returns(next_value, rewards, masks, gamma=0.99):
+    ''' Discounted returns for a rollout, bootstrapped from next_value.
+
+    rewards and masks are indexed per step, oldest first; a mask of 0 cuts
+    the bootstrap at that step. The result has one return per step, in the
+    same order as rewards.
+    '''
+    running = next_value
+    newest_first = []
+    for idx in range(len(rewards) - 1, -1, -1):
+        running = rewards[idx] + gamma * running * masks[idx]
+        newest_first.append(running)
+    return newest_first[::-1]
+
+
+def train(cfg,envs):
+    ''' Train an actor-critic model with A2C on a vectorized environment.
+
+    Args:
+        cfg: config object; reads env_name, algo_name (or the legacy `algo`),
+            device, hidden_dim, lr, gamma, max_frames and n_steps.
+        envs: vectorized environment whose reset()/step() work on a batch of
+            sub-environments and which exposes observation_space/action_space.
+
+    Returns:
+        (test_rewards, test_ma_rewards): the evaluation reward recorded every
+        100 steps and its exponential moving average.
+    '''
+    print('开始训练!')
+    # The config class defines `algo_name`; fall back to `algo` for configs
+    # that still use the old attribute name.
+    algo = cfg.algo_name if hasattr(cfg, 'algo_name') else cfg.algo
+    print(f'环境：{cfg.env_name}, 算法：{algo}, 设备：{cfg.device}')
+    env = gym.make(cfg.env_name) # a single env, used only for evaluation
+    env.seed(10)
+    state_dim  = envs.observation_space.shape[0]
+    action_dim = envs.action_space.n
+    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+    # Use the configured learning rate (1e-3 is Adam's default when unset).
+    optimizer = optim.Adam(model.parameters(), lr=getattr(cfg, 'lr', 1e-3))
+    frame_idx    = 0
+    test_rewards = []
+    test_ma_rewards = []
+    state = envs.reset()
+    while frame_idx < cfg.max_frames:
+        log_probs = []
+        values    = []
+        rewards   = []
+        masks     = []
+        entropy = 0
+        # Roll out n_steps transitions from every sub-environment.
+        for _ in range(cfg.n_steps):
+            state = torch.FloatTensor(state).to(cfg.device)
+            dist, value = model(state)
+            action = dist.sample()
+            next_state, reward, done, _ = envs.step(action.cpu().numpy())
+            entropy += dist.entropy().mean()
+            # log_prob has one entry per sub-environment while value, reward
+            # and mask carry a trailing axis of size 1. Add the same axis here
+            # so the actor loss multiplies element-wise instead of broadcasting
+            # (N,) against (N, 1) into an (N, N) matrix.
+            log_probs.append(dist.log_prob(action).unsqueeze(1))
+            values.append(value)
+            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
+            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device))
+            state = next_state
+            frame_idx += 1
+            if frame_idx % 100 == 0:
+                # Periodic evaluation: mean reward over 10 episodes.
+                test_reward = np.mean([test_env(env,model) for _ in range(10)])
+                print(f"frame_idx:{frame_idx}, test_reward:{test_reward}")
+                test_rewards.append(test_reward)
+                if test_ma_rewards:
+                    test_ma_rewards.append(0.9*test_ma_rewards[-1]+0.1*test_reward)
+                else:
+                    test_ma_rewards.append(test_reward)
+        # Bootstrap the returns from the value of the state after the rollout.
+        next_state = torch.FloatTensor(next_state).to(cfg.device)
+        _, next_value = model(next_state)
+        returns = compute_returns(next_value, rewards, masks, gamma=cfg.gamma)
+        log_probs = torch.cat(log_probs)
+        returns   = torch.cat(returns).detach()
+        values    = torch.cat(values)
+        advantage = returns - values
+        # The advantage is detached for the actor so only the critic loss
+        # trains the value head.
+        actor_loss  = -(log_probs * advantage.detach()).mean()
+        critic_loss = advantage.pow(2).mean()
+        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    print('完成训练！')
+    return test_rewards, test_ma_rewards
+if __name__ == "__main__":
+    cfg = A2CConfig()
+    plot_cfg = PlotConfig()
+    # One environment factory per sub-environment, wrapped into a vectorized env
+    envs = SubprocVecEnv([make_envs(cfg.env_name) for _ in range(cfg.n_envs)])
+    # Train
+    rewards, ma_rewards = train(cfg, envs)
+    # Create the output directories, then save and plot the results
+    make_dir(plot_cfg.result_path, plot_cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
+    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
@@ -0,0 +1,7 @@
+# DDPG
+
+#TODO
+
+## 伪代码
+
+![image-20210320151900695](assets/image-20210320151900695.png)
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-09 20:25:52
+@LastEditor: John
+LastEditTime: 2021-09-16 00:55:30
+@Description: 
+@Environment: python 3.7.7
+'''
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+class ReplayBuffer:
+    ''' Fixed-capacity store of transitions that overwrites the oldest entry
+    once it is full.
+    '''
+    def __init__(self, capacity):
+        self.capacity = capacity # maximum number of stored transitions
+        self.buffer = [] # the stored transitions
+        self.position = 0 # index the next transition is written to
+
+    def push(self, state, action, reward, next_state, done):
+        ''' Store one transition; when the buffer is full, the slot holding
+        the oldest transition is overwritten.
+        '''
+        transition = (state, action, reward, next_state, done)
+        if len(self.buffer) < self.capacity:
+            # Still growing: the write position is the end of the list
+            self.buffer.append(transition)
+        else:
+            self.buffer[self.position] = transition
+        self.position = (self.position + 1) % self.capacity
+
+    def sample(self, batch_size):
+        ''' Draw batch_size distinct transitions at random and return them
+        regrouped as (states, actions, rewards, next_states, dones).
+        '''
+        picked = random.sample(self.buffer, batch_size)
+        states, actions, rewards, next_states, dones = zip(*picked)
+        return states, actions, rewards, next_states, dones
+
+    def __len__(self):
+        ''' Number of transitions currently stored.
+        '''
+        return len(self.buffer)
+class Actor(nn.Module):
+    ''' Deterministic policy network: three linear layers with ReLU between
+    them and a tanh on the output, so every action component lies in [-1, 1].
+    '''
+    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+        super().__init__()
+        self.linear1 = nn.Linear(state_dim, hidden_dim)
+        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
+        self.linear3 = nn.Linear(hidden_dim, action_dim)
+        # Re-draw the output layer (weight, then bias) from a small uniform range
+        for tensor in (self.linear3.weight, self.linear3.bias):
+            tensor.data.uniform_(-init_w, init_w)
+
+    def forward(self, x):
+        ''' Return the action for state batch x.
+        '''
+        hidden = F.relu(self.linear1(x))
+        hidden = F.relu(self.linear2(hidden))
+        return torch.tanh(self.linear3(hidden))
+class Critic(nn.Module):
+    ''' Q-network: scores a (state, action) pair with a single scalar.
+    '''
+    def __init__(self, state_dim, action_dim, hidden_dim, init_w=3e-3):
+        super().__init__()
+        joint_dim = state_dim + action_dim  # state and action are concatenated
+        self.linear1 = nn.Linear(joint_dim, hidden_dim)
+        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
+        self.linear3 = nn.Linear(hidden_dim, 1)
+        # Start the output layer (weight, then bias) at small random values
+        for tensor in (self.linear3.weight, self.linear3.bias):
+            tensor.data.uniform_(-init_w, init_w)
+
+    def forward(self, state, action):
+        ''' Return the value for each (state, action) row.
+        '''
+        # Join state and action along dimension 1
+        joint = torch.cat((state, action), dim=1)
+        hidden = F.relu(self.linear1(joint))
+        hidden = F.relu(self.linear2(hidden))
+        return self.linear3(hidden)
+class DDPG:
+    ''' DDPG agent: actor and critic networks, their target copies, a replay
+    buffer and the update rule.
+    '''
+    def __init__(self, state_dim, action_dim, cfg):
+        ''' Build networks, optimizers and replay memory from a config.
+
+        Args:
+            state_dim: size of the state vector.
+            action_dim: size of the action vector.
+            cfg: config providing device, hidden_dim, critic_lr, actor_lr,
+                memory_capacity, batch_size, soft_tau and gamma.
+        '''
+        self.device = cfg.device
+        self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+        self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
+
+        # Start the target networks as exact copies of the online networks
+        self.target_critic.load_state_dict(self.critic.state_dict())
+        self.target_actor.load_state_dict(self.actor.state_dict())
+
+        self.critic_optimizer = optim.Adam(
+            self.critic.parameters(),  lr=cfg.critic_lr)
+        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
+        self.memory = ReplayBuffer(cfg.memory_capacity)
+        self.batch_size = cfg.batch_size
+        self.soft_tau = cfg.soft_tau # interpolation factor of the soft update
+        self.gamma = cfg.gamma
+
+    def choose_action(self, state):
+        ''' Return the actor's action for a single state.
+
+        NOTE(review): only the first action component is returned (index
+        [0, 0]); kept for compatibility, but it drops the remaining components
+        when action_dim > 1 -- confirm against callers before changing.
+        '''
+        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
+        with torch.no_grad():  # inference only
+            action = self.actor(state)
+        return action.cpu().numpy()[0, 0]
+
+    def _to_tensor(self, batch):
+        ''' Stack a sampled field into one float32 tensor on the agent device. '''
+        # Going through a single ndarray avoids the slow element-by-element
+        # conversion of a tuple of arrays.
+        return torch.as_tensor(np.asarray(batch, dtype=np.float32), device=self.device)
+
+    def _soft_update(self, target_net, source_net):
+        ''' Move target_net a fraction soft_tau of the way toward source_net. '''
+        with torch.no_grad():
+            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
+                target_param.copy_(
+                    target_param * (1.0 - self.soft_tau) +
+                    param * self.soft_tau
+                )
+
+    def update(self):
+        ''' Run one actor and one critic gradient step on a sampled batch, then
+        soft-update both target networks. Does nothing until the memory holds
+        at least one full batch.
+        '''
+        if len(self.memory) < self.batch_size:
+            return
+        # Sample a batch of transitions from the replay memory
+        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
+        state = self._to_tensor(state)
+        next_state = self._to_tensor(next_state)
+        action = self._to_tensor(action)
+        # reward/done get a trailing axis to line up with the critic output
+        reward = self._to_tensor(reward).unsqueeze(1)
+        done = self._to_tensor(done).unsqueeze(1)
+
+        # Actor objective: raise the critic's score of the actor's own actions
+        policy_loss = -self.critic(state, self.actor(state)).mean()
+
+        # TD target from the target networks; no gradient flows through it
+        with torch.no_grad():
+            next_action = self.target_actor(next_state)
+            target_value = self.target_critic(next_state, next_action)
+            expected_value = reward + (1.0 - done) * self.gamma * target_value
+
+        value = self.critic(state, action)
+        value_loss = F.mse_loss(value, expected_value)
+
+        self.actor_optimizer.zero_grad()
+        policy_loss.backward()
+        self.actor_optimizer.step()
+        # policy_loss.backward() also left gradients on the critic; they are
+        # cleared here before the critic's own backward pass.
+        self.critic_optimizer.zero_grad()
+        value_loss.backward()
+        self.critic_optimizer.step()
+        # Soft update of the target networks
+        self._soft_update(self.target_critic, self.critic)
+        self._soft_update(self.target_actor, self.actor)
+
+    def save(self,path):
+        ''' Write the actor weights to path + 'checkpoint.pt'. '''
+        torch.save(self.actor.state_dict(), path+'checkpoint.pt')
+
+    def load(self,path):
+        ''' Load actor weights from path + 'checkpoint.pt' onto this agent's device. '''
+        # map_location lets a checkpoint saved on a GPU load on a CPU-only host
+        state_dict = torch.load(path+'checkpoint.pt', map_location=self.device)
+        self.actor.load_state_dict(state_dict)
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-10 15:28:30
+@LastEditor: John
+LastEditTime: 2021-09-16 00:52:30
+@Discription: 
+@Environment: python 3.7.7
+'''
+import gym
+import numpy as np
+
+class NormalizedActions(gym.ActionWrapper):
+    ''' 将action范围重定在[0.1]之间
+    '''
+    def action(self, action):
+        low_bound   = self.action_space.low
+        upper_bound = self.action_space.high
+        action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
+        action = np.clip(action, low_bound, upper_bound)
+        return action
+
+    def reverse_action(self, action):
+        low_bound   = self.action_space.low
+        upper_bound = self.action_space.high
+        action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
+        action = np.clip(action, low_bound, upper_bound)
+        return action
+
+class OUNoise(object):
+    '''Ornstein–Uhlenbeck噪声
+    '''
+    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
+        self.mu           = mu # OU噪声的参数
+        self.theta        = theta # OU噪声的参数
+        self.sigma        = max_sigma # OU噪声的参数
+        self.max_sigma    = max_sigma
+        self.min_sigma    = min_sigma
+        self.decay_period = decay_period
+        self.action_dim   = action_space.shape[0]
+        self.low          = action_space.low
+        self.high         = action_space.high
+        self.reset()
+    def reset(self):
+        self.obs = np.ones(self.action_dim) * self.mu
+    def evolve_obs(self):
+        x  = self.obs
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        self.obs = x + dx
+        return self.obs
+    def get_action(self, action, t=0):
+        ou_obs = self.evolve_obs()
+        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) # sigma会逐渐衰减
+        return np.clip(action + ou_obs, self.low, self.high) # 动作加上噪声后进行剪切
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-11 20:58:21
+@LastEditor: John
+LastEditTime: 2021-09-16 01:31:33
+@Discription: 
+@Environment: python 3.7.7
+'''
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path) # 父路径
+sys.path.append(parent_path) # 添加路径到系统路径sys.path
+
+import datetime
+import gym
+import torch
+
+from DDPG.env import NormalizedActions
+from DDPG.agent import DDPG
+from DDPG.train import train,test
+from common.utils import save_results,make_dir
+from common.utils import plot_rewards
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # current timestamp, used in the output folder paths
+algo_name = 'DDPG'  # algorithm name
+env_name = 'Pendulum-v1'  # environment name; newer gym releases (roughly 0.21.0 and later) renamed Pendulum-v0 to Pendulum-v1
+
+class DDPGConfig:
+    def __init__(self):
+        self.algo_name = algo_name # 算法名称
+        self.env_name = env_name # 环境名称
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
+        self.train_eps = 300 # 训练的回合数
+        self.test_eps = 50 # 测试的回合数
+        self.gamma = 0.99 # 折扣因子
+        self.critic_lr = 1e-3 # 评论家网络的学习率
+        self.actor_lr = 1e-4 # 演员网络的学习率
+        self.memory_capacity = 8000 # 经验回放的容量
+        self.batch_size = 128 # mini-batch SGD中的批量大小
+        self.target_update = 2 # 目标网络的更新频率
+        self.hidden_dim = 256 # 网络隐藏层维度
+        self.soft_tau = 1e-2 # 软更新参数
+        
+class PlotConfig:
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # 算法名称
+        self.env_name = env_name # 环境名称
+        self.result_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/results/'  # 保存结果的路径
+        self.model_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/models/'  # 保存模型的路径
+        self.save = True # 是否保存图片
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+
+def env_agent_config(cfg,seed=1):
+    env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
+    env.seed(seed) # 随机种子
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.shape[0]
+    agent = DDPG(state_dim,action_dim,cfg)
+    return env,agent
+
+cfg = DDPGConfig()
+plot_cfg = PlotConfig()
+# 训练
+env,agent = env_agent_config(cfg,seed=1)
+rewards, ma_rewards = train(cfg, env, agent)
+make_dir(plot_cfg.result_path, plot_cfg.model_path)
+agent.save(path=plot_cfg.model_path)
+save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
+plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
+# 测试
+env,agent = env_agent_config(cfg,seed=10)
+agent.load(path=plot_cfg.model_path)
+rewards,ma_rewards = test(plot_cfg,env,agent)
+save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
+plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+
@@ -0,0 +1,64 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+from DDPG.env import OUNoise
+
+def train(cfg, env, agent):
+    print('开始训练！')
+    print(f'环境：{cfg.env_name}，算法：{cfg.algo}，设备：{cfg.device}')
+    ou_noise = OUNoise(env.action_space)  # 动作噪声
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        ou_noise.reset()
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            action = ou_noise.get_action(action, i_step) 
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            agent.update()
+            state = next_state
+        if (i_ep+1)%10 == 0:
+            print('回合：{}/{}，奖励：{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练！')
+    return rewards, ma_rewards
+
+def test(cfg, env, agent):
+    print('开始测试！')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        state = env.reset() 
+        done = False
+        ep_reward = 0
+        i_step = 0
+        while not done:
+            i_step += 1
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            state = next_state
+        print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards, ma_rewards
@@ -0,0 +1,218 @@
+# DQN
+
+## 原理简介
+
+DQN是Q-learning算法的优化和延伸，Q-learning中使用有限的Q表存储值的信息，而DQN中则用神经网络替代Q表存储信息，这样更适用于高维的情况，相关知识基础可参考[datawhale李宏毅笔记-Q学习](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6)。
+
+论文方面主要可以参考两篇，一篇是2013年谷歌DeepMind团队的[Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)，另一篇是他们团队后来在Nature杂志上发表的[Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf)。后者在算法层面增加了target q-net，也可以叫做Nature DQN。
+
+Nature DQN使用了两个Q网络，一个当前Q网络𝑄用来选择动作，更新模型参数，另一个目标Q网络𝑄′用于计算目标Q值。目标Q网络的网络参数不需要迭代更新，而是每隔一段时间从当前Q网络𝑄复制过来，即延时更新，这样可以减少目标Q值和当前的Q值相关性。
+
+要注意的是，两个Q网络的结构是一模一样的。这样才可以复制网络参数。Nature DQN和[Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)相比，除了用一个新的相同结构的目标Q网络来计算目标Q值以外，其余部分基本是完全相同的。细节也可参考[强化学习（九）Deep Q-Learning进阶之Nature DQN](https://www.cnblogs.com/pinard/p/9756075.html)。
+
+也可参考：<https://blog.csdn.net/JohnJim0/article/details/109557173>
+
+## 伪代码
+
+<img src="assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png" alt="img" style="zoom:50%;" />
+
+## 代码实现
+
+### RL接口
+
+首先是强化学习训练的基本接口，即通用的训练模式：
+```python
+for i_episode in range(MAX_EPISODES):
+    state = env.reset() # reset环境状态
+    for i_step in range(MAX_STEPS):
+        action = agent.choose_action(state) # 根据当前环境state选择action
+        next_state, reward, done, _ = env.step(action) # 更新环境参数
+        agent.memory.push(state, action, reward, next_state, done) # 将state等这些transition存入memory
+        agent.update() # 每步更新网络
+        state = next_state # 跳转到下一个状态
+        if done:
+            break
+```
+每个episode加一个MAX_STEPS，也可以使用while not done, 加这个max_steps有时是因为比如gym环境训练目标就是在200个step下达到200的reward，或者是当完成一个episode的步数较多时也可以设置，基本流程跟所有伪代码一致，如下：
+1. agent选择动作
+2. 环境根据agent的动作反馈出next_state和reward
+3. agent进行更新，如有memory就会将transition(包含state，reward，action等)存入memory中
+4. 跳转到下一个状态
+5. 如果done了，就跳出循环，进行下一个episode的训练。
+
+想要实现完整的算法还需要创建Qnet，Replaybuffer等类
+
+### 两个Q网络
+
+上文讲了Nature DQN中有两个Q网络，一个是policy_net，一个是延时更新的target_net，两个网络的结构是一模一样的，如下(见```model.py```)，注意DQN使用的Qnet就是全连接网络即FCN：
+```python
+import torch.nn as nn
+import torch.nn.functional as F
+
+class FCN(nn.Module):
+    def __init__(self, state_dim=4, action_dim=18):
+        """ 初始化q网络，为全连接网络
+            state_dim: 输入的feature即环境的state数目
+            action_dim: 输出的action总个数
+        """
+        super(FCN, self).__init__()
+        self.fc1 = nn.Linear(state_dim, 128) # 输入层
+        self.fc2 = nn.Linear(128, 128) # 隐藏层
+        self.fc3 = nn.Linear(128, action_dim) # 输出层
+        
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+```
+输入为state_dim，输出为action_dim，包含一个128维度的隐藏层，这里根据需要可增加隐藏层维度和数量，然后一般使用relu激活函数，这里跟深度学习的网络设置是一样的。
+
+### Replay Buffer
+
+然后就是Replay Memory了，其作用主要是克服经验数据的相关性（correlated data）和非平稳分布（non-stationary distribution）问题，实现如下(见```memory.py```)：
+
+```python
+import random
+import numpy as np
+
+class ReplayBuffer:
+    
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.buffer = []
+        self.position = 0
+    
+    def push(self, state, action, reward, next_state, done):
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size)
+        state, action, reward, next_state, done =  zip(*batch)
+        return state, action, reward, next_state, done
+    
+    def __len__(self):
+        return len(self.buffer)
+```
+
+参数capacity表示buffer的容量，主要包括push和sample两个步骤，push是将transitions放到memory中，sample是从memory随机抽取一些transition。
+
+### Agent类
+
+在```agent.py```中我们定义强化学习算法类，包括```choose_action```(选择动作，使用e-greedy策略时会多一个```predict```函数，下面会讲到)和```update```(更新)等函数。
+
+在类中建立两个网络，以及optimizer和memory，
+
+```python
+self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # copy params from policy net
+    target_param.data.copy_(param.data)
+self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
+self.memory = ReplayBuffer(cfg.memory_capacity)
+```
+然后是选择action：
+
+```python
+def choose_action(self, state):
+    '''选择动作
+    '''
+    self.frame_idx += 1
+    if random.random() > self.epsilon(self.frame_idx):
+        action = self.predict(state)
+    else:
+        action = random.randrange(self.action_dim)
+    return action
+```
+
+这里使用e-greedy策略，即设置一个参数epsilon，如果生成的随机数大于epsilon，就根据网络预测的选择action，否则还是随机选择action，这个epsilon是会逐渐减小的，可以使用线性或者指数减小的方式，但不会减小到零，这样在训练稳定时还能保持一定的探索，这部分可以学习探索与利用(exploration and exploitation)相关知识。
+
+上面讲到的预测函数其实就是根据state选取q值最大的action，如下：
+
+```python
+def predict(self,state):
+    with torch.no_grad():
+        state = torch.tensor([state], device=self.device, dtype=torch.float32)
+        q_values = self.policy_net(state)
+        action = q_values.max(1)[1].item()
+    return action
+```
+
+然后是更新函数了：
+
+```python
+def update(self):
+
+        if len(self.memory) < self.batch_size:
+            return
+        # 从memory中随机采样transition
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        '''转为张量
+        例如tensor([[-4.5543e-02, -2.3910e-01,  1.8344e-02,  2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02,  2.3400e-01]])'''
+        state_batch = torch.tensor(
+            state_batch, device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
+            1)  # 例如tensor([[1],...,[0]])
+        reward_batch = torch.tensor(
+            reward_batch, device=self.device, dtype=torch.float)  # tensor([1., 1.,...,1])
+        next_state_batch = torch.tensor(
+            next_state_batch, device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(
+            done_batch), device=self.device)
+
+        '''计算当前(s_t,a)对应的Q(s_t, a)'''
+        '''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])'''
+        q_values = self.policy_net(state_batch).gather(
+            dim=1, index=action_batch)  # 等价于self.forward
+        # 计算所有next states的V(s_{t+1})，即通过target_net中选取reward最大的对应states
+        next_q_values = self.target_net(next_state_batch).max(
+            1)[0].detach()  # 比如tensor([ 0.0060, -0.0171,...,])
+        # 计算 expected_q_value
+        # 对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+        expected_q_values = reward_batch + \
+            self.gamma * next_q_values * (1-done_batch)
+        # self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # 计算 均方误差loss
+        # 优化模型
+        self.optimizer.zero_grad()  # zero_grad清除上一步所有旧的gradients from the last step
+        # loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
+        loss.backward()
+        # for param in self.policy_net.parameters():  # clip防止梯度爆炸
+        #     param.grad.data.clamp_(-1, 1)
+        self.optimizer.step()  # 更新模型
+```
+
+更新遵循伪代码的以下部分：
+
+<img src="assets/image-20210507162813393.png" alt="image-20210507162813393" style="zoom:50%;" />
+
+首先从replay buffer中选取一个batch的数据，计算loss，然后进行minibatch SGD。
+
+然后是保存与加载模型的部分，如下：
+
+```python
+def save(self, path):
+    torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
+def load(self, path):
+    self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
+    for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+        param.data.copy_(target_param.data)
+```
+
+
+
+### 实验结果
+
+训练结果如下：
+
+<img src="assets/train_rewards_curve.png" alt="train_rewards_curve" style="zoom: 67%;" />
+
+<img src="assets/eval_rewards_curve.png" alt="eval_rewards_curve" style="zoom:67%;" />
+
+## 参考
+
+[with torch.no_grad()](https://www.jianshu.com/p/1cea017f5d11)
+
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-12 00:50:49
+@LastEditor: John
+LastEditTime: 2021-12-22 14:01:37
+@Discription: 
+@Environment: python 3.7.7
+'''
+'''off-policy
+'''
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import random
+import math
+import numpy as np
+
+class MLP(nn.Module):
+    def __init__(self, state_dim,action_dim,hidden_dim=128):
+        """ 初始化q网络，为全连接网络
+            state_dim: 输入的特征数即环境的状态维度
+            action_dim: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
+        
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+
+class ReplayBuffer:
+    def __init__(self, capacity):
+        self.capacity = capacity # 经验回放的容量
+        self.buffer = [] # 缓冲区
+        self.position = 0 
+    
+    def push(self, state, action, reward, next_state, done):
+        ''' 缓冲区是一个队列，容量超出时去掉开始存入的转移(transition)
+        '''
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity 
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
+        state, action, reward, next_state, done =  zip(*batch) # 解压成状态，动作等
+        return state, action, reward, next_state, done
+    
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
+
+class DQN:
+    def __init__(self, state_dim, action_dim, cfg):
+
+        self.action_dim = action_dim  # 总的动作个数
+        self.device = cfg.device  # 设备，cpu或gpu等
+        self.gamma = cfg.gamma  # 奖励的折扣因子
+        # e-greedy策略相关参数
+        self.frame_idx = 0  # 用于epsilon的衰减计数
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
+            (cfg.epsilon_start - cfg.epsilon_end) * \
+            math.exp(-1. * frame_idx / cfg.epsilon_decay)
+        self.batch_size = cfg.batch_size
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
+            target_param.data.copy_(param.data)
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
+        self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
+
+    def choose_action(self, state):
+        ''' 选择动作
+        '''
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                q_values = self.policy_net(state)
+                action = q_values.max(1)[1].item() # 选择Q值最大的动作
+        else:
+            action = random.randrange(self.action_dim)
+        return action
+    def update(self):
+        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
+            return
+        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        # 转为张量
+        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  
+        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  
+        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
+        next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
+        # 计算期望的Q值，对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+        expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # 计算均方根损失
+        # 优化更新模型
+        self.optimizer.zero_grad()  
+        loss.backward()
+        for param in self.policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step() 
+
+    def save(self, path):
+        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
+
+    def load(self, path):
+        self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
+        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+            param.data.copy_(target_param.data)
@@ -0,0 +1,133 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.autograd as autograd 
+import random
+import math
+class CNN(nn.Module):
+    def __init__(self, input_dim, output_dim):
+        super(CNN, self).__init__()
+        
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        
+        self.features = nn.Sequential(
+            nn.Conv2d(input_dim[0], 32, kernel_size=8, stride=4),
+            nn.ReLU(),
+            nn.Conv2d(32, 64, kernel_size=4, stride=2),
+            nn.ReLU(),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1),
+            nn.ReLU()
+        )
+        
+        self.fc = nn.Sequential(
+            nn.Linear(self.feature_size(), 512),
+            nn.ReLU(),
+            nn.Linear(512, self.output_dim)
+        )
+        
+    def forward(self, x):
+        x = self.features(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+        return x
+    
+    def feature_size(self):
+        return self.features(autograd.Variable(torch.zeros(1, *self.input_dim))).view(1, -1).size(1)
+
+
+    def act(self, state, epsilon):
+        if random.random() > epsilon:
+            state   = Variable(torch.FloatTensor(np.float32(state)).unsqueeze(0), volatile=True)
+            q_value = self.forward(state)
+            action  = q_value.max(1)[1].data[0]
+        else:
+            action = random.randrange(env.action_space.n)
+        return action
+
+class ReplayBuffer:
+    def __init__(self, capacity):
+        self.capacity = capacity # 经验回放的容量
+        self.buffer = [] # 缓冲区
+        self.position = 0 
+    
+    def push(self, state, action, reward, next_state, done):
+        ''' 缓冲区是一个队列，容量超出时去掉开始存入的转移(transition)
+        '''
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity 
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
+        state, action, reward, next_state, done =  zip(*batch) # 解压成状态，动作等
+        return state, action, reward, next_state, done
+    
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
+
+class DQN:
+    def __init__(self, state_dim, action_dim, cfg):
+
+        self.action_dim = action_dim  # 总的动作个数
+        self.device = cfg.device  # 设备，cpu或gpu等
+        self.gamma = cfg.gamma  # 奖励的折扣因子
+        # e-greedy策略相关参数
+        self.frame_idx = 0  # 用于epsilon的衰减计数
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + \
+            (cfg.epsilon_start - cfg.epsilon_end) * \
+            math.exp(-1. * frame_idx / cfg.epsilon_decay)
+        self.batch_size = cfg.batch_size
+        self.policy_net = CNN(state_dim, action_dim).to(self.device)
+        self.target_net = CNN(state_dim, action_dim).to(self.device)
+        for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net
+            target_param.data.copy_(param.data)
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器
+        self.memory = ReplayBuffer(cfg.memory_capacity) # 经验回放
+
+    def choose_action(self, state):
+        ''' 选择动作
+        '''
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                q_values = self.policy_net(state)
+                action = q_values.max(1)[1].item() # 选择Q值最大的动作
+        else:
+            action = random.randrange(self.action_dim)
+        return action
+    def update(self):
+        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
+            return
+        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        # 转为张量
+        state_batch = torch.tensor(state_batch, device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)  
+        reward_batch = torch.tensor(reward_batch, device=self.device, dtype=torch.float)  
+        next_state_batch = torch.tensor(next_state_batch, device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch), device=self.device)
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
+        next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
+        # 计算期望的Q值，对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+        expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # 计算均方根损失
+        # 优化更新模型
+        self.optimizer.zero_grad()  
+        loss.backward()
+        for param in self.policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step() 
+
+    def save(self, path):
+        torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth')
+
+    def load(self, path):
+        self.target_net.load_state_dict(torch.load(path+'dqn_checkpoint.pth'))
+        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+            param.data.copy_(target_param.data)
@@ -0,0 +1,148 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+import numpy as np
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+from DQN.dqn import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # current timestamp, used in the output folder paths
+
+
+class Config:
+    '''超参数
+    '''
+
+    def __init__(self):
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DQN'  # 算法名称
+        self.env_name = 'CartPole-v0'  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPUgjgjlkhfsf风刀霜的撒发十
+        self.seed = 10 # 随机种子，置0则不设置随机种子
+        self.train_eps = 200  # 训练的回合数
+        self.test_eps = 30  # 测试的回合数
+        ################################################################################
+        
+        ################################## 算法超参数 ###################################
+        self.gamma = 0.95  # 强化学习中的折扣因子
+        self.epsilon_start = 0.90  # e-greedy策略中初始epsilon
+        self.epsilon_end = 0.01  # e-greedy策略中的终止epsilon
+        self.epsilon_decay = 500  # e-greedy策略中epsilon的衰减率
+        self.lr = 0.0001  # 学习率
+        self.memory_capacity = 100000  # 经验回放的容量
+        self.batch_size = 64  # mini-batch SGD中的批量大小
+        self.target_update = 4  # 目标网络的更新频率
+        self.hidden_dim = 256  # 网络隐藏层
+        ################################################################################
+
+        ################################# 保存结果相关参数 ##############################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
+        self.save = True # 是否保存图片
+        ################################################################################
+
+
+def env_agent_config(cfg):
+    ''' 创建环境和智能体
+    '''
+    env = gym.make(cfg.env_name)  # 创建环境
+    state_dim = env.observation_space.shape[0]  # 状态维度
+    action_dim = env.action_space.n  # 动作维度
+    agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
+    if cfg.seed !=0: # 设置随机种子
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
+    return env, agent
+
+
+def train(cfg, env, agent):
+    ''' 训练
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # 记录一回合内的奖励
+        state = env.reset()  # 重置环境，返回初始状态
+        while True:
+            action = agent.choose_action(state)  # 选择动作
+            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
+            agent.memory.push(state, action, reward,
+                              next_state, done)  # 保存transition
+            state = next_state  # 更新下一个状态
+            agent.update()  # 更新智能体
+            ep_reward += reward  # 累加奖励
+            if done:
+                break
+        if (i_ep + 1) % cfg.target_update == 0:  # 智能体目标网络更新
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+        if (i_ep + 1) % 10 == 0:
+            print('回合：{}/{}, 奖励：{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
+    print('完成训练！')
+    env.close()
+    return rewards, ma_rewards
+
+
+def test(cfg, env, agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    ############# 由于测试不需要使用epsilon-greedy策略，所以相应的值设置为0 ###############
+    cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
+    cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
+    ################################################################################
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0  # 记录一回合内的奖励
+        state = env.reset()  # 重置环境，返回初始状态
+        while True:
+            action = agent.choose_action(state)  # 选择动作
+            next_state, reward, done, _ = env.step(action)  # 更新环境，返回transition
+            state = next_state  # 更新下一个状态
+            ep_reward += reward  # 累加奖励
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return rewards, ma_rewards
+
+
+if __name__ == "__main__":
+    cfg = Config()
+    # 训练
+    env, agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
+    agent.save(path=cfg.model_path)  # 保存模型
+    save_results(rewards, ma_rewards, tag='train',
+                 path=cfg.result_path)  # 保存结果
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    # 测试
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # 导入模型
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # 保存结果
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
@@ -0,0 +1,148 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: JiangJi
+Email: johnjim0816@gmail.com
+Date: 2021-12-22 11:14:17
+LastEditor: JiangJi
+LastEditTime: 2021-12-22 11:40:44
+Discription: 使用 Nature DQN 训练 CartPole-v1
+'''
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards, plot_rewards_cn
+from DQN.dqn import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp taken at import time, used in the output paths
+algo_name = "DQN"  # algorithm name
+env_name = 'CartPole-v1'  # environment name
+class DQNConfig:
+    ''' 算法相关参数设置
+    '''
+
+    def __init__(self):
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
+        self.train_eps = 200  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+        # hyperparameters
+        self.gamma = 0.95  # discount factor
+        self.epsilon_start = 0.90  # initial epsilon of the e-greedy policy
+        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_decay = 500  # decay rate of epsilon in the e-greedy policy
+        self.lr = 0.0001  # learning rate
+        self.memory_capacity = 100000  # capacity of the experience replay buffer
+        self.batch_size = 64  # batch size for mini-batch SGD
+        self.target_update = 4  # target network update interval (train() checks it once per episode)
+        self.hidden_dim = 256  # hidden layer setting of the network
+class PlotConfig:
+    ''' 绘图相关参数设置
+    '''
+
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # directory where results are saved
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # directory where models are saved
+        self.save = True  # whether to save the figures
+
+
+def env_agent_config(cfg, seed=1):
+    ''' 创建环境和智能体
+    '''
+    env = gym.make(cfg.env_name)  # create the environment
+    env.seed(seed)  # set the random seed of the environment
+    state_dim = env.observation_space.shape[0]  # state dimension: first entry of the observation shape
+    action_dim = env.action_space.n  # number of discrete actions
+    agent = DQN(state_dim, action_dim, cfg)  # create the agent
+    return env, agent
+
+def train(cfg, env, agent):
+    ''' 训练
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0 # reward accumulated within the current episode
+        state = env.reset() # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state) # pick an action
+            next_state, reward, done, _ = env.step(action) # step the environment and receive the transition
+            agent.memory.push(state, action, reward, next_state, done) # store the transition
+            state = next_state # move on to the next state
+            agent.update() # update the agent once per environment step
+            ep_reward += reward # accumulate the reward
+            if done:
+                break
+        if (i_ep+1) % cfg.target_update == 0: # every cfg.target_update episodes, copy the policy net into the target net
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) # 0.9 weight on history, 0.1 on the new episode
+        else:
+            ma_rewards.append(ep_reward) # first episode seeds the moving average
+        if (i_ep+1)%10 == 0: # log every 10 episodes
+            print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+    print('完成训练！')
+    return rewards, ma_rewards
+
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    # Evaluation should not use epsilon-greedy exploration, so both epsilon bounds on cfg are set to 0 (cfg is mutated in place)
+    cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
+    cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
+    rewards = [] # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0 # reward accumulated within the current episode
+        state = env.reset() # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state) # pick an action
+            next_state, reward, done, _ = env.step(action) # step the environment and receive the transition
+            state = next_state # move on to the next state
+            ep_reward += reward # accumulate the reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) # 0.9 weight on history, 0.1 on the new episode
+        else:
+            ma_rewards.append(ep_reward) # first episode seeds the moving average
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards,ma_rewards
+if __name__ == "__main__":
+    cfg = DQNConfig()
+    plot_cfg = PlotConfig()
+    # training
+    env, agent = env_agent_config(cfg, seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the folders for results and models
+    agent.save(path=plot_cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                path=plot_cfg.result_path)  # save the results
+    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train")  # plot the results
+    # testing: a fresh env/agent pair with a different seed, loaded from the saved model
+    env, agent = env_agent_config(cfg, seed=10)
+    agent.load(path=plot_cfg.model_path)  # load the model
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                path=plot_cfg.result_path)  # save the results
+    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test")  # plot the results
@@ -0,0 +1,150 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: JiangJi
+Email: johnjim0816@gmail.com
+Date: 2021-12-22 11:14:17
+LastEditor: JiangJi
+LastEditTime: 2021-12-22 15:27:48
+Discription: 使用 DQN-cnn  训练 PongNoFrameskip-v4
+'''
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards, plot_rewards_cn
+from common.atari_wrappers import make_atari, wrap_deepmind
+from DQN.dqn import DQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp taken at import time, used in the output paths
+algo_name = 'DQN-cnn'  # algorithm name
+env_name = 'PongNoFrameskip-v4'  # environment name
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
+class DQNConfig:
+    ''' 算法相关参数设置
+    '''
+
+    def __init__(self):
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.device = device # module-level device (GPU when available, otherwise CPU)
+        self.train_eps = 500  # number of training episodes
+        self.test_eps = 30  # number of test episodes
+        # hyperparameters
+        self.gamma = 0.95  # discount factor
+        self.epsilon_start = 0.90  # initial epsilon of the e-greedy policy
+        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_decay = 500  # decay rate of epsilon in the e-greedy policy
+        self.lr = 0.0001  # learning rate
+        self.memory_capacity = 100000  # capacity of the experience replay buffer
+        self.batch_size = 64  # batch size for mini-batch SGD
+        self.target_update = 4  # target network update interval (train() checks it once per episode)
+        self.hidden_dim = 256  # hidden layer setting of the network
+class PlotConfig:
+    ''' 绘图相关参数设置
+    '''
+
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.device = device  # module-level device (GPU when available, otherwise CPU)
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # directory where results are saved
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # directory where models are saved
+        self.save = True  # whether to save the figures
+
+
+def env_agent_config(cfg, seed=1):
+    ''' 创建环境和智能体
+    '''
+    env    = make_atari(cfg.env_name) # create the environment
+    # env    = wrap_deepmind(env)
+    # env    = wrap_pytorch(env)  # NOTE(review): wrap_pytorch is not imported in this file - confirm before enabling
+    env.seed(seed)  # set the random seed of the environment
+    state_dim = env.observation_space.shape[0]  # first entry of the observation shape; NOTE(review): presumably the frame height for raw Atari images - confirm
+    action_dim = env.action_space.n  # number of discrete actions
+    agent = DQN(state_dim, action_dim, cfg)  # create the agent
+    return env, agent
+
+def train(cfg, env, agent):
+    ''' 训练
+    '''
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0 # reward accumulated within the current episode
+        state = env.reset() # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state) # pick an action
+            next_state, reward, done, _ = env.step(action) # step the environment and receive the transition
+            agent.memory.push(state, action, reward, next_state, done) # store the transition
+            state = next_state # move on to the next state
+            agent.update() # update the agent once per environment step
+            ep_reward += reward # accumulate the reward
+            if done:
+                break
+        if (i_ep+1) % cfg.target_update == 0: # every cfg.target_update episodes, copy the policy net into the target net
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) # 0.9 weight on history, 0.1 on the new episode
+        else:
+            ma_rewards.append(ep_reward) # first episode seeds the moving average
+        if (i_ep+1)%10 == 0: # log every 10 episodes
+            print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+    print('完成训练！')
+    return rewards, ma_rewards
+
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    # Evaluation should not use epsilon-greedy exploration, so both epsilon bounds on cfg are set to 0 (cfg is mutated in place)
+    cfg.epsilon_start = 0.0 # initial epsilon of the e-greedy policy
+    cfg.epsilon_end = 0.0 # final epsilon of the e-greedy policy
+    rewards = [] # total reward of every episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0 # reward accumulated within the current episode
+        state = env.reset() # reset the environment and get the initial state
+        while True:
+            action = agent.choose_action(state) # pick an action
+            next_state, reward, done, _ = env.step(action) # step the environment and receive the transition
+            state = next_state # move on to the next state
+            ep_reward += reward # accumulate the reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) # 0.9 weight on history, 0.1 on the new episode
+        else:
+            ma_rewards.append(ep_reward) # first episode seeds the moving average
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards,ma_rewards
+if __name__ == "__main__":
+    cfg = DQNConfig()
+    plot_cfg = PlotConfig()
+    # training
+    env, agent = env_agent_config(cfg, seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the folders for results and models
+    agent.save(path=plot_cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                path=plot_cfg.result_path)  # save the results
+    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train")  # plot the results
+    # testing: a fresh env/agent pair with a different seed, loaded from the saved model
+    env, agent = env_agent_config(cfg, seed=10)
+    agent.load(path=plot_cfg.model_path)  # load the model
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                path=plot_cfg.result_path)  # save the results
+    plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test")  # plot the results
@@ -0,0 +1,175 @@
+前面项目讲的环境都是离散动作的，但实际中也有很多连续动作的环境，比如Open AI Gym中的[Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0)环境，它解决的是一个倒立摆问题，我们先对该环境做一个简要说明。
+
+## Pendulum-v0简介
+
+如果说 CartPole-v0 是一个离散动作的经典入门环境的话，那么对应 Pendulum-v0 就是连续动作的经典入门环境，如下图，我们通过施加力矩使其向上摆动并保持直立。
+
+<img src="../../easy_rl_book/res/ch12/assets/pendulum_1.png" alt="image-20210915161550713" style="zoom:50%;" />
+
+该环境的状态维度有三个，设摆针竖直方向上的顺时针旋转角为$\theta$，$\theta$设在$[-\pi,\pi]$之间，则相应的状态为$[cos\theta,sin\theta,\dot{\theta}]$，即表示角度和角速度，我们的动作则是一个-2到2之间的力矩，它是一个连续量，因而该环境不能用离散动作的算法比如 DQN 来解决。关于奖励是根据相关的物理原理而计算出的等式，如下：
+$$
+-\left(\theta^{2}+0.1 * \dot{\theta}^{2}+0.001 * \text { action }^{2}\right)
+$$
+对于每一步，其最低奖励为$-\left(\pi^{2}+0.1 * 8^{2}+0.001 *  2^{2}\right)= -16.2736044$，最高奖励为0。同 CartPole-v0 环境一样，达到最优算法的情况下，每回合的步数是无限的，因此这里设定每回合最大步数为200以便于训练。
+
+##  DDPG 基本接口
+
+我们依然使用接口的概念，通过伪代码分析并实现 DDPG 的训练模式，如下：
+
+> 初始化评论家网络$Q\left(s, a \mid \theta^{Q}\right)$和演员网络$\mu\left(s \mid \theta^{\mu}\right)$，其权重分别为$\theta^{Q}$和$\theta^{\mu}$
+>
+> 初始化目标网络$Q'$和$\mu'$，并复制权重$\theta^{Q^{\prime}} \leftarrow \theta^{Q}, \theta^{\mu^{\prime}} \leftarrow \theta^{\mu}$
+>
+> 初始化经验回放缓冲区$R$
+>
+> 执行$M$个回合循环，对于每个回合：
+>
+> * 初始化用于动作探索的随机过程，即噪声$\mathcal{N}$
+>
+> * 初始化状态$s_1$
+>
+>   循环$T$个时间步长，对于每个时步$t$
+>
+>   * 根据当前策略和噪声选择动作$a_{t}=\mu\left(s_{t} \mid \theta^{\mu}\right)+\mathcal{N}_{t}$
+>   * 执行动作$a_t$并得到反馈$r_t$和下一个状态$s_{t+1}$
+>   * 存储转移$\left(s_{t}, a_{t}, r_{t}, s_{t+1}\right)$到经验缓冲$R$中
+>   * (更新策略)从$R$随机采样一个小批量的转移
+>   * (更新策略)计算实际的Q值$y_{i}=r_{i}+\gamma Q^{\prime}\left(s_{i+1}, \mu^{\prime}\left(s_{i+1} \mid \theta^{\mu^{\prime}}\right) \mid \theta^{Q^{\prime}}\right)$
+>   * (更新策略)对损失函数$L=\frac{1}{N} \sum_{i}\left(y_{i}-Q\left(s_{i}, a_{i} \mid \theta^{Q}\right)\right)^{2}$关于参数$\theta^{Q}$做梯度下降用于更新评论家网络
+>   * (更新策略)使用采样梯度更新演员网络的策略：$\left.\left.\nabla_{\theta^{\mu}} J \approx \frac{1}{N} \sum_{i} \nabla_{a} Q\left(s, a \mid \theta^{Q}\right)\right|_{s=s_{i}, a=\mu\left(s_{i}\right)} \nabla_{\theta^{\mu}} \mu\left(s \mid \theta^{\mu}\right)\right|_{s_{i}}$
+>   * (更新策略)更新目标网络：$\theta^{Q^{\prime}} \leftarrow \tau \theta^{Q}+(1-\tau) \theta^{Q^{\prime}}$，$\theta^{\mu^{\prime}} \leftarrow \tau \theta^{\mu}+(1-\tau) \theta^{\mu^{\prime}}$
+
+代码如下：
+
+```python
+ou_noise = OUNoise(env.action_space)  # 动作噪声
+rewards = [] # 记录奖励
+ma_rewards = []  # 记录滑动平均奖励
+for i_ep in range(cfg.train_eps):
+    state = env.reset()
+    ou_noise.reset()
+    done = False
+    ep_reward = 0
+    i_step = 0
+    while not done:
+        i_step += 1
+        action = agent.choose_action(state)
+        action = ou_noise.get_action(action, i_step) 
+        next_state, reward, done, _ = env.step(action)
+        ep_reward += reward
+        agent.memory.push(state, action, reward, next_state, done)
+        agent.update()
+        state = next_state
+    if (i_ep+1)%10 == 0:
+        print('回合：{}/{}，奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+    rewards.append(ep_reward)
+    if ma_rewards:
+        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+    else:
+        ma_rewards.append(ep_reward)
+```
+
+相比于 DQN ，DDPG 主要多了两处修改，一个是给动作施加噪声，另外一个是软更新策略，即最后一步。
+
+## Ornstein-Uhlenbeck噪声
+
+ OU 噪声适用于惯性系统，尤其是时间离散化粒度较小的情况。 OU 噪声是一种随机过程，下面略去证明，直接给出公式：
+$$
+x(t+\Delta t)=x(t)-\theta(x(t)-\mu) \Delta t+\sigma W_t
+$$
+其中 $W_t$ 服从正态分布，进而代码实现如下：
+
+```python
+class OUNoise(object):
+    '''Ornstein–Uhlenbeck噪声
+    '''
+    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
+        self.mu           = mu # OU噪声的参数
+        self.theta        = theta # OU噪声的参数
+        self.sigma        = max_sigma # OU噪声的参数
+        self.max_sigma    = max_sigma
+        self.min_sigma    = min_sigma
+        self.decay_period = decay_period
+        self.action_dim   = action_space.shape[0]
+        self.low          = action_space.low
+        self.high         = action_space.high
+        self.reset()
+    def reset(self):
+        self.obs = np.ones(self.action_dim) * self.mu
+    def evolve_obs(self):
+        x  = self.obs
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        self.obs = x + dx
+        return self.obs
+    def get_action(self, action, t=0):
+        ou_obs = self.evolve_obs()
+        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) # sigma会逐渐衰减
+        return np.clip(action + ou_obs, self.low, self.high) # 动作加上噪声后进行剪切
+```
+
+## DDPG算法
+
+DDPG算法主要也包括两个功能，一个是选择动作，另外一个是更新策略，首先看选择动作：
+
+```python
+def choose_action(self, state):
+        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
+        action = self.actor(state)
+        return action.detach().cpu().numpy()[0, 0]
+```
+
+由于DDPG是直接从演员网络取得动作，所以这里不用$\epsilon-greedy$策略。在更新策略函数中，也会跟DQN稍有不同，并且加入软更新：
+
+```python
+def update(self):
+        if len(self.memory) < self.batch_size: # 当 memory 中不满足一个批量时，不更新策略
+            return
+        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
+        # 转变为张量
+        state = torch.FloatTensor(state).to(self.device)
+        next_state = torch.FloatTensor(next_state).to(self.device)
+        action = torch.FloatTensor(action).to(self.device)
+        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
+        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
+       
+        policy_loss = self.critic(state, self.actor(state))
+        policy_loss = -policy_loss.mean()
+        next_action = self.target_actor(next_state)
+        target_value = self.target_critic(next_state, next_action.detach())
+        expected_value = reward + (1.0 - done) * self.gamma * target_value
+        expected_value = torch.clamp(expected_value, -np.inf, np.inf)
+
+        value = self.critic(state, action)
+        value_loss = nn.MSELoss()(value, expected_value.detach())
+        
+        self.actor_optimizer.zero_grad()
+        policy_loss.backward()
+        self.actor_optimizer.step()
+        self.critic_optimizer.zero_grad()
+        value_loss.backward()
+        self.critic_optimizer.step()
+        # 软更新
+        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
+            target_param.data.copy_(
+                target_param.data * (1.0 - self.soft_tau) +
+                param.data * self.soft_tau
+            )
+        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
+            target_param.data.copy_(
+                target_param.data * (1.0 - self.soft_tau) +
+                param.data * self.soft_tau
+            )
+```
+
+## 结果分析
+
+实现算法之后，我们先看看训练效果：
+
+![train_rewards_curve_cn](../../easy_rl_book/res/ch12/assets/train_rewards_curve_cn-1760758.png)
+
+可以看到算法整体上是达到收敛了的，但是稳定状态下波动还比较大，依然有提升的空间，限于笔者的精力，这里只是帮助读者实现一个基础的代码演示，想要把算法调到最优，感兴趣的读者可以多思考实现。我们再来看看测试的结果：
+
+![eval_rewards_curve_cn](../../easy_rl_book/res/ch12/assets/eval_rewards_curve_cn-1760950.png)
+
+从图中看出测试的平均奖励在-150左右，但其实训练的时候平均的稳态奖励在-300左右，这是因为测试的时候我们舍去了OU噪声的缘故。
@@ -0,0 +1,208 @@
+
+
+在练习本项目之前，可以先回顾一下之前的项目实战，即使用Q学习解决悬崖寻路问题。本项目将具体实现DQN算法来解决推车杆问题，对应的模拟环境为Open AI Gym中的[CartPole-v0](https://datawhalechina.github.io/easy-rl/#/chapter7/project2?id=cartpole-v0)，我们同样先对该环境做一个简要说明。
+
+## CartPole-v0 简介
+
+CartPole-v0是一个经典的入门环境，如下图，它通过向左(动作=0)或向右(动作=1)推动推车来实现竖直杆的平衡，每次实施一个动作后如果能够继续保持平衡就会得到一个+1的奖励，否则杆将无法保持平衡而导致游戏结束。
+
+![Gym](assets/poster.jpg)
+
+我们来看看这个环境的一些参数，执行以下代码：
+
+```python
+import gym
+env = gym.make('CartPole-v0')  # 建立环境
+env.seed(1) # 随机种子
+state_dim = env.observation_space.shape[0] # 状态维度
+action_dim = env.action_space.n # 动作维度
+state = env.reset() # 初始化环境
+print(f"状态维度：{state_dim}，动作维度：{action_dim}")
+print(f"初始状态：{state}")
+```
+
+可以得到结果：
+
+```bash
+状态维度：4，动作维度：2
+初始状态：[ 0.03073904  0.00145001 -0.03088818 -0.03131252]
+```
+
+该环境状态维度是四个，分别为车的位置、车的速度、杆的角度以及杆顶部的速度，动作维度为两个，并且是离散的向左或者向右。理论上达到最优化算法的情况下，推车杆是一直能保持平衡的，也就是每回合的步数是无限，但是这不方便训练，所以环境内部设置了每回合的最大步数为200，也就是说理想情况下，只需要我们每回合的奖励达到200就算训练完成。
+
+## DQN基本接口
+
+介绍完环境之后，我们沿用接口的概念，通过分析伪代码来实现DQN的基本训练模式，以及一些要素比如建立什么网络需要什么模块等等。我们现在常用的DQN伪代码如下：
+
+> 初始化经验回放缓冲区(replay memory)$D$，容量(capacity)为$N$
+>
+> 初始化状态-动作函数，即带有初始随机权重$\theta$的$Q$网络
+>
+> 初始化目标状态-动作函数，即带有初始随机权重$\theta^-$的$\hat{Q}$网络，且$\theta^-=\theta$
+>
+> 执行$M$个回合循环，对于每个回合
+>
+> * 初始化环境，得到初始状态$s_1$
+> * 循环$T$个时间步长，对于每个时步$t$
+>   * 使用$\epsilon-greedy$策略选择动作$a_t$
+>   * 环境根据$a_t$反馈当前的奖励$r_t$和下一个状态$s_{t+1}$
+>   * 更新状态$s_t \leftarrow s_{t+1}$
+>   * 存储转移(transition)即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中
+>   * (更新策略)从$D$随机采样一个小批量的转移
+>   * (更新策略)计算实际的Q值$y_{j}=\left\{\begin{array}{cc}r_{j} & \text { 如果回合在时步 j+1终止 }\\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(\phi_{j+1}, a^{\prime} ; \theta^{-}\right) & \text {否则 }\end{array}\right.$
+>   * (更新策略)对损失函数$\left(y_{j}-Q\left(\phi_{j}, a_{j} ; \theta\right)\right)^{2}$关于参数$\theta$做梯度下降
+>   * (更新策略)每$C$步重置$\hat{Q}=Q$
+
+用代码来实现的话如下：
+
+```python
+    rewards = [] # 记录奖励
+    ma_rewards = []  # 记录滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while True:
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            state = next_state
+            agent.update()
+            if done:
+                break
+        if (i_ep+1) % cfg.target_update == 0:
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        if (i_ep+1)%10 == 0:
+            print('回合：{}/{}, 奖励：{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+        rewards.append(ep_reward)
+        # save ma_rewards
+        if ma_rewards:
+            ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+```
+
+
+
+可以看到，DQN的训练模式其实和大多强化学习算法是一样的套路，但与传统的Q学习算法相比，DQN使用神经网络来代替之前的Q表格从而存储更多的信息，且由于使用了神经网络所以我们一般需要利用随机梯度下降来优化Q值的预测。此外多了经验回放缓冲区(replay memory)，并且使用两个网络，即目标网络和当前网络。
+
+## 经验回放缓冲区
+
+从伪代码中可以看出来，经验回放缓冲区的功能有两个，一个是将每一步采集的转移(transition，包括状态，动作，奖励，下一时刻的状态)存储到缓冲区中，并且缓冲区具备一定的容量(capacity)，另一个是在更新策略的时候需要随机采样小批量的转移进行优化。因此我们可以定义一个ReplayBuffer类，包括push和sample两个函数，用于存储和采样。
+
+```python
+import random
+class ReplayBuffer:
+    def __init__(self, capacity):
+        self.capacity = capacity # 经验回放的容量
+        self.buffer = [] # 缓冲区
+        self.position = 0 
+    
+    def push(self, state, action, reward, next_state, done):
+        ''' 缓冲区是一个队列，容量超出时去掉开始存入的转移(transition)
+        '''
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity 
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
+        state, action, reward, next_state, done =  zip(*batch) # 解压成状态，动作等
+        return state, action, reward, next_state, done
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
+```
+
+## Q网络
+
+在DQN中我们使用神经网络替代原有的Q表，从而能够存储更多的Q值，实现更为高级的策略以便用于复杂的环境，这里我们用的是一个三层的感知机或者说全连接网络：
+
+```python
+class MLP(nn.Module):
+    def __init__(self, input_dim,output_dim,hidden_dim=128):
+        """ 初始化q网络，为全连接网络
+            input_dim: 输入的特征数即环境的状态维度
+            output_dim: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层
+        
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+```
+
+学过深度学习的同学应该都对这个网络十分熟悉，在强化学习中，网络的输入一般是状态，输出则是每个动作对应的Q值，假如总共有两个动作，那么这里的输出维度就是2，分别对应动作0和动作1的Q值，一般我们用ReLU作为激活函数。根据实际需要也可以改变神经网络的模型结构等等，比如若我们使用图像作为输入的话，这里可以使用卷积神经网络(CNN)。
+
+## DQN算法
+
+跟前面的项目实战一样，DQN算法一般也包括选择动作和更新策略两个函数，首先我们看选择动作：
+
+```python
+def choose_action(self, state):
+        '''选择动作
+        '''
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                state = torch.tensor([state], device=self.device, dtype=torch.float32)
+                q_values = self.policy_net(state)
+                action = q_values.max(1)[1].item() # 选择Q值最大的动作
+        else:
+            action = random.randrange(self.action_dim)
+```
+
+可以看到跟Q学习算法其实是一样的，都是用的$\epsilon-greedy$策略，只是使用神经网络的话我们需要通过Torch或者Tensorflow工具来处理相应的数据。
+
+而DQN更新策略的步骤稍微复杂一点，主要包括三个部分：随机采样，计算期望Q值和梯度下降，如下：
+
+```python
+def update(self):
+        if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时，不更新策略
+            return
+        # 从经验回放中(replay memory)中随机采样一个批量的转移(transition)
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        # 转为张量
+        state_batch = torch.tensor(
+            state_batch, device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
+            1)  
+        reward_batch = torch.tensor(
+            reward_batch, device=self.device, dtype=torch.float)  
+        next_state_batch = torch.tensor(
+            next_state_batch, device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(
+            done_batch), device=self.device)
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a)
+        next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值
+        # 计算期望的Q值，对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+        expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # 计算均方误差损失
+        # 优化更新模型
+        self.optimizer.zero_grad()  
+        loss.backward()
+        for param in self.policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step() 
+```
+
+## 结果分析
+
+完成代码之后，我们先来看看DQN算法的训练效果，曲线如下：
+
+![train_rewards_curve_cn](../../easy_rl_book/res/ch7/assets/train_rewards_curve_cn-1689150.png)
+
+从图中看出，算法其实已经在60回合左右达到收敛，最后一直维持在最佳奖励200左右，可能会有轻微的波动，这是因为我们在收敛的情况下依然保持了一定的探索率，即epsilon_end=0.01。现在我们可以载入模型看看测试的效果：
+
+![eval_rewards_curve_cn](../../easy_rl_book/res/ch7/assets/eval_rewards_curve_cn-1689282.png)
+
+我们测试了30个回合，每回合都保持在200左右，说明我们的模型学习得不错了！
@@ -0,0 +1,165 @@
+# 使用Q学习解决悬崖寻路问题
+
+强化学习在运动规划方面也有很大的应用前景，已有很多适用于强化学习的相关仿真环境，小到迷宫，大到贴近真实的自动驾驶环境[CARLA](http://carla.org/)。本次使用[OpenAI Gym](https://gym.openai.com/)开发的CliffWalking-v0环境，带大家入门Q学习算法的代码实战。
+
+## CliffWalking-v0环境简介
+
+我们首先简单介绍一下这个环境，该环境中文名叫悬崖寻路（CliffWalking），是一个迷宫类问题。如下图，在一个4 x 12的网格中，智能体以网格的左下角位置为起点，以网格的右下角位置为终点，目标是移动智能体到达终点位置，智能体每次可以在上、下、左、右这4个方向中移动一步，每移动一步会得到-1单位的奖励。
+
+<div align=center>
+<img src="assets/cliffwalking_1.png" alt="cliffwalking_1" style="zoom:50%;" />
+</div>
+起终点之间是一段悬崖，即编号为37～46的网格，智能体移动过程中会有如下的限制：
+
+* 智能体不能移出网格边界，如果智能体想执行某个动作移出网格，那么这一步智能体不会移动，但是这个操作依然会得到-1单位的奖励
+* 如果智能体“掉入悬崖” ，会立即回到起点位置，并得到-100单位的奖励
+* 当智能体移动到终点时，该回合结束，该回合总奖励为各步奖励之和
+
+我们的目标是以最少的步数到达终点，容易看出最少需要13步智能体才能从起点到终点，因此最佳算法收敛的情况下，每回合的总奖励应该是-13，这样人工分析出期望的奖励也便于我们判断算法的收敛情况作出相应调整。
+
+现在我们可以在代码中定义环境，如下：
+
+```python
+import gym # 导入gym模块
+from envs.gridworld_env import CliffWalkingWapper # 导入自定义装饰器
+
+env = gym.make('CliffWalking-v0')  # 定义环境
+env = CliffWalkingWapper(env) # 装饰环境
+```
+
+这里我们在程序中使用了一个装饰器重新定义环境，但不影响对环境的理解，感兴趣的同学具体看相关代码。由于gym环境封装得比较好，所以我们想要使用这个环境只需要使用gym.make命令输入环境名即可，然后我们可以查看环境的状态和动作维度：
+
+```python
+state_dim = env.observation_space.n # 状态维度
+action_dim = env.action_space.n # 动作维度
+print(f"状态维度：{state_dim}，动作维度：{action_dim}")
+```
+
+打印出来的结果如下：
+
+```bash
+状态维度：48，动作维度：4
+```
+
+我们的状态维度是48个，这里我们设置的是智能体当前所在网格的编号，而动作维度是4，这表示有0，1，2，3对应着上下左右四个动作。另外我们也可以初始化环境并打印当前所在的状态：
+
+```python
+state = env.reset()
+print(state)
+```
+
+结果显示为：
+
+```bash
+36
+```
+
+也就是说当前智能体的状态即当前所在的网格编号是36，正好对应我们前面讲到的起点。
+
+## 强化学习基本接口
+
+这里所说的接口就是一般强化学习的训练模式，也是大多数算法伪代码遵循的套路，步骤如下：
+
+* 初始化环境和智能体
+* 对于每个回合，智能体选择动作
+* 环境接收动作反馈下一个状态和奖励
+* 智能体进行策略更新(学习)
+* 多个回合算法收敛之后保存模型以及做后续的分析画图等
+
+代码如下：
+
+```python
+env = gym.make('CliffWalking-v0')  # 定义环境
+env = CliffWalkingWapper(env) # 装饰环境
+env.seed(1) # 设置随机种子
+state_dim = env.observation_space.n # 状态维度
+action_dim = env.action_space.n # 动作维度
+agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数
+for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
+    ep_reward = 0  # 记录每个回合的奖励
+    state = env.reset()  # 重置环境
+    while True: 
+        action = agent.choose_action(state)  # 算法选择一个动作
+        next_state, reward, done, _ = env.step(action)  # 环境根据动作反馈奖励和下一个状态
+        agent.update(state, action, reward, next_state, done)  # 算法更新
+        state = next_state  # 更新状态
+        ep_reward += reward
+        if done: # 终止状态提前停止
+            break
+```
+
+通常我们会记录并分析奖励的变化，所以在接口基础上加一些变量记录每回合的奖励，此外由于强化学习学习过程得到的奖励可能会产生振荡，因此我们也使用一个滑动平均的量来反映奖励变化的趋势，如下：
+
+```python
+rewards = []  
+ma_rewards = [] # 滑动平均奖励
+for i_ep in range(cfg.train_eps):
+    ep_reward = 0  # 记录每个回合的奖励
+    state = env.reset()  # 重置环境, 重新开一局（即开始新的一个回合）
+    while True:
+        action = agent.choose_action(state)  # 根据算法选择一个动作
+        next_state, reward, done, _ = env.step(action)  # 与环境进行一次动作交互
+        agent.update(state, action, reward, next_state, done)  # Q-learning算法更新
+        state = next_state  # 存储上一个观察值
+        ep_reward += reward
+        if done:
+            break
+    rewards.append(ep_reward)
+    if ma_rewards:
+        ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
+    else:
+        ma_rewards.append(ep_reward)
+```
+
+## Q学习算法
+
+了解了基本接口之后，现在我们看看Q学习算法具体是怎么实现的，前面讲到智能体其实在整个训练中就做两件事，一个是选择动作，一个是更新策略，所以我们可以定义一个Qlearning类，里面主要包含两个函数choose_action和update。
+
+我们先看看choose_action函数是怎么定义的，如下：
+
+```python
+def choose_action(self, state):
+      self.sample_count += 1
+      self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
+          math.exp(-1. * self.sample_count / self.epsilon_decay) # epsilon是会递减的，这里选择指数递减
+      # e-greedy 策略
+      if np.random.uniform(0, 1) > self.epsilon:
+          action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
+      else:
+          action = np.random.choice(self.action_dim) # 随机选择动作
+      return action
+```
+
+
+
+一般我们使用$\varepsilon-greedy$策略选择动作，我们的输入就是当前的状态，随机选取一个值，当这个值大于我们设置的$\varepsilon$时，我们选取Q值最大对应的动作，否则随机选择动作，这样就能在训练中让智能体保持一定的探索率，这也是平衡探索与利用的技巧之一。
+
+下面是我们要实现的策略更新函数：
+
+```python
+def update(self, state, action, reward, next_state, done):
+        Q_predict = self.Q_table[str(state)][action] 
+        if done: # 终止状态
+            Q_target = reward  
+        else:
+            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)]) 
+        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
+```
+
+这里面实现的逻辑就是伪代码中的更新公式：
+
+<img src="assets/image-20210911213241605.png" alt="image-20210911213241605" style="zoom:50%;" />
+
+注意终止状态下，我们是获取不到下一个动作的，我们直接将Q值（Q_target）更新为对应的奖励即可。
+
+## 结果分析
+
+到现在我们就基本完成了Q学习的代码实现，具体可以查看github上的源码，运行代码结果如下：
+
+![train_rewards_curve_cn](assets/train_rewards_curve_cn.png)
+
+由于这个环境比较简单，可以看到算法很快达到收敛，然后我们再测试我们训练好的模型，一般测试模型只需要20到50左右的回合数即可：
+
+![eval_rewards_curve_cn](assets/eval_rewards_curve_cn.png)
+
+这里我们测试的回合数为30，可以看到每个回合智能体都达到了最优的奖励，说明我们的算法训练的效果很不错！
@@ -0,0 +1,39 @@
+食用本篇之前，需要有DQN算法的基础，参考[DQN算法实战](../DQN)。
+
+## 原理简介
+
+Double-DQN是2016年提出的算法，灵感源自2010年的Double-Qlearning，可参考论文[Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)。
+跟Nature DQN一样，Double-DQN也用了两个网络，一个当前网络(对应用$Q$表示)，一个目标网络(对应一般用$Q'$表示，为方便区分，以下用$Q_{tar}$代替)。我们先回忆一下，对于非终止状态，目标$Q_{tar}$值计算如下
+![在这里插入图片描述](assets/20201222145725907.png)
+
+而在Double-DQN中，不再是直接从目标$Q_{tar}$网络中选择各个动作中的最大$Q_{tar}$值，而是先从当前$Q$网络选择$Q$值最大对应的动作，然后代入到目标网络中计算对应的值：
+![在这里插入图片描述](assets/20201222150225327.png)
+Double-DQN的好处是Nature DQN中使用max虽然可以快速让Q值向可能的优化目标靠拢，但是很容易过犹不及，导致过度估计(Over Estimation)，所谓过度估计就是最终我们得到的算法模型有很大的偏差(bias)。为了解决这个问题， DDQN通过解耦目标Q值动作的选择和目标Q值的计算这两步，来达到消除过度估计的问题，感兴趣可以阅读原论文。
+
+伪代码如下：
+![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png)
+当然两个网络也可以同时作为当前网络和目标网络，如下：
+![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837146.png)
+或者这样更好理解如何同时为当前网络和目标网络：
+![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837157.png)
+
+## 代码实战
+完整程序见[github](https://github.com/JohnJim0816/reinforcement-learning-tutorials/tree/master/DoubleDQN)。结合上面的原理，其实Double DQN改进起来很简单，基本只需要在```update```中修改几行代码，如下：
+```python
+'''以下是Nature DQN的q_target计算方式
+next_q_state_value = self.target_net(
+next_state_batch).max(1)[0].detach()  # # 计算所有next states的Q'(s_{t+1})的最大值，Q'为目标网络的q函数,比如tensor([ 0.0060, -0.0171,...,])
+#计算 q_target
+#对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
+'''
+'''以下是Double DQNq_target计算方式，与NatureDQN稍有不同'''
+next_target_values = self.target_net(
+next_state_batch)
+#选出Q(s_t‘, a)对应的action，代入到next_target_values获得target net对应的next_q_value，即Q’(s_t|a=argmax Q(s_t‘, a))
+next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
+q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch[0])
+```
+reward变化结果如下：
+![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210328110837128.png)
+其中下边蓝色和红色分别表示Double DQN和Nature DQN在训练中的reward变化图，而上面蓝色和绿色则表示Double DQN和Nature DQN在测试中的reward变化图。
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-12 00:50:49
+@LastEditor: John
+LastEditTime: 2021-11-19 18:07:09
+@Discription: 
+@Environment: python 3.7.7
+'''
+'''off-policy
+'''
+
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import random
+import math
+import numpy as np
+
+class ReplayBuffer:
+    def __init__(self, capacity):
+        self.capacity = capacity # 经验回放的容量
+        self.buffer = [] # 缓冲区
+        self.position = 0 
+    
+    def push(self, state, action, reward, next_state, done):
+        ''' 缓冲区是一个队列，容量超出时去掉开始存入的转移(transition)
+        '''
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity 
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
+        state, action, reward, next_state, done =  zip(*batch) # 解压成状态，动作等
+        return state, action, reward, next_state, done
+    
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
+
+class MLP(nn.Module):
+    def __init__(self, state_dim,action_dim,hidden_dim=128):
+        """ 初始化q网络，为全连接网络
+            state_dim: 输入的特征数即环境的状态维度
+            action_dim: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(state_dim, hidden_dim) # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, action_dim) # 输出层
+        
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+        
+class DoubleDQN:
+    def __init__(self, state_dim, action_dim, cfg):
+        self.action_dim = action_dim  # 总的动作个数
+        self.device = cfg.device  # 设备，cpu或gpu等
+        self.gamma = cfg.gamma
+        # e-greedy策略相关参数
+        self.actions_count = 0
+        self.epsilon_start = cfg.epsilon_start
+        self.epsilon_end = cfg.epsilon_end
+        self.epsilon_decay = cfg.epsilon_decay
+        self.batch_size = cfg.batch_size
+        self.policy_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        self.target_net = MLP(state_dim, action_dim,hidden_dim=cfg.hidden_dim).to(self.device)
+        # target_net copy from policy_net
+        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+            target_param.data.copy_(param.data)
+        # self.target_net.eval()  # 不启用 BatchNormalization 和 Dropout
+        # 可查parameters()与state_dict()的区别，前者require_grad=True
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr)
+        self.loss = 0
+        self.memory = ReplayBuffer(cfg.memory_capacity)
+        
+    def choose_action(self, state):
+        '''选择动作
+        '''
+        self.actions_count += 1
+        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
+            math.exp(-1. * self.actions_count / self.epsilon_decay)
+        if random.random() > self.epsilon:
+            with torch.no_grad():
+                # 先转为张量便于丢给神经网络,state元素数据原本为float64
+                # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价
+                state = torch.tensor(
+                    [state], device=self.device, dtype=torch.float32)
+                # 如tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
+                q_value = self.policy_net(state)
+                # tensor.max(1)返回每行的最大值以及对应的下标，
+                # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0]))
+                # 所以tensor.max(1)[1]返回最大值对应的下标，即action
+                action = q_value.max(1)[1].item()  
+        else:
+            action = random.randrange(self.action_dim)
+        return action
+    def update(self):
+
+        if len(self.memory) < self.batch_size:
+            return
+        # 从memory中随机采样transition
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+            self.batch_size)
+        # convert to tensor
+        state_batch = torch.tensor(
+            state_batch, device=self.device, dtype=torch.float)
+        action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(
+            1)  # 例如tensor([[1],...,[0]])
+        reward_batch = torch.tensor(
+            reward_batch, device=self.device, dtype=torch.float)  # tensor([1., 1.,...,1])
+        next_state_batch = torch.tensor(
+            next_state_batch, device=self.device, dtype=torch.float)
+        
+        done_batch = torch.tensor(np.float32(
+            done_batch), device=self.device)  # 将bool转为float然后转为张量
+        # 计算当前(s_t,a)对应的Q(s_t, a)
+        q_values = self.policy_net(state_batch) 
+        next_q_values = self.policy_net(next_state_batch)
+        # 代入当前选择的action，得到Q(s_t|a=a_t)
+        q_value = q_values.gather(dim=1, index=action_batch)
+        '''以下是Nature DQN的q_target计算方式
+        # 计算所有next states的Q'(s_{t+1})的最大值，Q'为目标网络的q函数
+        next_q_state_value = self.target_net(
+            next_state_batch).max(1)[0].detach()  # 比如tensor([ 0.0060, -0.0171,...,])
+        # 计算 q_target
+        # 对于终止状态，此时done_batch[0]=1, 对应的expected_q_value等于reward
+        q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0])
+        '''
+        '''以下是Double DQN q_target计算方式，与NatureDQN稍有不同'''
+        next_target_values = self.target_net(
+            next_state_batch)
+        # 选出Q(s_t‘, a)对应的action，代入到next_target_values获得target net对应的next_q_value，即Q’(s_t|a=argmax Q(s_t‘, a))
+        next_target_q_value = next_target_values.gather(1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
+        q_target = reward_batch + self.gamma * next_target_q_value * (1-done_batch)
+        self.loss = nn.MSELoss()(q_value, q_target.unsqueeze(1))  # 计算 均方误差loss
+        # 优化模型
+        self.optimizer.zero_grad()  # zero_grad清除上一步所有旧的gradients from the last step
+        # loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分
+        self.loss.backward()
+        for param in self.policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step()  # 更新模型
+
+    def save(self,path):
+        torch.save(self.target_net.state_dict(), path+'checkpoint.pth')
+
+    def load(self,path):
+        self.target_net.load_state_dict(torch.load(path+'checkpoint.pth'))  
+        for target_param, param in zip(self.target_net.parameters(), self.policy_net.parameters()):
+            param.data.copy_(target_param.data)  
@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: JiangJi
+Email: johnjim0816@gmail.com
+Date: 2021-11-07 18:10:37
+LastEditor: JiangJi
+LastEditTime: 2021-12-29 15:02:30
+Discription: 
+'''
+
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path) # 父路径
+sys.path.append(parent_path) # 添加路径到系统路径
+
+import gym
+import torch
+import datetime
+
+from common.utils import save_results, make_dir
+from common.utils import plot_rewards
+from DoubleDQN.double_dqn import DoubleDQN
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+
+class Config:
+    def __init__(self):
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DoubleDQN' # 算法名称
+        self.env_name = 'CartPole-v0'  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.train_eps = 200  # 训练的回合数
+        self.test_eps = 30  # 测试的回合数
+        ################################################################################
+
+        ################################## 算法超参数 ###################################
+        self.gamma = 0.95  # 强化学习中的折扣因子
+        self.epsilon_start = 0.95  # e-greedy策略中初始epsilon
+        self.epsilon_end = 0.01  # e-greedy策略中的终止epsilon
+        self.epsilon_decay = 500  # e-greedy策略中epsilon的衰减率
+        self.lr = 0.0001  # 学习率
+        self.memory_capacity = 100000  # 经验回放的容量
+        self.batch_size = 64  # mini-batch SGD中的批量大小
+        self.target_update = 2  # 目标网络的更新频率
+        self.hidden_dim = 256  # 网络隐藏层
+        ################################################################################
+
+        ################################# 保存结果相关参数 ##############################
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
+        self.save = True # 是否保存图片
+        ################################################################################
+        
+        
+def env_agent_config(cfg,seed=1):
+    env = gym.make(cfg.env_name)  
+    env.seed(seed)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = DoubleDQN(state_dim,action_dim,cfg)
+    return env,agent
+
+def train(cfg,env,agent):
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0 # 记录一回合内的奖励
+        state = env.reset() # 重置环境，返回初始状态
+        while True:
+            action = agent.choose_action(state) 
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done) 
+            state = next_state 
+            agent.update() 
+            if done:
+                break
+        if i_ep % cfg.target_update == 0:
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        if (i_ep+1)%10 == 0: 
+            print(f'回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward}')
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)   
+    print('完成训练！')
+    env.close()
+    return rewards,ma_rewards
+
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    ############# 由于测试不需要使用epsilon-greedy策略，所以相应的值设置为0 ###############
+    cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
+    cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
+    ################################################################################
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    
+    for i_ep in range(cfg.test_eps):
+        state = env.reset() 
+        ep_reward = 0   
+        while True:
+            action = agent.choose_action(state) 
+            next_state, reward, done, _ = env.step(action)  
+            state = next_state  
+            ep_reward += reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合：{i_ep+1}/{cfg.test_eps}，奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    env.close()
+    return rewards,ma_rewards  
+
+if __name__ == "__main__":
+    # Script entry point: train an agent, save it and its reward curves,
+    # then reload the saved weights into a fresh agent and evaluate it.
+    cfg = Config()
+    # --- training ---
+    env, agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the result and model output folders
+    agent.save(path=cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                 path=cfg.result_path)  # save the reward lists
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the training curve
+    # --- testing ---
+    env, agent = env_agent_config(cfg)  # fresh environment and agent
+    agent.load(path=cfg.model_path)  # load the model saved above
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # save the reward lists
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the test curve
@@ -0,0 +1,167 @@
+import math
+import random
+
+import gym
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.distributions import Normal
+import matplotlib.pyplot as plt
+import seaborn as sns
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path) # 父路径
+sys.path.append(parent_path) # 添加父路径到系统路径sys.path
+
+use_cuda = torch.cuda.is_available()
+device   = torch.device("cuda" if use_cuda else "cpu")
+
+from common.multiprocessing_env import SubprocVecEnv
+
+num_envs = 16
+env_name = "Pendulum-v0"
+
+def make_env():
+    def _thunk():
+        env = gym.make(env_name)
+        return env
+
+    return _thunk
+
+envs = [make_env() for i in range(num_envs)]
+envs = SubprocVecEnv(envs)
+
+env = gym.make(env_name)
+
+def init_weights(m):
+    if isinstance(m, nn.Linear):
+        nn.init.normal_(m.weight, mean=0., std=0.1)
+        nn.init.constant_(m.bias, 0.1)
+
+class ActorCritic(nn.Module):
+    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
+        super(ActorCritic, self).__init__()
+        
+        self.critic = nn.Sequential(
+            nn.Linear(num_inputs, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, 1)
+        )
+        
+        self.actor = nn.Sequential(
+            nn.Linear(num_inputs, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, num_outputs),
+        )
+        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
+        
+        self.apply(init_weights)
+        
+    def forward(self, x):
+        value = self.critic(x)
+        mu    = self.actor(x)
+        std   = self.log_std.exp().expand_as(mu)
+        dist  = Normal(mu, std)
+        return dist, value
+
+
+def plot(frame_idx, rewards):
+    plt.figure(figsize=(20,5))
+    plt.subplot(131)
+    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
+    plt.plot(rewards)
+    plt.show()
+    
+def test_env(vis=False):
+    state = env.reset()
+    if vis: env.render()
+    done = False
+    total_reward = 0
+    while not done:
+        state = torch.FloatTensor(state).unsqueeze(0).to(device)
+        dist, _ = model(state)
+        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
+        state = next_state
+        if vis: env.render()
+        total_reward += reward
+    return total_reward
+
+def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
+    values = values + [next_value]
+    gae = 0
+    returns = []
+    for step in reversed(range(len(rewards))):
+        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
+        gae = delta + gamma * tau * masks[step] * gae
+        returns.insert(0, gae + values[step])
+    return returns
+
+# Network sizes are read from the vectorised environment's spaces.
+num_inputs  = envs.observation_space.shape[0]
+num_outputs = envs.action_space.shape[0]
+
+# Hyper-parameters
+hidden_size = 256
+lr          = 3e-2
+num_steps   = 20  # rollout length (vectorised steps) between two gradient updates
+
+model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
+# NOTE(review): `lr` defined above is not passed here, so Adam runs with its
+# library default learning rate - confirm which value is intended.
+optimizer = optim.Adam(model.parameters())
+
+max_frames   = 100000  # training stops once frame_idx reaches this value
+frame_idx    = 0  # counts vectorised steps (one per envs.step call)
+test_rewards = []  # mean evaluation return, appended every 1000 frames
+
+state = envs.reset()
+
+# Main loop: collect a num_steps rollout, then take one gradient step.
+while frame_idx < max_frames:
+
+    # Per-rollout storage, one entry per step
+    log_probs = []
+    values    = []
+    rewards   = []
+    masks     = []
+    entropy = 0  # running sum of the per-step mean policy entropy
+
+    for _ in range(num_steps):
+        state = torch.FloatTensor(state).to(device)
+        dist, value = model(state)
+
+        action = dist.sample()
+        next_state, reward, done, _ = envs.step(action.cpu().numpy())
+
+        log_prob = dist.log_prob(action)
+        entropy += dist.entropy().mean()
+        
+        log_probs.append(log_prob)
+        values.append(value)
+        # unsqueeze(1) adds a trailing axis to the per-env rewards and masks
+        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
+        # mask is 0 for environments whose episode just ended, 1 otherwise
+        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
+        
+        state = next_state
+        frame_idx += 1
+        
+        # Every 1000 frames, evaluate on the separate single env (10 episodes)
+        if frame_idx % 1000 == 0:
+            test_rewards.append(np.mean([test_env() for _ in range(10)]))
+            print(test_rewards[-1])
+            # plot(frame_idx, test_rewards)
+            
+    # Bootstrap from the value of the state reached after the rollout
+    next_state = torch.FloatTensor(next_state).to(device)
+    _, next_value = model(next_state)
+    returns = compute_gae(next_value, rewards, masks, values)
+    
+    # Concatenate the per-step tensors into single batch tensors
+    log_probs = torch.cat(log_probs)
+    returns   = torch.cat(returns).detach()  # returns are targets: no gradient through them
+    values    = torch.cat(values)
+
+    advantage = returns - values
+
+    # The actor uses the advantage as a constant weight; the critic regresses onto the returns
+    actor_loss  = -(log_probs * advantage.detach()).mean()
+    critic_loss = advantage.pow(2).mean()
+
+    # Subtracting the entropy term rewards more exploratory policies
+    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
+
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
@@ -0,0 +1,13 @@
+# Hierarchical DQN
+
+## 原理简介
+
+Hierarchical DQN是一种分层强化学习方法，与DQN相比增加了一个meta controller，如下图所示：
+
+![image-20210331153115575](assets/image-20210331153115575.png)
+
+即学习时，meta controller每次会生成一个goal，然后controller或者说下面的actor就会达到这个goal，直到done为止。这就相当于给agent增加了一个队长，队长擅长制定局部目标，指导agent前行，这样应对一些每回合步数较长或者稀疏奖励的问题会有所帮助。
+
+## 伪代码
+
+![image-20210331153542314](assets/image-20210331153542314.png)
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2021-03-24 22:18:18
+LastEditor: John
+LastEditTime: 2021-05-04 22:39:34
+Discription: 
+Environment: 
+'''
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+import numpy as np
+import random,math
+
+class ReplayBuffer:
+    def __init__(self, capacity):
+        self.capacity = capacity # 经验回放的容量
+        self.buffer = [] # 缓冲区
+        self.position = 0 
+    
+    def push(self, state, action, reward, next_state, done):
+        ''' 缓冲区是一个队列，容量超出时去掉开始存入的转移(transition)
+        '''
+        if len(self.buffer) < self.capacity:
+            self.buffer.append(None)
+        self.buffer[self.position] = (state, action, reward, next_state, done)
+        self.position = (self.position + 1) % self.capacity 
+    
+    def sample(self, batch_size):
+        batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
+        state, action, reward, next_state, done =  zip(*batch) # 解压成状态，动作等
+        return state, action, reward, next_state, done
+    
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
+class MLP(nn.Module):
+    def __init__(self, input_dim,output_dim,hidden_dim=128):
+        """ 初始化q网络，为全连接网络
+            input_dim: 输入的特征数即环境的状态维度
+            output_dim: 输出的动作维度
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层
+        self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层
+        
+    def forward(self, x):
+        # 各层对应的激活函数
+        x = F.relu(self.fc1(x)) 
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+        
+class HierarchicalDQN:
+    def __init__(self,state_dim,action_dim,cfg):
+        self.state_dim = state_dim
+        self.action_dim = action_dim
+        self.gamma = cfg.gamma
+        self.device = cfg.device
+        self.batch_size = cfg.batch_size
+        self.frame_idx = 0  # 用于epsilon的衰减计数
+        self.epsilon = lambda frame_idx: cfg.epsilon_end + (cfg.epsilon_start - cfg.epsilon_end ) * math.exp(-1. * frame_idx / cfg.epsilon_decay)
+        self.policy_net = MLP(2*state_dim, action_dim,cfg.hidden_dim).to(self.device)
+        self.meta_policy_net = MLP(state_dim, state_dim,cfg.hidden_dim).to(self.device)
+        self.optimizer = optim.Adam(self.policy_net.parameters(),lr=cfg.lr)
+        self.meta_optimizer = optim.Adam(self.meta_policy_net.parameters(),lr=cfg.lr)
+        self.memory = ReplayBuffer(cfg.memory_capacity)
+        self.meta_memory = ReplayBuffer(cfg.memory_capacity)
+        self.loss_numpy  = 0
+        self.meta_loss_numpy  = 0
+        self.losses = []
+        self.meta_losses = []
+    def to_onehot(self,x):
+        oh = np.zeros(self.state_dim)
+        oh[x - 1] = 1.
+        return oh
+    def set_goal(self,state):
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
+                goal = self.meta_policy_net(state).max(1)[1].item() 
+        else:
+            goal = random.randrange(self.state_dim)
+        return goal
+    def choose_action(self,state):
+        self.frame_idx += 1
+        if random.random() > self.epsilon(self.frame_idx):
+            with torch.no_grad():
+                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(0)
+                q_value = self.policy_net(state)
+                action = q_value.max(1)[1].item()  
+        else:
+            action = random.randrange(self.action_dim)
+        return action
+    def update(self):
+        self.update_policy()
+        self.update_meta()
+    def update_policy(self): 
+        if self.batch_size > len(self.memory):
+            return
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
+        state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)  
+        reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)  
+        next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch),device=self.device)
+        q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
+        next_state_values = self.policy_net(next_state_batch).max(1)[0].detach()
+        expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
+        loss = nn.MSELoss()(q_values, expected_q_values) 
+        self.optimizer.zero_grad() 
+        loss.backward()
+        for param in self.policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.optimizer.step()  
+        self.loss_numpy = loss.detach().cpu().numpy()
+        self.losses.append(self.loss_numpy)  
+    def update_meta(self):
+        if self.batch_size > len(self.meta_memory):
+            return
+        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.meta_memory.sample(self.batch_size)
+        state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float)
+        action_batch = torch.tensor(action_batch,device=self.device,dtype=torch.int64).unsqueeze(1)  
+        reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float)  
+        next_state_batch = torch.tensor(next_state_batch,device=self.device, dtype=torch.float)
+        done_batch = torch.tensor(np.float32(done_batch),device=self.device)
+        q_values = self.meta_policy_net(state_batch).gather(dim=1, index=action_batch).squeeze(1)
+        next_state_values = self.meta_policy_net(next_state_batch).max(1)[0].detach()
+        expected_q_values = reward_batch + 0.99 * next_state_values * (1-done_batch)
+        meta_loss = nn.MSELoss()(q_values, expected_q_values) 
+        self.meta_optimizer.zero_grad() 
+        meta_loss.backward()
+        for param in self.meta_policy_net.parameters():  # clip防止梯度爆炸
+            param.grad.data.clamp_(-1, 1)
+        self.meta_optimizer.step() 
+        self.meta_loss_numpy = meta_loss.detach().cpu().numpy()
+        self.meta_losses.append(self.meta_loss_numpy)
+
+    def save(self, path):
+        torch.save(self.policy_net.state_dict(), path+'policy_checkpoint.pth')
+        torch.save(self.meta_policy_net.state_dict(), path+'meta_checkpoint.pth')
+
+    def load(self, path):
+        self.policy_net.load_state_dict(torch.load(path+'policy_checkpoint.pth'))
+        self.meta_policy_net.load_state_dict(torch.load(path+'meta_checkpoint.pth'))
+        
+
+        
+        
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2021-03-29 10:37:32
+LastEditor: John
+LastEditTime: 2021-05-04 22:35:56
+Discription: 
+Environment: 
+'''
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import datetime
+import numpy as np
+import torch
+import gym
+
+from common.utils import save_results,make_dir
+from common.utils import plot_rewards
+from HierarchicalDQN.agent import HierarchicalDQN
+from HierarchicalDQN.train import train,test
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
+algo_name = "Hierarchical DQN"  # 算法名称
+env_name = 'CartPole-v0'  # 环境名称
+class HierarchicalDQNConfig:
+    def __init__(self):
+        self.algo_name = algo_name  # 算法名称
+        self.env_name = env_name  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.train_eps = 300  # 训练的episode数目
+        self.test_eps = 50  # 测试的episode数目
+        self.gamma = 0.99
+        self.epsilon_start = 1  # start epsilon of e-greedy policy
+        self.epsilon_end = 0.01
+        self.epsilon_decay = 200
+        self.lr = 0.0001  # learning rate
+        self.memory_capacity = 10000  # Replay Memory capacity
+        self.batch_size = 32
+        self.target_update = 2  # 目标网络的更新频率
+        self.hidden_dim = 256  # 网络隐藏层
+class PlotConfig:
+    ''' 绘图相关参数设置
+    '''
+
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # 算法名称
+        self.env_name = env_name  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.result_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/results/'  # 保存结果的路径
+        self.model_path = curr_path + "/outputs/" + self.env_name + \
+            '/' + curr_time + '/models/'  # 保存模型的路径
+        self.save = True  # 是否保存图片
+
+def env_agent_config(cfg,seed=1):
+    env = gym.make(cfg.env_name)  
+    env.seed(seed)
+    state_dim = env.observation_space.shape[0]
+    action_dim = env.action_space.n
+    agent = HierarchicalDQN(state_dim,action_dim,cfg)
+    return env,agent
+
+if __name__ == "__main__":
+    # Script entry point: train an agent, save it and its reward curves,
+    # then reload the saved weights into a fresh agent and evaluate it.
+    cfg = HierarchicalDQNConfig()
+    plot_cfg = PlotConfig()
+    # --- training ---
+    env, agent = env_agent_config(cfg, seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the result and model output folders
+    agent.save(path=plot_cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                path=plot_cfg.result_path)  # save the reward lists
+    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot the training curve
+    # --- testing (fresh environment with a different seed, fresh agent) ---
+    env, agent = env_agent_config(cfg, seed=10)
+    agent.load(path=plot_cfg.model_path)  # load the model saved above
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save the reward lists
+    plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot the test curve
+
@@ -0,0 +1,77 @@
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path)  # 父路径
+sys.path.append(parent_path)  # 添加路径到系统路径
+
+import numpy as np
+
+def train(cfg, env, agent):
+    print('开始训练!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            goal = agent.set_goal(state)
+            onehot_goal = agent.to_onehot(goal)
+            meta_state = state
+            extrinsic_reward = 0
+            while not done and goal != np.argmax(state):
+                goal_state = np.concatenate([state, onehot_goal])
+                action = agent.choose_action(goal_state)
+                next_state, reward, done, _ = env.step(action)
+                ep_reward += reward
+                extrinsic_reward += reward
+                intrinsic_reward = 1.0 if goal == np.argmax(
+                    next_state) else 0.0
+                agent.memory.push(goal_state, action, intrinsic_reward, np.concatenate(
+                    [next_state, onehot_goal]), done)
+                state = next_state
+                agent.update()
+        if (i_ep+1)%10 == 0: 
+            print(f'回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward}，Loss:{agent.loss_numpy:.2f}， Meta_Loss:{agent.meta_loss_numpy:.2f}')
+        agent.meta_memory.push(meta_state, goal, extrinsic_reward, state, done)
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练！')
+    return rewards, ma_rewards
+
+def test(cfg, env, agent):
+    print('开始测试!')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    rewards = [] # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        state = env.reset()
+        done = False
+        ep_reward = 0
+        while not done:
+            goal = agent.set_goal(state)
+            onehot_goal = agent.to_onehot(goal)
+            extrinsic_reward = 0
+            while not done and goal != np.argmax(state):
+                goal_state = np.concatenate([state, onehot_goal])
+                action = agent.choose_action(goal_state)
+                next_state, reward, done, _ = env.step(action)
+                ep_reward += reward
+                extrinsic_reward += reward
+                state = next_state
+                agent.update()
+        if (i_ep+1)%10 == 0: 
+            print(f'回合：{i_ep+1}/{cfg.train_eps}，奖励：{ep_reward}，Loss:{agent.loss_numpy:.2f}， Meta_Loss:{agent.meta_loss_numpy:.2f}')
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练！')
+    return rewards, ma_rewards
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 John Jim
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,7 @@
+## 记录笔者更新的日志
+
+**2021.12.28-1**：将```task.py```中的两个Config类合并为一个，并加以注释便于阅读，从DQN算法开始更新
+
+**2021.12.22-3**：将```agent.py```更改为对应的算法名称，便于区分如```dqn```与```dqn_cnn```的情况  
+**2021.12.22-2**：简化了代码结构，将原来的```train.py```和```task.py```等合并到```task.py```中  
+**2021.12.22-1**：简化了代码结构，将原来的```model.py```和```memory.py```等合并到```agent.py```中，```plot.py```的内容合并到```common.utils.py```中
@@ -0,0 +1,5 @@
+# *On-Policy First-Visit MC Control*
+
+### 伪代码
+
+![mc_control_algo](assets/mc_control_algo.png)
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2021-03-12 16:14:34
+LastEditor: John
+LastEditTime: 2021-05-05 16:58:39
+Description: 
+Environment: 
+'''
+import numpy as np
+from collections import defaultdict
+import torch
+import dill
+
+class FisrtVisitMC:
+    ''' On-Policy First-Visit MC Control
+    '''
+    def __init__(self,action_dim,cfg):
+        self.action_dim = action_dim
+        self.epsilon = cfg.epsilon
+        self.gamma = cfg.gamma 
+        self.Q_table = defaultdict(lambda: np.zeros(action_dim))
+        self.returns_sum = defaultdict(float) # sum of returns
+        self.returns_count = defaultdict(float)
+        
+    def choose_action(self,state):
+        ''' e-greed policy '''
+        if state in self.Q_table.keys():
+            best_action = np.argmax(self.Q_table[state])
+            action_probs = np.ones(self.action_dim, dtype=float) * self.epsilon / self.action_dim
+            action_probs[best_action] += (1.0 - self.epsilon)
+            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
+        else:
+            action = np.random.randint(0,self.action_dim)
+        return action
+    def update(self,one_ep_transition):
+        # Find all (state, action) pairs we've visited in this one_ep_transition
+        # We convert each state to a tuple so that we can use it as a dict key
+        sa_in_episode = set([(tuple(x[0]), x[1]) for x in one_ep_transition])
+        for state, action in sa_in_episode:
+            sa_pair = (state, action)
+            # Find the first occurence of the (state, action) pair in the one_ep_transition
+            first_occurence_idx = next(i for i,x in enumerate(one_ep_transition)
+                                       if x[0] == state and x[1] == action)
+            # Sum up all rewards since the first occurance
+            G = sum([x[2]*(self.gamma**i) for i,x in enumerate(one_ep_transition[first_occurence_idx:])])
+            # Calculate average return for this state over all sampled episodes
+            self.returns_sum[sa_pair] += G
+            self.returns_count[sa_pair] += 1.0
+            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
+    def save(self,path):
+        '''把 Q表格 的数据保存到文件中
+        '''
+        torch.save(
+            obj=self.Q_table,
+            f=path+"Q_table",
+            pickle_module=dill
+        )
+
+    def load(self, path):
+        '''从文件中读取数据到 Q表格
+        '''
+        self.Q_table =torch.load(f=path+"Q_table",pickle_module=dill)
--- a/Show More
+++ b/Show More