add some codes

qiwang067
2020-07-20 23:56:20 +08:00
parent aae36f5bb8
commit f4ac39625a
41 changed files with 1799 additions and 7 deletions


@@ -31,6 +31,7 @@
## Main Contributors
- [@qiwang067](https://github.com/qiwang067)
- [@JohnJim0816](https://github.com/JohnJim0816)
## Follow Us

codes/Q-learning/agent.py (new file, 75 lines)

@@ -0,0 +1,75 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
class QLearningAgent(object):
def __init__(self,
obs_n,
act_n,
learning_rate=0.01,
gamma=0.9,
e_greed=0.1):
self.act_n = act_n # number of available actions
self.lr = learning_rate # learning rate
self.gamma = gamma # discount factor for the reward
self.epsilon = e_greed # probability of taking a random action
self.Q = np.zeros((obs_n, act_n))
# Sample an action for the given observation, with exploration
def sample(self, obs):
if np.random.uniform(0, 1) < (1.0 - self.epsilon): # choose the action according to the Q-table
action = self.predict(obs)
else:
action = np.random.choice(self.act_n) # with some probability, explore by picking a random action
return action
# Predict the action for the given observation (greedy, no exploration)
def predict(self, obs):
Q_list = self.Q[obs, :]
maxQ = np.max(Q_list)
action_list = np.where(Q_list == maxQ)[0] # maxQ may correspond to several actions
action = np.random.choice(action_list)
return action
# Learning method, i.e. how the Q-table is updated
def learn(self, obs, action, reward, next_obs, done):
""" off-policy
obs: observation before the interaction, s_t
action: action chosen in this interaction, a_t
reward: reward r obtained for this action
next_obs: observation after the interaction, s_t+1
done: whether the episode has ended
"""
predict_Q = self.Q[obs, action]
if done:
target_Q = reward # there is no next state
else:
target_Q = reward + self.gamma * np.max(
self.Q[next_obs, :]) # Q-learning
self.Q[obs, action] += self.lr * (target_Q - predict_Q) # correct Q
# Save the Q-table to a file
def save(self):
npy_file = './q_table.npy'
np.save(npy_file, self.Q)
print(npy_file + ' saved.')
# Load the Q-table from a file
def restore(self, npy_file='./q_table.npy'):
self.Q = np.load(npy_file)
print(npy_file + ' loaded.')


@@ -0,0 +1,195 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
env = FrozenLakeWapper(env)
return env
class FrozenLakeWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.max_y = env.desc.shape[0]
self.max_x = env.desc.shape[1]
self.t = None
self.unit = 50
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for _ in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for i in range(self.desc.shape[0]):
for j in range(self.desc.shape[1]):
x = j
y = self.max_y - 1 - i
if self.desc[i][j] == b'S': # Start
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'F': # Frozen ice
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'G': # Goal
self.draw_box(x, y, 'yellow')
elif self.desc[i][j] == b'H': # Hole
self.draw_box(x, y, 'black')
else:
self.draw_box(x, y, 'white')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
# Environment 1: FrozenLake, where you can configure whether the ice is slippery
# 0 left, 1 down, 2 right, 3 up
env = gym.make("FrozenLake-v0", is_slippery=False)
env = FrozenLakeWapper(env)
# Environment 2: CliffWalking, the cliff environment
# env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
# env = CliffWalkingWapper(env)
# Environment 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
# gridmap = [
# 'SFFF',
# 'FHFF',
# 'FFFF',
# 'HFGF' ]
# env = GridWorld(gridmap)
env.reset()
for step in range(10):
action = np.random.randint(0, 4)
obs, reward, done, info = env.step(action)
print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
step, action, obs, reward, done, info))
# env.render() # render one frame

codes/Q-learning/train.py (new file, 90 lines)

@@ -0,0 +1,90 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import QLearningAgent
import time
def run_episode(env, agent, render=False):
total_steps = 0 # count how many steps this episode takes
total_reward = 0
obs = env.reset() # reset the environment, i.e. start a new episode
while True:
action = agent.sample(obs) # choose an action according to the algorithm
next_obs, reward, done, _ = env.step(action) # one interaction with the environment
# train the Q-learning algorithm
agent.learn(obs, action, reward, next_obs, done) # the next action is not needed
obs = next_obs # store the new observation
total_reward += reward
total_steps += 1 # count the steps
if render:
env.render() # render a new frame
if done:
break
return total_reward, total_steps
def test_episode(env, agent):
total_reward = 0
obs = env.reset()
while True:
action = agent.predict(obs) # greedy
next_obs, reward, done, _ = env.step(action)
total_reward += reward
obs = next_obs
time.sleep(0.5)
env.render()
if done:
print('test reward = %.1f' % (total_reward))
break
def main():
# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)
agent = QLearningAgent(
obs_n=env.observation_space.n,
act_n=env.action_space.n,
learning_rate=0.1,
gamma=0.9,
e_greed=0.1)
is_render = False
for episode in range(500):
ep_reward, ep_steps = run_episode(env, agent, is_render)
print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
ep_reward))
# render every 20 episodes to check the result
if episode % 20 == 0:
is_render = True
else:
is_render = False
# training finished; evaluate the learned policy
test_episode(env, agent)
if __name__ == "__main__":
main()

codes/Sarsa/agent.py (new file, 74 lines)

@@ -0,0 +1,74 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
# Choose actions according to the Q-table
class SarsaAgent(object):
def __init__(self,
obs_n,
act_n,
learning_rate=0.01,
gamma=0.9,
e_greed=0.1):
self.act_n = act_n # number of available actions
self.lr = learning_rate # learning rate
self.gamma = gamma # discount factor for the reward
self.epsilon = e_greed # probability of taking a random action
self.Q = np.zeros((obs_n, act_n)) # initialize the Q-table
# Sample an action for the given observation, with exploration (epsilon-greedy; used during training)
def sample(self, obs):
if np.random.uniform(0, 1) < (1.0 - self.epsilon): # choose the action according to the Q-table
action = self.predict(obs)
else:
action = np.random.choice(self.act_n) # with some probability, explore by picking a random action
return action
# Predict the action for the given observation (pick the largest existing Q value; greedy, exploitation only, no exploration)
def predict(self, obs):
Q_list = self.Q[obs, :]
maxQ = np.max(Q_list) # find the maximum Q value
action_list = np.where(Q_list == maxQ)[0] # maxQ may correspond to several actions
action = np.random.choice(action_list) # randomly pick one of these actions
return action
# Learning method, i.e. how the Q-table is updated
def learn(self, obs, action, reward, next_obs, next_action, done):
""" on-policy
obs: observation before the interaction, s_t
action: action chosen in this interaction, a_t
reward: reward r obtained for this action
next_obs: observation after the interaction, s_t+1
next_action: action that will be chosen for next_obs according to the current Q-table, a_t+1
done: whether the episode has ended
"""
predict_Q = self.Q[obs, action]
if done: # done == True means this is the last state of the episode
target_Q = reward # there is no next state
else:
target_Q = reward + self.gamma * self.Q[next_obs,
next_action] # Sarsa
self.Q[obs, action] += self.lr * (target_Q - predict_Q) # correct Q
def save(self):
npy_file = './q_table.npy'
np.save(npy_file, self.Q)
print(npy_file + ' saved.')
def restore(self, npy_file='./q_table.npy'):
self.Q = np.load(npy_file)
print(npy_file + ' loaded.')

codes/Sarsa/gridworld.py (new file, 195 lines)

@@ -0,0 +1,195 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
env = FrozenLakeWapper(env)
return env
class FrozenLakeWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.max_y = env.desc.shape[0]
self.max_x = env.desc.shape[1]
self.t = None
self.unit = 50
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for _ in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for i in range(self.desc.shape[0]):
for j in range(self.desc.shape[1]):
x = j
y = self.max_y - 1 - i
if self.desc[i][j] == b'S': # Start
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'F': # Frozen ice
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'G': # Goal
self.draw_box(x, y, 'yellow')
elif self.desc[i][j] == b'H': # Hole
self.draw_box(x, y, 'black')
else:
self.draw_box(x, y, 'white')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
# Environment 1: FrozenLake, where you can configure whether the ice is slippery
# 0 left, 1 down, 2 right, 3 up
env = gym.make("FrozenLake-v0", is_slippery=False)
env = FrozenLakeWapper(env)
# Environment 2: CliffWalking, the cliff environment
# env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
# env = CliffWalkingWapper(env)
# Environment 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
# gridmap = [
# 'SFFF',
# 'FHFF',
# 'FFFF',
# 'HFGF' ]
# env = GridWorld(gridmap)
env.reset()
for step in range(10):
action = np.random.randint(0, 4)
obs, reward, done, info = env.step(action)
print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
step, action, obs, reward, done, info))
# env.render() # render one frame

codes/Sarsa/train.py (new file, 92 lines)

@@ -0,0 +1,92 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import SarsaAgent
import time
def run_episode(env, agent, render=False):
total_steps = 0 # count how many steps this episode takes
total_reward = 0
obs = env.reset() # reset the environment, i.e. start a new episode
action = agent.sample(obs) # choose an action according to the algorithm
while True:
next_obs, reward, done, _ = env.step(action) # one interaction with the environment
next_action = agent.sample(next_obs) # choose the next action according to the algorithm
# train the Sarsa algorithm
agent.learn(obs, action, reward, next_obs, next_action, done)
action = next_action
obs = next_obs # store the new observation
total_reward += reward
total_steps += 1 # count the steps
if render:
env.render() # render a new frame
if done:
break
return total_reward, total_steps
def test_episode(env, agent):
total_reward = 0
obs = env.reset()
while True:
action = agent.predict(obs) # greedy: always take the best action
next_obs, reward, done, _ = env.step(action)
total_reward += reward
obs = next_obs
time.sleep(0.5) # pause 0.5 s per step so the behavior is visible
env.render()
if done:
print('test reward = %.1f' % (total_reward))
break
def main():
# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env) # optional; only makes the rendering nicer
agent = SarsaAgent(
obs_n=env.observation_space.n,
act_n=env.action_space.n,
learning_rate=0.1,
gamma=0.9,
e_greed=0.1)
is_render = False
for episode in range(500):
ep_reward, ep_steps = run_episode(env, agent, is_render)
print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
ep_reward))
# render every 20 episodes to check the result (rendering every episode would take too long)
if episode % 20 == 0:
is_render = True
else:
is_render = False
# training finished; evaluate the learned policy
test_episode(env, agent)
if __name__ == "__main__":
main()

codes/ddpg/ddpg.py (new file, 87 lines)

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-09 20:25:52
@LastEditor: John
@LastEditTime: 2020-06-14 11:43:17
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from model import Actor, Critic
from memory import ReplayBuffer
class DDPG:
def __init__(self, n_states, n_actions, hidden_dim=30, device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128):
self.device = device
self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(param.data)
for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
target_param.data.copy_(param.data)
self.critic_optimizer = optim.Adam(
self.critic.parameters(), lr=critic_lr)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
self.critic_criterion = nn.MSELoss()
self.memory = ReplayBuffer(memory_capacity)
self.batch_size = batch_size
self.soft_tau = soft_tau
self.gamma = gamma
def select_action(self, state):
return self.actor.select_action(state)
def update(self):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(
self.batch_size)
# convert all variables to tensors
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
policy_loss = self.critic(state, self.actor(state))
policy_loss = -policy_loss.mean()
next_action = self.target_actor(next_state)
target_value = self.target_critic(next_state, next_action.detach())
expected_value = reward + (1.0 - done) * self.gamma * target_value
expected_value = torch.clamp(expected_value, -np.inf, np.inf)
value = self.critic(state, action)
value_loss = self.critic_criterion(value, expected_value.detach())
self.actor_optimizer.zero_grad()
policy_loss.backward()
self.actor_optimizer.step()
self.critic_optimizer.zero_grad()
value_loss.backward()
self.critic_optimizer.step()
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - self.soft_tau) +
param.data * self.soft_tau
)
for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - self.soft_tau) +
param.data * self.soft_tau
)

codes/ddpg/env.py (new file, 31 lines)

@@ -0,0 +1,31 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:28:30
@LastEditor: John
@LastEditTime: 2020-06-12 22:49:18
@Discription:
@Environment: python 3.7.7
'''
import gym
import numpy as np
class NormalizedActions(gym.ActionWrapper):
def action(self, action):
low_bound = self.action_space.low
upper_bound = self.action_space.high
action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
action = np.clip(action, low_bound, upper_bound)
return action
def reverse_action(self, action):
low_bound = self.action_space.low
upper_bound = self.action_space.high
action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
action = np.clip(action, low_bound, upper_bound)
return action

codes/ddpg/main.py (new file, 89 lines)

@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
@LastEditTime: 2020-07-20 23:01:02
@Discription:
@Environment: python 3.7.7
'''
import torch
import gym
from ddpg import DDPG
from env import NormalizedActions
from noise import OUNoise
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.99, type=float) # discount factor gamma
parser.add_argument("--critic_lr", default=1e-3, type=float) # critic learning rate
parser.add_argument("--actor_lr", default=1e-4, type=float)
parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=200, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=4, type=int,help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
env = NormalizedActions(gym.make("Pendulum-v0"))
# add noise to the actions
ou_noise = OUNoise(env.action_space)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent=DDPG(n_states,n_actions,device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
rewards = []
moving_average_rewards = []
for i_episode in range(1,cfg.max_episodes+1):
state=env.reset()
ou_noise.reset()
ep_reward = 0
for i_step in range(1,cfg.max_steps+1):
action = agent.select_action(state)
action = ou_noise.get_action(action, i_step) # the random process from the paper
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if done:
break
print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),)
rewards.append(ep_reward)
#
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
print('Complete')
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
plot(rewards)
plot(moving_average_rewards,ylabel="moving_average_rewards")

codes/ddpg/memory.py (new file, 34 lines)

@@ -0,0 +1,34 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-13 00:29:45
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(np.stack, zip(*batch))
return state_batch, action_batch, reward_batch, next_state_batch, done_batch
def __len__(self):
return len(self.buffer)

codes/ddpg/model.py (new file, 56 lines)

@@ -0,0 +1,56 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:03:59
@LastEditor: John
@LastEditTime: 2020-06-14 11:42:45
@Discription:
@Environment: python 3.7.7
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Critic(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state, action):
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class Actor(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.tanh(self.linear3(x))
return x
def select_action(self, state):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state = torch.FloatTensor(state).unsqueeze(0).to(device)
# print(state)
action = self.forward(state)
return action.detach().cpu().numpy()[0, 0]

codes/ddpg/noise.py (new file, 39 lines)

@@ -0,0 +1,39 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:59
@LastEditor: John
@LastEditTime: 2020-06-11 20:59:20
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
class OUNoise(object):
def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
self.mu = mu
self.theta = theta
self.sigma = max_sigma
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):
ou_obs = self.evolve_obs()
self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
return np.clip(action + ou_obs, self.low, self.high)

codes/ddpg/plot.py (new file, 47 lines)

@@ -0,0 +1,47 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-12 11:34:52
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()
import numpy as np
import os
# def plot(item,ylabel='rewards'):
# plt.figure()
# plt.plot(np.arange(len(item)), item)
# plt.title(ylabel+' of DDPG')
# plt.ylabel(ylabel)
# plt.xlabel('episodes')
# plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
# plt.show()
def plot(item,ylabel='rewards'):
# plot the passed-in series
df = pd.DataFrame(dict(time=np.arange(len(item)), value=item))
g = sns.relplot(x="time", y="value", kind="line", data=df)
g.set(xlabel='episodes', ylabel=ylabel)
# time = range(len(item))
# sns.set(style="darkgrid", font_scale=1.5)
# sns.lineplot(time=time, data=item, color="r", condition="behavior_cloning")
# # sns.tsplot(time=time, data=x2, color="b", condition="dagger")
# plt.ylabel("Reward")
# plt.xlabel("Iteration Number")
# plt.title("Imitation Learning")
plt.show()
if __name__ == "__main__":
output_path = os.path.dirname(__file__)+"/result/"
rewards=np.load(output_path+"rewards.npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards')

Two binary image files added (32 KiB and 47 KiB); contents not shown.

codes/dqn/dqn.py (new file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
@LastEditTime: 2020-06-14 13:56:45
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
import numpy as np
from memory import ReplayBuffer
from model import FCN
class DQN:
def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01,batch_size=128, device="cpu"):
self.actions_count = 0
self.n_actions = n_actions
self.device = device
self.gamma = gamma
self.epsilon = 0
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.policy_net = FCN(n_states,n_actions).to(self.device)
self.target_net = FCN(n_states,n_actions).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # evaluation mode: disables BatchNormalization and Dropout
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=policy_lr)
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
def select_action(self,state):
'''Select an action (epsilon-greedy)
Args:
state [array]: current state
Returns:
[int]: chosen action
'''
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
state = torch.tensor([state],device=self.device,dtype=torch.float32) # convert to a tensor first so it can be fed to the network; the state elements are originally float64. Note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
q_value = self.policy_net(state) # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size:
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float) # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])
action_batch = torch.tensor(action_batch,device=self.device).unsqueeze(1) # e.g. tensor([[1],...,[0]])
reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float) # e.g. tensor([1., 1.,...,1])
next_state_batch = torch.tensor(next_state_batch,device=self.device,dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch),device=self.device).unsqueeze(1) # convert bool to float, then to a tensor
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
q_values = self.policy_net(state_batch).gather(1, action_batch) # calling the module is equivalent to self.forward
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = self.target_net(
next_state_batch).max(1)[0].detach() # tensor([ 0.0060, -0.0171,...,])
# Compute the expected Q values
expected_q_values = reward_batch + self.gamma * next_state_values * (1 - done_batch.squeeze(1)) # zero the bootstrap term for terminal transitions
# Compute Huber loss
# self.loss = nn.MSELoss(q_values, expected_q_values.unsqueeze(1))
self.loss = nn.MSELoss()(q_values,expected_q_values.unsqueeze(1))
# Optimize the model
self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise youd just accumulate the gradients from all loss.backward() calls).
self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters.

codes/dqn/main.py (new file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
@LastEditTime: 2020-07-20 23:02:16
@Discription:
@Environment: python 3.7.7
'''
'''Work in progress
'''
import gym
import torch
from dqn import DQN
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.99,
type=float) # discount factor gamma
parser.add_argument("--epsilon_start", default=0.95,
type=float) # epsilon for epsilon-greedy action selection
parser.add_argument("--epsilon_end", default=0.01, type=float)
parser.add_argument("--epsilon_decay", default=200, type=float)
parser.add_argument("--policy_lr", default=0.01, type=float)
parser.add_argument("--memory_capacity", default=1000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=200, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=2, type=int,
help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make('CartPole-v0').unwrapped
env.seed(1)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay,policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
rewards = []
moving_average_rewards = []
for i_episode in range(1, cfg.max_episodes+1):
# Initialize the environment and state
state = env.reset()
ep_reward = 0
for t in range(1, cfg.max_steps+1):
# Select and perform an action
action = agent.select_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
# Store the transition in memory
agent.memory.push(state,action,reward,next_state,done)
# Move to the next state
state = next_state
# Perform one step of the optimization (on the target network)
agent.update()
if done:
break
# Update the target network, copying all weights and biases in DQN
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'Explore: %.2f' % agent.epsilon)
rewards.append(ep_reward)
# compute the moving-average reward
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
print('Complete')
plot(rewards)
plot(moving_average_rewards, ylabel="moving_average_rewards")

codes/dqn/memory.py (new file, 35 lines)

@@ -0,0 +1,35 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-14 11:36:24
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = zip(*batch)
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)

codes/dqn/model.py (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:47:02
@LastEditor: John
@LastEditTime: 2020-06-14 11:23:04
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, n_states=4, n_actions=18):
"""
Initialize a deep Q-learning network for testing algorithm
n_states: number of features of input.
n_actions: number of action-value to output, one-to-one correspondence to action in game.
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(n_states, 128)
self.fc2 = nn.Linear(128, 128)
self.fc3 = nn.Linear(128, n_actions)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

codes/dqn/plot.py (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-14 11:38:42
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import numpy as np
import os
def plot(item,ylabel='rewards'):
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of DQN')
plt.ylabel(ylabel)
plt.xlabel('episodes')
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.show()
if __name__ == "__main__":
output_path = os.path.dirname(__file__)+"/result/"
rewards=np.load(output_path+"rewards.npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards')

Two binary image files added (31 KiB and 49 KiB); contents not shown.

codes/dqn_cnn/dqn.py (new file, 107 lines)

@@ -0,0 +1,107 @@
import random
import math
import torch
import torch.optim as optim
import torch.nn.functional as F
from memory import ReplayBuffer
from model import CNN
class DQN:
def __init__(self, screen_height=0, screen_width=0, n_actions=0, gamma=0.999, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, batch_size=128, device="cpu"):
self.actions_count = 0
self.n_actions = n_actions
self.device = device
self.gamma = gamma
self.epsilon = 0
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.policy_net = CNN(screen_height, screen_width,
n_actions).to(self.device)
self.target_net = CNN(screen_height, screen_width,
n_actions).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # evaluation mode: disables BatchNormalization and Dropout
self.optimizer = optim.RMSprop(self.policy_net.parameters())
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
def select_action(self, state):
'''Select an action with epsilon-greedy exploration
Args:
state [torch tensor]: current screen-difference state
Returns:
action [torch tensor]: chosen action index of shape [1, 1]
'''
sample = random.random()
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if sample > self.epsilon:
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
q_value = self.policy_net(state) # e.g. tensor([[-0.2522, 0.3887]])
action = q_value.max(1)[1].view(1, 1) # index of the largest q_value; note the action is a tensor, e.g. tensor([[1]])
return action
else:
return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long)
def update(self):
if len(self.memory) < self.batch_size:
return
transitions = self.memory.sample(self.batch_size)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = self.memory.Transition(*zip(*transitions))
# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
batch.next_state)), device=self.device, dtype=torch.bool)
non_final_next_states = torch.cat([s for s in batch.next_state
if s is not None])
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward) # tensor([1., 1.,...,])
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
state_action_values = self.policy_net(
state_batch).gather(1, action_batch) #tensor([[ 1.1217],...,[ 0.8314]])
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(self.batch_size, device=self.device)
next_state_values[non_final_mask] = self.target_net(
non_final_next_states).max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * self.gamma) + reward_batch # tensor([0.9685, 0.9683,...,])
# Compute Huber loss
self.loss = F.smooth_l1_loss(
state_action_values, expected_state_action_values.unsqueeze(1)) # .unsqueeze adds a dimension
# Optimize the model
self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise youd just accumulate the gradients from all loss.backward() calls).
self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters.
if __name__ == "__main__":
dqn = DQN()

codes/dqn_cnn/main.py (new file, 115 lines)

@@ -0,0 +1,115 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
@LastEditTime: 2020-06-13 00:24:31
@Discription:
@Environment: python 3.7.7
'''
'''
Probably has not converged, but the result is about the same as the official PyTorch tutorial
'''
import gym
import torch
from screen_state import get_screen
from dqn import DQN
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.999, type=float) # q-learning中的gamma
parser.add_argument("--epsilon_start", default=0.9, type=float) # 基于贪心选择action对应的参数epsilon
parser.add_argument("--epsilon_end", default=0.05, type=float)
parser.add_argument("--epsilon_decay", default=200, type=float)
parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=100, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=4, type=int,help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen(env,device)
env = gym.make('CartPole-v0').unwrapped
env.reset()
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
# Get number of actions from gym action space
n_actions = env.action_space.n
agent = DQN(screen_height=screen_height, screen_width=screen_width,
n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, memory_capacity=cfg.memory_capacity,batch_size=cfg.batch_size)
rewards = []
moving_average_rewards = []
for i_episode in range(1,cfg.max_episodes+1):
# Initialize the environment and state
env.reset()
last_screen = get_screen(env, device)
current_screen = get_screen(env, device)
state = current_screen - last_screen
ep_reward = 0
for t in range(1,cfg.max_steps+1):
# Select and perform an action
action = agent.select_action(state)
_, reward, done, _ = env.step(action.item())
ep_reward += reward
reward = torch.tensor([reward], device=device)
# Observe new state
last_screen = current_screen
current_screen = get_screen(env, device)
if done: break
next_state = current_screen - last_screen
# Store the transition in memory
agent.memory.push(state, action, next_state, reward)
# Move to the next state
state = next_state
# Perform one step of the optimization (on the target network)
agent.update()
# Update the target network, copying all weights and biases in DQN
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:', i_episode, ' Reward: %i' %int(ep_reward), 'Explore: %.2f' % agent.epsilon)
rewards.append(ep_reward)
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
print('Complete')
plot(rewards)
plot(moving_average_rewards,ylabel="moving_average_rewards")

codes/dqn_cnn/memory.py (new file, 37 lines)

@@ -0,0 +1,37 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 09:42:44
@LastEditor: John
@LastEditTime: 2020-06-11 15:50:33
@Discription:
@Environment: python 3.7.7
'''
from collections import namedtuple
import random
class ReplayBuffer(object):
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
self.Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))
def push(self, *args):
"""Saves a transition."""
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = self.Transition(*args)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
return random.sample(self.buffer, batch_size)
def __len__(self):
return len(self.buffer)

codes/dqn_cnn/model.py (new file, 41 lines)

@@ -0,0 +1,41 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 12:18:12
@LastEditor: John
@LastEditTime: 2020-06-11 17:23:45
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
def __init__(self, h, w, n_outputs):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
self.bn1 = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
self.bn2 = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
self.bn3 = nn.BatchNorm2d(32)
# Number of Linear input connections depends on output of conv2d layers
# and therefore the input image size, so compute it.
def conv2d_size_out(size, kernel_size = 5, stride = 2):
return (size - (kernel_size - 1) - 1) // stride + 1
convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
linear_input_size = convw * convh * 32
self.head = nn.Linear(linear_input_size, n_outputs)
# Called with either one element to determine next action, or a batch
# during optimization. Returns tensor([[left0exp,right0exp]...]).
def forward(self, x):
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
return self.head(x.view(x.size(0), -1))

codes/dqn_cnn/plot.py (new file, 24 lines)

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-11 22:27:24
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import numpy as np
import os
def plot(item,ylabel='rewards'):
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of CnnDQN')
plt.ylabel(ylabel)
plt.xlabel('episodes')
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.show()

Two binary image files added (34 KiB and 44 KiB); contents not shown.


@@ -0,0 +1,66 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:02:35
@LastEditor: John
@LastEditTime: 2020-06-11 16:57:34
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
resize = T.Compose([T.ToPILImage(),
T.Resize(40, interpolation=Image.CUBIC),
T.ToTensor()])
def get_cart_location(env,screen_width):
world_width = env.x_threshold * 2
scale = screen_width / world_width
return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART
def get_screen(env,device):
# Returned screen requested by gym is 400x600x3, but is sometimes larger
# such as 800x1200x3. Transpose it into torch order (CHW).
screen = env.render(mode='rgb_array').transpose((2, 0, 1))
# Cart is in the lower half, so strip off the top and bottom of the screen
_, screen_height, screen_width = screen.shape
screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)]
view_width = int(screen_width * 0.6)
cart_location = get_cart_location(env,screen_width)
if cart_location < view_width // 2:
slice_range = slice(view_width)
elif cart_location > (screen_width - view_width // 2):
slice_range = slice(-view_width, None)
else:
slice_range = slice(cart_location - view_width // 2,
cart_location + view_width // 2)
# Strip off the edges, so that we have a square image centered on a cart
screen = screen[:, :, slice_range]
# Convert to float, rescale, convert to torch tensor
# (this doesn't require a copy)
screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
screen = torch.from_numpy(screen)
# Resize, and add a batch dimension (BCHW)
return resize(screen).unsqueeze(0).to(device)
if __name__ == "__main__":
import gym
env = gym.make('CartPole-v0').unwrapped
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env.reset()
import matplotlib.pyplot as plt
plt.figure()
plt.imshow(get_screen(env,device).cpu().squeeze(0).permute(1, 2, 0).numpy(),
interpolation='none')
plt.title('Example extracted screen')
plt.show()


@@ -29,6 +29,7 @@
## Main Contributors
- [@qiwang067](https://github.com/qiwang067)
- [@JohnJim0816](https://github.com/JohnJim0816)
## Follow Us


@@ -10,7 +10,7 @@
The three key elements of reinforcement learning are states, actions, and rewards. A reinforcement learning agent interacts with the environment step by step: I first observe the state and then output an action, observe the state again, output another action, and collect the rewards. It is a sequential decision-making problem over time.
For example, at time $t-1$ I see a bear waving at me, and my instinctive action is to run. Seeing someone run, the bear may decide it has found prey and start to attack. At time $t$, if I instead choose to play dead, the bear may bite me and toss me around a few times, find it boring, and walk away; at that point, if I run again, I may actually escape. That is the kind of sequential decision process involved.
Of course, before outputting each action you can actually choose among different actions. For example, if at time $t$ the bear has already caught up and I choose to run rather than play dead, there are two ways the situation can transition to different states: with some probability I escape successfully, and with a rather large probability I fail. We use the state transition probability $p\left[s_{t+1}, r_{t} \mid s_{t}, a_{t}\right]$ to express the probability of transitioning to $s_{t+1}$ and receiving $r_t$ when action $a_t$ is taken in state $s_t$.
@@ -103,7 +103,7 @@ $$
![](img/2.14.png)
Mathematically, this way of reinforcement can be expressed in a single formula. This kind of update is called `temporal difference (TD)`. The formula says that the Q value of the next step, $Q(S_{t+1},A_{t+1})$, can be used to update the Q value of the current step, $Q(S_t,A_t)$.
To understand this formula, as shown in the figure, we first treat $R_{t+1}+\gamma Q\left(S_{t+1}, A_{t+1}\right)$ as a target value, the value that $Q(S_t,A_t)$ is supposed to approach. What we want to compute is $Q(S_t,A_t)$. Since the Q values are initialized randomly (or to zero) at the start, they need to keep approaching the ideal, true Q value, which we call the target. The target is roughly the discounted sum of future returns.
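Written out as an update rule, the TD idea above takes the following form (a restatement for reference; $\alpha$ denotes the learning rate, a symbol the surrounding excerpt does not name explicitly):

$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \right]
$$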
@@ -176,11 +176,14 @@ $$
![](img/2.19.png)
The update formulas of Sarsa and Q-learning are the same; they differ only in how the target is computed:
* Sarsa uses $R_{t+1}+\gamma Q(S_{t+1}, A_{t+1})$.
* Q-learning uses $R_{t+1}+\gamma \underset{a}{\max} Q\left(S_{t+1}, a\right)$.
Sarsa uses its own policy to generate the trajectory S, A, R, S', A' and then uses $Q(S_{t+1},A_{t+1})$ to update the original Q value $Q(S_t,A_t)$. Q-learning, however, does not need to know which action is actually selected next; it simply assumes the next action is the one with the largest Q value. Q-learning knows that the behavior policy may choose other actions with, say, 10% probability, but it is not worried about being affected by exploration: it optimizes the target policy as if acting optimally, so it can search for the optimal path more boldly and, in practice, behaves far more boldly than Sarsa.
When Q-learning is unrolled step by step, the only difference from Sarsa is that we do not need to know $A_2$ in advance in order to update $Q(S_1,A_1)$. In the flowchart for training one episode, Q-learning does not need the next action A' before calling learn; the tuple $(S,A,R,S')$ is enough. This is a clear difference from Sarsa. A [Python implementation of Q-learning](https://github.com/datawhalechina/leedeeprl-notes/tree/master/docs/code/Q-learning) is given here.
### Q-function Bellman Equation
@@ -195,7 +198,7 @@ $$
>The Bellman Equation expresses the iterative relationship between the current state and future states: the value function of the current state can be computed from the value function of the next state. The Bellman Equation is named after its proposer, Richard Bellman, the founder of dynamic programming, and is also called the "dynamic programming equation".
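In symbols, the statement above is commonly written for the state-value function as (a standard form added here for reference, not quoted from the chapter):

$$
V(s)=\mathbb{E}\left[R_{t+1}+\gamma V\left(S_{t+1}\right) \mid S_{t}=s\right]
$$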
From another perspective, computing the action value $Q^{\pi}(s_t,a_t)$ at time $t$ requires knowing the rewards at times $t$, $t+1$, $t+2 \cdots\cdots$; this means we would need to know all possible successor states of a given state together with their rewards, and perform a full-width backup to update the state's value. Such a method cannot be used when the state transition function is unknown or for large-scale problems. Therefore, Q-learning uses shallow temporal-difference sampling: when computing the cumulative reward, it predicts the next $n$ steps of actions under the current policy $\pi$ ($n$ can range from 1 to $+\infty$) and computes their reward.
Specifically, suppose that in state $s_t$ action $a_t$ is chosen and reward $r_t$ is obtained, and the state transitions to $s_{t+1}$; if action $a_{t+1}$ is then chosen in this state under the same policy, $Q^{\pi}(s_t,a_t)$ can be written as
$$


@@ -235,6 +235,8 @@ $$
The meaning of the advantage function is: suppose we take some action $a_t$ in some state $s_t$; how good is it compared with the other possible actions? What it cares about is not absolute goodness but `relative advantage`, i.e. how good the action is relatively. Because a baseline $b$ is subtracted, the quantity is relative rather than absolute. $A^{\theta}\left(s_{t}, a_{t}\right)$ can usually be estimated by a network, and this network is called the critic.
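One common choice of baseline is the state value, in which case the advantage can be written as (a standard identity added here for reference):

$$
A^{\theta}\left(s_{t}, a_{t}\right)=Q^{\theta}\left(s_{t}, a_{t}\right)-V^{\theta}\left(s_{t}\right)
$$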
## References
* [Intro to Reinforcement Learning (强化学习纲要)](https://github.com/zhoubolei/introRL)


@@ -168,7 +168,7 @@ What exactly is the KL divergence? Here I just treat the KL divergence as
![](img/4.9.png)
The PPO paper also has an `adaptive KL divergence`. The problem here is how large to set $\beta$. It is just like regularization: the regularization term is multiplied by a weight, so this KL divergence term is also multiplied by a weight, but what should $\beta$ be? Hence there is a method for adjusting $\beta$ dynamically. In this method, you first set a maximum KL divergence that you can accept. If you find that, after optimizing the objective, the KL divergence term is too large, it means the penalty term is not doing its job, so you increase $\beta$. You also set a minimum KL divergence. If, after optimizing the objective, the KL divergence is smaller than the minimum, it means the penalty term is too strong; you worry that the optimizer only minimizes that term, making $\theta$ identical to $\theta^k$, which is not what you want, so in that case you decrease $\beta$. In this way $\beta$ can be adjusted dynamically, which is called the adaptive KL penalty.
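A rough sketch of the adjustment rule described above (the threshold names `kl_min`/`kl_max` and the multiplicative factor of 2 are illustrative assumptions, not taken from the paper or the text):

```python
def adapt_beta(beta, kl, kl_min, kl_max, factor=2.0):
    # Strengthen the penalty when the measured KL exceeds the acceptable maximum,
    # weaken it when the KL falls below the acceptable minimum, otherwise keep it.
    if kl > kl_max:
        beta *= factor
    elif kl < kl_min:
        beta /= factor
    return beta
```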
![](img/4.10.png)