Update the companion code for the 蘑菇书 (Easy RL) book

johnjim0816
2022-12-04 20:54:36 +08:00
parent f030fe283d
commit dc8d13a13e
23 changed files with 10784 additions and 0 deletions

370
notebooks/A2C.ipynb Normal file

File diff suppressed because one or more lines are too long

559
notebooks/DDPG.ipynb Normal file

File diff suppressed because one or more lines are too long

541
notebooks/DQN.ipynb Normal file

File diff suppressed because one or more lines are too long

490
notebooks/DoubleDQN.ipynb Normal file

File diff suppressed because one or more lines are too long

482
notebooks/DuelingDQN.ipynb Normal file

File diff suppressed because one or more lines are too long

748
notebooks/MonteCarlo.ipynb Normal file

File diff suppressed because one or more lines are too long

582
notebooks/NoisyDQN.ipynb Normal file

File diff suppressed because one or more lines are too long

644
notebooks/PER_DQN.ipynb Normal file

File diff suppressed because one or more lines are too long

522
notebooks/PPO.ipynb Normal file

File diff suppressed because one or more lines are too long

142
notebooks/PPO暂存.md Normal file

@@ -0,0 +1,142 @@
## Overview
PPO is an on-policy algorithm with fairly strong performance. Its predecessor is TRPO, and both belong to the policy gradient family; PPO is also OpenAI's default reinforcement learning algorithm. For the theory, see the [PPO chapter of Easy RL](https://datawhalechina.github.io/easy-rl/#/chapter5/chapter5). PPO has two main variants, one using a KL penalty and one using clipping; this post implements the latter, i.e. ```PPO-clip```.
## Pseudocode
To implement the algorithm we first need to understand its pseudocode:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70.png)
This is a reasonably suitable figure found on Google (I was too lazy to redraw it). Here ```k``` denotes the ```k```-th episode. Step 6 optimizes the loss (the part after ```argmax```) with stochastic gradient methods, which may be a bit hard to follow; see the [PPO paper](https://arxiv.org/abs/1707.06347):
![在这里插入图片描述](assets/20210323154236878.png)
Step 7 is simply a squared loss, i.e. the squared difference between the actual return and the predicted value.
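For reference, step 6 maximizes the clipped surrogate objective from the paper, and step 7 minimizes the squared value error:

$$
L^{CLIP}(\theta)=\mathbb{E}_t\left[\min \left(r_t(\theta) \hat{A}_t,\ \operatorname{clip}\left(r_t(\theta), 1-\epsilon, 1+\epsilon\right) \hat{A}_t\right)\right], \quad r_t(\theta)=\frac{\pi_\theta\left(a_t \mid s_t\right)}{\pi_{\theta_{\text{old}}}\left(a_t \mid s_t\right)}
$$

$$
L^{VF}=\mathbb{E}_t\left[\left(V\left(s_t\right)-\hat{R}_t\right)^2\right]
$$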
## Hands-on code
[Click here for the full code](https://github.com/JohnJim0816/rl-tutorials/tree/master/PPO)
### PPOMemory
Step 3 requires collecting a trajectory, so we first define a ```PPOMemory``` to store the relevant quantities:
```python
import numpy as np

class PPOMemory:
def __init__(self, batch_size):
self.states = []
self.probs = []
self.vals = []
self.actions = []
self.rewards = []
self.dones = []
self.batch_size = batch_size
def sample(self):
batch_step = np.arange(0, len(self.states), self.batch_size)
indices = np.arange(len(self.states), dtype=np.int64)
np.random.shuffle(indices)
batches = [indices[i:i+self.batch_size] for i in batch_step]
return np.array(self.states),\
np.array(self.actions),\
np.array(self.probs),\
np.array(self.vals),\
np.array(self.rewards),\
np.array(self.dones),\
batches
def push(self, state, action, probs, vals, reward, done):
self.states.append(state)
self.actions.append(action)
self.probs.append(probs)
self.vals.append(vals)
self.rewards.append(reward)
self.dones.append(done)
def clear(self):
self.states = []
self.probs = []
self.actions = []
self.rewards = []
self.dones = []
self.vals = []
```
The push method stores each transition in the memory, while sample shuffles the stored indices and splits them into mini-batches for the stochastic gradient step in line 6 of the pseudocode.
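A quick usage sketch (the transition values below are made up purely for illustration):
```python
memory = PPOMemory(batch_size=5)
for t in range(20):  # pretend we collected 20 transitions
    memory.push(state=[0.0, 0.0], action=1, probs=-0.69, vals=0.5, reward=1.0, done=False)
states, actions, probs, vals, rewards, dones, batches = memory.sample()
print(len(batches), batches[0])  # 4 mini-batches of 5 shuffled indices each
memory.clear()
```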
### PPO model
The model consists of two networks, an actor and a critic:
```python
import torch.nn as nn
from torch.distributions.categorical import Categorical
class Actor(nn.Module):
def __init__(self,n_states, n_actions,
hidden_dim=256):
super(Actor, self).__init__()
self.actor = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, n_actions),
nn.Softmax(dim=-1)
)
def forward(self, state):
dist = self.actor(state)
dist = Categorical(dist)
return dist
class Critic(nn.Module):
def __init__(self, n_states,hidden_dim=256):
super(Critic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(n_states, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
def forward(self, state):
value = self.critic(state)
return value
```
Here the Actor outputs a probability distribution (a Categorical in this case; other distributions are possible, see torch.distributions), while the Critic maps the current state to a value. The critic's input dimension could also be ```n_states+n_actions```, i.e. feeding the action into the critic as well, which tends to work a bit better; interested readers can try it.
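A quick sanity check of the two networks (the dimensions below are assumed for illustration, e.g. CartPole-like):
```python
import torch

n_states, n_actions = 4, 2  # assumed dimensions for illustration
actor, critic = Actor(n_states, n_actions), Critic(n_states)
state = torch.rand(1, n_states)
dist = actor(state)  # Categorical distribution over actions
action = dist.sample()
print(action.item(), dist.log_prob(action).item(), critic(state).item())
```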
### PPO update
We define an update function that implements steps 6 and 7 of the pseudocode:
```python
def update(self):
for _ in range(self.n_epochs):
state_arr, action_arr, old_prob_arr, vals_arr,\
reward_arr, dones_arr, batches = \
self.memory.sample()
values = vals_arr
### compute advantage ###
advantage = np.zeros(len(reward_arr), dtype=np.float32)
for t in range(len(reward_arr)-1):
discount = 1
a_t = 0
for k in range(t, len(reward_arr)-1):
a_t += discount*(reward_arr[k] + self.gamma*values[k+1]*\
(1-int(dones_arr[k])) - values[k])
discount *= self.gamma*self.gae_lambda
advantage[t] = a_t
advantage = torch.tensor(advantage).to(self.device)
### SGD ###
values = torch.tensor(values).to(self.device)
for batch in batches:
states = torch.tensor(state_arr[batch], dtype=torch.float).to(self.device)
old_probs = torch.tensor(old_prob_arr[batch]).to(self.device)
actions = torch.tensor(action_arr[batch]).to(self.device)
dist = self.actor(states)
critic_value = self.critic(states)
critic_value = torch.squeeze(critic_value)
new_probs = dist.log_prob(actions)
prob_ratio = new_probs.exp() / old_probs.exp()
weighted_probs = advantage[batch] * prob_ratio
weighted_clipped_probs = torch.clamp(prob_ratio, 1-self.policy_clip,
1+self.policy_clip)*advantage[batch]
actor_loss = -torch.min(weighted_probs, weighted_clipped_probs).mean()
returns = advantage[batch] + values[batch]
critic_loss = (returns-critic_value)**2
critic_loss = critic_loss.mean()
total_loss = actor_loss + 0.5*critic_loss
self.actor_optimizer.zero_grad()
self.critic_optimizer.zero_grad()
total_loss.backward()
self.actor_optimizer.step()
self.critic_optimizer.step()
self.memory.clear()
```
This first pulls the collected trajectory out of the memory, computes the GAE advantage, runs the stochastic gradient updates, and finally clears the memory so that the next trajectory can be collected.
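The advantage computed in the nested loop above is the generalized advantage estimate (GAE), where $\lambda$ is ```gae_lambda``` and $d_t$ is the done flag:

$$
\delta_t=r_t+\gamma V\left(s_{t+1}\right)\left(1-d_t\right)-V\left(s_t\right), \qquad \hat{A}_t=\sum_{l=0}^{T-t-1}(\gamma \lambda)^l \delta_{t+l}
$$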
The resulting training curve looks like this:
![在这里插入图片描述](assets/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pvaG5KaW0w,size_16,color_FFFFFF,t_70-20210405110725113.png)


@@ -0,0 +1,202 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. 定义算法\n",
"\n",
"最基础的策略梯度算法就是REINFORCE算法又称作Monte-Carlo Policy Gradient算法。我们策略优化的目标如下\n",
"\n",
"$$\n",
"J_{\\theta}= \\Psi_{\\pi} \\nabla_\\theta \\log \\pi_\\theta\\left(a_t \\mid s_t\\right)\n",
"$$\n",
"\n",
"其中$\\Psi_{\\pi}$在REINFORCE算法中表示衰减的回报(具体公式见伪代码)也可以用优势来估计也就是我们熟知的A3C算法这个在后面包括GAE算法中都会讲到。\n",
"\n",
"### 1.1. 策略函数设计\n",
"\n",
"既然策略梯度是直接对策略函数进行梯度计算那么策略函数如何设计呢一般来讲有两种设计方式一个是softmax函数另外一个是高斯分布$\\mathbb{N}\\left(\\phi(\\mathbb{s})^{\\mathbb{\\pi}} \\theta, \\sigma^2\\right)$,前者用于离散动作空间,后者多用于连续动作空间。\n",
"\n",
"softmax函数可以表示为\n",
"$$\n",
"\\pi_\\theta(s, a)=\\frac{e^{\\phi(s, a)^{T_\\theta}}}{\\sum_b e^{\\phi(s, b)^{T^T}}}\n",
"$$\n",
"对应的梯度为:\n",
"$$\n",
"\\nabla_\\theta \\log \\pi_\\theta(s, a)=\\phi(s, a)-\\mathbb{E}_{\\pi_\\theta}[\\phi(s,)\n",
"$$\n",
"高斯分布对应的梯度为:\n",
"$$\n",
"\\nabla_\\theta \\log \\pi_\\theta(s, a)=\\frac{\\left(a-\\phi(s)^T \\theta\\right) \\phi(s)}{\\sigma^2}\n",
"$$\n",
"但是对于一些特殊的情况,例如在本次演示中动作维度=2且为离散空间这个时候可以用伯努利分布来实现这种方式其实是不推荐的这里给大家做演示也是为了展现一些特殊情况启发大家一些思考例如BernoulliBinomialGaussian分布之间的关系。简单说来Binomial分布$n = 1$时就是Bernoulli分布$n \\rightarrow \\infty$时就是Gaussian分布。\n",
"\n",
"\n"
]
},
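{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick check of the relationship mentioned above (illustrative only):\n",
"# a Binomial distribution with total_count=1 behaves exactly like a Bernoulli.\n",
"import torch\n",
"from torch.distributions import Bernoulli, Binomial\n",
"p = torch.tensor([0.7])\n",
"b, bin1 = Bernoulli(probs=p), Binomial(total_count=1, probs=p)\n",
"a = torch.tensor([1.0])\n",
"print(b.sample(), bin1.sample())        # both sample 0. or 1.\n",
"print(b.log_prob(a), bin1.log_prob(a))  # identical log-probabilities, log(0.7)\n"
]
},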
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2. 模型设计\n",
"\n",
"前面讲到尽管本次演示是离散空间但是由于动作维度等于2此时就可以用特殊的高斯分布来表示策略函数即伯努利分布。伯努利的分布实际上是用一个概率作为输入然后从中采样动作伯努利采样出来的动作只可能是0或1就像投掷出硬币的正反面。在这种情况下我们的策略模型就需要在MLP的基础上将状态作为输入将动作作为倒数第二层输出并在最后一层增加激活函数来输出对应动作的概率。不清楚激活函数作用的同学可以再看一遍深度学习相关的知识简单来说其作用就是增加神经网络的非线性。既然需要输出对应动作的概率那么输出的值需要处于0-1之间此时sigmoid函数刚好满足我们的需求实现代码参考如下。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"class PGNet(nn.Module):\n",
" def __init__(self, input_dim,output_dim,hidden_dim=128):\n",
" \"\"\" 初始化q网络为全连接网络\n",
" input_dim: 输入的特征数即环境的状态维度\n",
" output_dim: 输出的动作维度\n",
" \"\"\"\n",
" super(PGNet, self).__init__()\n",
" self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层\n",
" self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层\n",
" self.fc3 = nn.Linear(hidden_dim, output_dim) # 输出层\n",
" def forward(self, x):\n",
" x = F.relu(self.fc1(x))\n",
" x = F.relu(self.fc2(x))\n",
" x = torch.sigmoid(self.fc3(x))\n",
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.3. 更新函数设计\n",
"\n",
"前面提到我们的优化目标也就是策略梯度算法的损失函数如下:\n",
"$$\n",
"J_{\\theta}= \\Psi_{\\pi} \\nabla_\\theta \\log \\pi_\\theta\\left(a_t \\mid s_t\\right)\n",
"$$\n",
"\n",
"我们需要拆开成两个部分$\\Psi_{\\pi}$和$\\nabla_\\theta \\log \\pi_\\theta\\left(a_t \\mid s_t\\right)$分开计算,首先看值函数部分$\\Psi_{\\pi}$在REINFORCE算法中值函数是从当前时刻开始的衰减回报如下\n",
"$$\n",
"G \\leftarrow \\sum_{k=t+1}^{T} \\gamma^{k-1} r_{k}\n",
"$$\n",
"\n",
"这个实际用代码来实现的时候可能有点绕,我们可以倒过来看,在同一回合下,我们的终止时刻是$T$,那么对应的回报$G_T=\\gamma^{T-1}r_T$,而对应的$G_{T-1}=\\gamma^{T-2}r_{T-1}+\\gamma^{T-1}r_T$,在这里代码中我们使用了一个动态规划的技巧,如下:\n",
"```python\n",
"running_add = running_add * self.gamma + reward_pool[i] # running_add初始值为0\n",
"```\n",
"这个公式也是倒过来循环的,第一次的值等于:\n",
"$$\n",
"running\\_add = r_T\n",
"$$\n",
"第二次的值则等于:\n",
"$$\n",
"running\\_add = r_T*\\gamma+r_{T-1}\n",
"$$\n",
"第三次的值等于:\n",
"$$\n",
"running\\_add = (r_T*\\gamma+r_{T-1})*\\gamma+r_{T-2} = r_T*\\gamma^2+r_{T-1}*\\gamma+r_{T-2}\n",
"$$\n"
]
},
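{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A small sanity check of the backward discounting trick above, on a made-up reward list.\n",
"rewards, gamma = [1.0, 1.0, 1.0, 1.0], 0.9\n",
"returns, running_add = [0.0] * len(rewards), 0.0\n",
"for i in reversed(range(len(rewards))):\n",
"    running_add = running_add * gamma + rewards[i]\n",
"    returns[i] = running_add\n",
"print(returns)  # approximately [3.439, 2.71, 1.9, 1.0]\n"
]
},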
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.distributions import Bernoulli\n",
"from torch.autograd import Variable\n",
"import numpy as np\n",
"\n",
"class PolicyGradient:\n",
" \n",
" def __init__(self, model,memory,cfg):\n",
" self.gamma = cfg['gamma']\n",
" self.device = torch.device(cfg['device']) \n",
" self.memory = memory\n",
" self.policy_net = model.to(self.device)\n",
" self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg['lr'])\n",
"\n",
" def sample_action(self,state):\n",
"\n",
" state = torch.from_numpy(state).float()\n",
" state = Variable(state)\n",
" probs = self.policy_net(state)\n",
" m = Bernoulli(probs) # 伯努利分布\n",
" action = m.sample()\n",
" \n",
" action = action.data.numpy().astype(int)[0] # 转为标量\n",
" return action\n",
" def predict_action(self,state):\n",
"\n",
" state = torch.from_numpy(state).float()\n",
" state = Variable(state)\n",
" probs = self.policy_net(state)\n",
" m = Bernoulli(probs) # 伯努利分布\n",
" action = m.sample()\n",
" action = action.data.numpy().astype(int)[0] # 转为标量\n",
" return action\n",
" \n",
" def update(self):\n",
" state_pool,action_pool,reward_pool= self.memory.sample()\n",
" state_pool,action_pool,reward_pool = list(state_pool),list(action_pool),list(reward_pool)\n",
" # Discount reward\n",
" running_add = 0\n",
" for i in reversed(range(len(reward_pool))):\n",
" if reward_pool[i] == 0:\n",
" running_add = 0\n",
" else:\n",
" running_add = running_add * self.gamma + reward_pool[i]\n",
" reward_pool[i] = running_add\n",
"\n",
" # Normalize reward\n",
" reward_mean = np.mean(reward_pool)\n",
" reward_std = np.std(reward_pool)\n",
" for i in range(len(reward_pool)):\n",
" reward_pool[i] = (reward_pool[i] - reward_mean) / reward_std\n",
"\n",
" # Gradient Desent\n",
" self.optimizer.zero_grad()\n",
"\n",
" for i in range(len(reward_pool)):\n",
" state = state_pool[i]\n",
" action = Variable(torch.FloatTensor([action_pool[i]]))\n",
" reward = reward_pool[i]\n",
" state = Variable(torch.from_numpy(state).float())\n",
" probs = self.policy_net(state)\n",
" m = Bernoulli(probs)\n",
" loss = -m.log_prob(action) * reward # Negtive score function x reward\n",
" # print(loss)\n",
" loss.backward()\n",
" self.optimizer.step()\n",
" self.memory.clear()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('easyrl')",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.7.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8994a120d39b6e6a2ecc94b4007f5314b68aa69fc88a7f00edf21be39b41f49c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

23
notebooks/README.md Normal file

@@ -0,0 +1,23 @@
# Companion code for the 蘑菇书 (Easy RL) book
## Installation
Currently Python 3.7 and Gym 0.25.2 are supported.
Create a Conda environment (Anaconda needs to be installed first):
```bash
conda create -n joyrl python=3.7
conda activate joyrl
pip install -r requirements.txt
```
Install Torch:
```bash
# CPU
conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 cpuonly -c pytorch
# GPU
conda install pytorch==1.10.0 torchvision==0.11.0 torchaudio==0.10.0 cudatoolkit=11.3 -c pytorch -c conda-forge
# GPU install via pip (CUDA 11.3 wheels)
pip install torch==1.10.0+cu113 torchvision==0.11.0+cu113 torchaudio==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
```
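To verify the environment afterwards (a quick sanity check, assuming Gym was installed alongside the requirements):
```bash
python -c "import torch, gym; print(torch.__version__, gym.__version__)"
```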

896
notebooks/Sarsa.ipynb Normal file

File diff suppressed because one or more lines are too long



@@ -0,0 +1,232 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 值迭代算法\n",
"作者stzhao\n",
"github: https://github.com/zhaoshitian"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 一、定义环境\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import sys,os\n",
"curr_path = os.path.abspath('')\n",
"parent_path = os.path.dirname(curr_path)\n",
"sys.path.append(parent_path)\n",
"from envs.simple_grid import DrunkenWalkEnv"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"def all_seed(env,seed = 1):\n",
" ## 这个函数主要是为了固定随机种子\n",
" import numpy as np\n",
" import random\n",
" import os\n",
" env.seed(seed) \n",
" np.random.seed(seed)\n",
" random.seed(seed)\n",
" os.environ['PYTHONHASHSEED'] = str(seed) \n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"env = DrunkenWalkEnv(map_name=\"theAlley\")\n",
"all_seed(env, seed = 1) # 设置随机种子为1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 二、价值迭代算法\n"
]
},
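{
"cell_type": "markdown",
"metadata": {},
"source": [
"The function below repeatedly applies the Bellman optimality backup to a tabular Q-function until the largest change falls below the threshold $\\theta$ (or a maximum number of sweeps is reached):\n",
"$$\n",
"Q_{k+1}(s, a)=\\sum_{s^{\\prime}} p\\left(s^{\\prime} \\mid s, a\\right)\\left[r\\left(s, a, s^{\\prime}\\right)+\\gamma \\max _{a^{\\prime}} Q_k\\left(s^{\\prime}, a^{\\prime}\\right)\\right]\n",
"$$\n"
]
},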
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"def value_iteration(env, theta=0.005, discount_factor=0.9):\n",
" Q = np.zeros((env.nS, env.nA)) # 初始化一个Q表格\n",
" count = 0\n",
" while True:\n",
" delta = 0.0\n",
" Q_tmp = np.zeros((env.nS, env.nA))\n",
" for state in range(env.nS):\n",
" for a in range(env.nA):\n",
" accum = 0.0\n",
" reward_total = 0.0\n",
" for prob, next_state, reward, done in env.P[state][a]:\n",
" accum += prob* np.max(Q[next_state, :])\n",
" reward_total += prob * reward\n",
" Q_tmp[state, a] = reward_total + discount_factor * accum\n",
" delta = max(delta, abs(Q_tmp[state, a] - Q[state, a]))\n",
" Q = Q_tmp\n",
" \n",
" count += 1\n",
" if delta < theta or count > 100: # 这里设置了即使算法没有收敛跑100次也退出循环\n",
" break \n",
" return Q"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[2.25015697e+22 2.53142659e+22 4.50031394e+22 2.53142659e+22]\n",
" [2.81269621e+22 5.41444021e+22 1.01257064e+23 5.41444021e+22]\n",
" [6.32856648e+22 1.21824905e+23 2.27828393e+23 1.21824905e+23]\n",
" [1.42392746e+23 2.74106036e+23 5.12613885e+23 2.74106036e+23]\n",
" [3.20383678e+23 5.76690620e+23 1.15338124e+24 5.76690620e+23]\n",
" [7.20863276e+23 1.38766181e+24 2.59510779e+24 1.38766181e+24]\n",
" [1.62194237e+24 3.12223906e+24 5.83899253e+24 3.12223906e+24]\n",
" [3.64937033e+24 7.02503789e+24 1.31377332e+25 7.02503789e+24]\n",
" [8.21108325e+24 1.47799498e+25 2.95598997e+25 1.47799498e+25]\n",
" [1.84749373e+25 3.55642543e+25 6.65097743e+25 3.55642543e+25]\n",
" [4.15686089e+25 8.00195722e+25 1.49646992e+26 8.00195722e+25]\n",
" [9.35293701e+25 1.80044037e+26 3.36705732e+26 1.80044037e+26]\n",
" [5.89235032e+26 7.36543790e+26 7.57587898e+26 7.36543790e+26]]\n"
]
}
],
"source": [
"Q = value_iteration(env)\n",
"print(Q)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]\n"
]
}
],
"source": [
"policy = np.zeros([env.nS, env.nA]) # 初始化一个策略表格\n",
"for state in range(env.nS):\n",
" best_action = np.argmax(Q[state, :]) #根据价值迭代算法得到的Q表格选择出策略\n",
" policy[state, best_action] = 1\n",
"\n",
"policy = [int(np.argwhere(policy[i]==1)) for i in range(env.nS) ]\n",
"print(policy)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 三、测试"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"num_episode = 1000 # 测试1000次\n",
"def test(env,policy):\n",
" \n",
" rewards = [] # 记录所有回合的奖励\n",
" success = [] # 记录该回合是否成功走到终点\n",
" for i_ep in range(num_episode):\n",
" ep_reward = 0 # 记录每个episode的reward\n",
" state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合) 这里state=0\n",
" while True:\n",
" action = policy[state] # 根据算法选择一个动作\n",
" next_state, reward, done, _ = env.step(action) # 与环境进行一个交互\n",
" state = next_state # 更新状态\n",
" ep_reward += reward\n",
" if done:\n",
" break\n",
" if state==12: # 即走到终点\n",
" success.append(1)\n",
" else:\n",
" success.append(0)\n",
" rewards.append(ep_reward)\n",
" acc_suc = np.array(success).sum()/num_episode\n",
" print(\"测试的成功率是:\", acc_suc)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"测试的成功率是: 0.64\n"
]
}
],
"source": [
"test(env, policy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 ('RL')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "88a829278351aa402b7d6303191a511008218041c5cfdb889d81328a3ea60fbc"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -0,0 +1,153 @@
# This code is taken from OpenAI Baselines and is used to run multiple environments in parallel
# https://github.com/openai/baselines/tree/master/baselines/common/vec_env
import numpy as np
from multiprocessing import Process, Pipe
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x()
while True:
cmd, data = remote.recv()
if cmd == 'step':
ob, reward, done, info = env.step(data)
if done:
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
ob = env.reset()
remote.send(ob)
elif cmd == 'reset_task':
ob = env.reset_task()
remote.send(ob)
elif cmd == 'close':
remote.close()
break
elif cmd == 'get_spaces':
remote.send((env.observation_space, env.action_space))
else:
raise NotImplementedError
class VecEnv(object):
"""
An abstract asynchronous, vectorized environment.
"""
def __init__(self, num_envs, observation_space, action_space):
self.num_envs = num_envs
self.observation_space = observation_space
self.action_space = action_space
def reset(self):
"""
Reset all the environments and return an array of
observations, or a tuple of observation arrays.
If step_async is still doing work, that work will
be cancelled and step_wait() should not be called
until step_async() is invoked again.
"""
pass
def step_async(self, actions):
"""
Tell all the environments to start taking a step
with the given actions.
Call step_wait() to get the results of the step.
You should not call this if a step_async run is
already pending.
"""
pass
def step_wait(self):
"""
Wait for the step taken with step_async().
Returns (obs, rews, dones, infos):
- obs: an array of observations, or a tuple of
arrays of observations.
- rews: an array of rewards
- dones: an array of "episode done" booleans
- infos: a sequence of info objects
"""
pass
def close(self):
"""
Clean up the environments' resources.
"""
pass
def step(self, actions):
self.step_async(actions)
return self.step_wait()
class CloudpickleWrapper(object):
"""
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
"""
def __init__(self, x):
self.x = x
def __getstate__(self):
import cloudpickle
return cloudpickle.dumps(self.x)
def __setstate__(self, ob):
import pickle
self.x = pickle.loads(ob)
class SubprocVecEnv(VecEnv):
def __init__(self, env_fns, spaces=None):
"""
envs: list of gym environments to run in subprocesses
"""
self.waiting = False
self.closed = False
nenvs = len(env_fns)
self.nenvs = nenvs
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
p.daemon = True # if the main process crashes, we should not cause things to hang
p.start()
for remote in self.work_remotes:
remote.close()
self.remotes[0].send(('get_spaces', None))
observation_space, action_space = self.remotes[0].recv()
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def step_async(self, actions):
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
self.waiting = True
def step_wait(self):
results = [remote.recv() for remote in self.remotes]
self.waiting = False
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
for remote in self.remotes:
remote.send(('reset', None))
return np.stack([remote.recv() for remote in self.remotes])
def reset_task(self):
for remote in self.remotes:
remote.send(('reset_task', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self):
if self.closed:
return
if self.waiting:
for remote in self.remotes:
remote.recv()
for remote in self.remotes:
remote.send(('close', None))
for p in self.ps:
p.join()
self.closed = True
def __len__(self):
return self.nenvs
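# A minimal usage sketch (assumes gym and cloudpickle are installed and 'CartPole-v1' is registered):
if __name__ == "__main__":
    import gym
    def make_env():
        return gym.make('CartPole-v1')
    envs = SubprocVecEnv([make_env for _ in range(4)])  # run 4 environments in subprocesses
    obs = envs.reset()                                  # stacked observations, shape (4, obs_dim)
    obs, rewards, dones, infos = envs.step([envs.action_space.sample() for _ in range(4)])
    envs.close()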

243
notebooks/envs/racetrack.py Normal file

@@ -0,0 +1,243 @@
import time
import random
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete,Box
from gym import Env
from matplotlib import colors
class RacetrackEnv(Env) :
"""
Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111).
Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking.
The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have
included rather verbose comments here for those of you who are interested in how the environment has been
implemented (though this should not impact your solution code).
"""
ACTIONS_DICT = {
0 : (1, -1), # Acc Vert., Brake Horiz.
1 : (1, 0), # Acc Vert., Hold Horiz.
2 : (1, 1), # Acc Vert., Acc Horiz.
3 : (0, -1), # Hold Vert., Brake Horiz.
4 : (0, 0), # Hold Vert., Hold Horiz.
5 : (0, 1), # Hold Vert., Acc Horiz.
6 : (-1, -1), # Brake Vert., Brake Horiz.
7 : (-1, 0), # Brake Vert., Hold Horiz.
8 : (-1, 1) # Brake Vert., Acc Horiz.
}
CELL_TYPES_DICT = {
0 : "track",
1 : "wall",
2 : "start",
3 : "goal"
}
metadata = {'render_modes': ['human'],
"render_fps": 4,}
def __init__(self,render_mode = 'human') :
# Load racetrack map from file.
self.track = np.flip(np.loadtxt(os.path.dirname(__file__)+"/track.txt", dtype = int), axis = 0)
# Discover start grid squares.
self.initial_states = []
for y in range(self.track.shape[0]) :
for x in range(self.track.shape[1]) :
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
self.initial_states.append((y, x))
high= np.array([np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max])
self.observation_space = Box(low=-high, high=high, shape=(4,), dtype=np.float32)
self.action_space = Discrete(9)
self.is_reset = False
def step(self, action : int) :
"""
Takes a given action in the environment's current state, and returns a next state,
reward, and whether the next state is done or not.
Arguments:
action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8].
Raises:
RuntimeError: Raised when the environment needs resetting.\n
TypeError: Raised when an action of an invalid type is given.\n
ValueError: Raised when an action outside the range [0-8] is given.\n
Returns:
A tuple of:\n
{(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n
{int} -- The reward earned by taking the given action in the current environment state.\n
{bool} -- Whether the environment's next state is done or not.\n
"""
# Check whether a reset is needed.
if (not self.is_reset) :
raise RuntimeError(".step() has been called when .reset() is needed.\n" +
"You need to call .reset() before using .step() for the first time, and after an episode ends.\n" +
".reset() initialises the environment at the start of an episode, then returns an initial state.")
# Check that action is the correct type (either a python integer or a numpy integer).
if (not (isinstance(action, int) or isinstance(action, np.integer))) :
raise TypeError("action should be an integer.\n" +
"action value {} of type {} was supplied.".format(action, type(action)))
# Check that action is an allowed value.
if (action < 0 or action > 8) :
raise ValueError("action must be an integer in the range [0-8] corresponding to one of the legal actions.\n" +
"action value {} was supplied.".format(action))
# Update Velocity.
# With probability 0.8, update velocity components as intended.
if (np.random.uniform() < 0.8) :
(d_y, d_x) = self.ACTIONS_DICT[action]
# With probability 0.2, do not change velocity components.
else :
(d_y, d_x) = (0, 0)
self.velocity = (self.velocity[0] + d_y, self.velocity[1] + d_x)
# Keep velocity components within bounds [-10, 10] (self.velocity is a tuple, so rebuild it rather than assigning to its items).
self.velocity = (max(-10, min(10, self.velocity[0])), max(-10, min(10, self.velocity[1])))
# Update Position.
new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1])
reward = 0
done = False
# If position is out-of-bounds, return to start and set velocity components to zero.
if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
# If position is in a wall grid-square, return to start and set velocity components to zero.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "wall") :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
# If position is in a track grid-square or a start-square, update position.
elif (self.CELL_TYPES_DICT[self.track[new_position]] in ["track", "start"]) :
self.position = new_position
# If position is in a goal grid-square, end episode.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") :
self.position = new_position
reward += 10
done = True
# If this gets reached, then the student has touched something they shouldn't have. Naughty!
else :
raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!")
# Penalise every timestep.
reward -= 1
# Require a reset if the current state is done.
if (done) :
self.is_reset = False
# Return next state, reward, and whether the episode has ended.
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]]), reward, done,{}
def reset(self,seed=None) :
"""
Resets the environment, ready for a new episode to begin, then returns an initial state.
The initial state will be a starting grid square randomly chosen using a uniform distribution,
with both components of the velocity being zero.
Returns:
{(int, int, int, int)} -- an initial state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).
"""
# Pick random starting grid-square.
self.position = random.choice(self.initial_states)
# Set both velocity components to zero.
self.velocity = (0, 0)
self.is_reset = True
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]])
def render(self, render_mode = 'human') :
"""
Renders a pretty matplotlib plot representing the current state of the environment.
Calling this method on subsequent timesteps will update the plot.
This is VERY VERY SLOW and will slow down training a lot. Only use for debugging/testing.
Arguments:
sleep_time {float} -- How many seconds (or partial seconds) you want to wait on this rendered frame.
"""
# Turn interactive render_mode on.
plt.ion()
fig = plt.figure(num = "env_render")
ax = plt.gca()
ax.clear()
clear_output(wait = True)
# Prepare the environment plot and mark the car's position.
env_plot = np.copy(self.track)
env_plot[self.position] = 4
env_plot = np.flip(env_plot, axis = 0)
# Plot the gridworld.
cmap = colors.ListedColormap(["white", "black", "green", "red", "yellow"])
bounds = list(range(6))
norm = colors.BoundaryNorm(bounds, cmap.N)
ax.imshow(env_plot, cmap = cmap, norm = norm, zorder = 0)
# Plot the velocity.
if (not self.velocity == (0, 0)) :
ax.arrow(self.position[1], self.track.shape[0] - 1 - self.position[0], self.velocity[1], -self.velocity[0],
path_effects=[pe.Stroke(linewidth=1, foreground='black')], color = "yellow", width = 0.1, length_includes_head = True, zorder = 2)
# Set up axes.
ax.grid(which = 'major', axis = 'both', linestyle = '-', color = 'k', linewidth = 2, zorder = 1)
ax.set_xticks(np.arange(-0.5, self.track.shape[1] , 1));
ax.set_xticklabels([])
ax.set_yticks(np.arange(-0.5, self.track.shape[0], 1));
ax.set_yticklabels([])
# Draw everything.
#fig.canvas.draw()
#fig.canvas.flush_events()
plt.show()
# time sleep
time.sleep(0.1)
def get_actions(self) :
"""
Returns the available actions in the current state - will always be a list
of integers in the range [0-8].
"""
return [*self.ACTIONS_DICT]
if __name__ == "__main__":
num_steps = 1000000
env = RacetrackEnv()
state = env.reset()
print(state)
for _ in range(num_steps) :
next_state, reward, done,_ = env.step(random.choice(env.get_actions()))
print(next_state)
env.render()
if (done) :
_ = env.reset()


@@ -0,0 +1,303 @@
#!/usr/bin/env python
# simple_grid.py
# based on frozen_lake.py
# adapted by Frans Oliehoek.
#
import sys
from contextlib import closing
import numpy as np
from io import StringIO
#from six import StringIO, b
import gym
from gym import utils
from gym import Env, spaces
from gym.utils import seeding
def categorical_sample(prob_n, np_random):
"""
Sample from categorical distribution
Each row specifies class probabilities
"""
prob_n = np.asarray(prob_n)
csprob_n = np.cumsum(prob_n)
return (csprob_n > np_random.rand()).argmax()
class DiscreteEnv(Env):
"""
Has the following members
- nS: number of states
- nA: number of actions
- P: transitions (*)
- isd: initial state distribution (**)
(*) dictionary of lists, where
P[s][a] == [(probability, nextstate, reward, done), ...]
(**) list or array of length nS
"""
def __init__(self, nS, nA, P, isd):
self.P = P
self.isd = isd
self.lastaction = None # for rendering
self.nS = nS
self.nA = nA
self.action_space = spaces.Discrete(self.nA)
self.observation_space = spaces.Discrete(self.nS)
self.seed()
self.s = categorical_sample(self.isd, self.np_random)
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self.s = categorical_sample(self.isd, self.np_random)
self.lastaction = None
return int(self.s)
def step(self, a):
transitions = self.P[self.s][a]
i = categorical_sample([t[0] for t in transitions], self.np_random)
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
MAPS = {
"theAlley": [
"S...H...H...G"
],
"walkInThePark": [
"S.......",
".....H..",
"........",
"......H.",
"........",
"...H...G"
],
"1Dtest": [
],
"4x4": [
"S...",
".H.H",
"...H",
"H..G"
],
"8x8": [
"S.......",
"........",
"...H....",
".....H..",
"...H....",
".HH...H.",
".H..H.H.",
"...H...G"
],
}
POTHOLE_PROB = 0.2
BROKEN_LEG_PENALTY = -5
SLEEP_DEPRIVATION_PENALTY = -0.0
REWARD = 10
def generate_random_map(size=8, p=0.8):
"""Generates a random valid map (one that has a path from start to goal)
:param size: size of each side of the grid
:param p: probability that a tile is frozen
"""
valid = False
# DFS to check that it's a valid path.
def is_valid(res):
frontier, discovered = [], set()
frontier.append((0,0))
while frontier:
r, c = frontier.pop()
if not (r,c) in discovered:
discovered.add((r,c))
directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
for x, y in directions:
r_new = r + x
c_new = c + y
if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
continue
if res[r_new][c_new] == 'G':
return True
if (res[r_new][c_new] not in '#H'):
frontier.append((r_new, c_new))
return False
while not valid:
p = min(1, p)
res = np.random.choice(['.', 'H'], (size, size), p=[p, 1-p])
res[0][0] = 'S'
res[-1][-1] = 'G'
valid = is_valid(res)
return ["".join(x) for x in res]
class DrunkenWalkEnv(DiscreteEnv):
"""
A simple grid environment, completely based on the code of 'FrozenLake', credits to
the original authors.
You're finding your way home (G) after a great party which was happening at (S).
Unfortunately, due to recreational intoxication you find yourself only moving into
the intended direction 80% of the time, and perpendicular to that the other 20%.
To make matters worse, the local community has been cutting the budgets for pavement
maintenance, which means that the way to home is full of potholes, which are very likely
to make you trip. If you fall, you are obviously magically transported back to the party,
without getting some of that hard-earned sleep.
S...
.H.H
...H
H..G
S : starting point
. : normal pavement
H : pothole, you have a POTHOLE_PROB chance of tripping
G : goal, time for bed
The episode ends when you reach the goal or trip.
You receive a reward of +10 if you reach the goal,
a BROKEN_LEG_PENALTY if you trip in a pothole, and a SLEEP_DEPRIVATION_PENALTY otherwise.
"""
metadata = {'render.modes': ['human', 'ansi']}
def __init__(self, desc=None, map_name="4x4",is_slippery=True):
""" This generates a map and sets all transition probabilities.
(by passing constructed nS, nA, P, isd to DiscreteEnv)
"""
if desc is None and map_name is None:
desc = generate_random_map()
elif desc is None:
desc = MAPS[map_name]
self.desc = desc = np.asarray(desc,dtype='c')
self.nrow, self.ncol = nrow, ncol = desc.shape
self.reward_range = (0, 1)
nA = 4
nS = nrow * ncol
isd = np.array(desc == b'S').astype('float64').ravel()
isd /= isd.sum()
# We need to pass 'P' to DiscreteEnv:
# P dictionary dict of dicts of lists, where
# P[s][a] == [(probability, nextstate, reward, done), ...]
P = {s : {a : [] for a in range(nA)} for s in range(nS)}
def convert_rc_to_s(row, col):
return row*ncol + col
#def inc(row, col, a):
def intended_destination(row, col, a):
if a == LEFT:
col = max(col-1,0)
elif a == DOWN:
row = min(row+1,nrow-1)
elif a == RIGHT:
col = min(col+1,ncol-1)
elif a == UP:
row = max(row-1,0)
return (row, col)
def construct_transition_for_intended(row, col, a, prob, li):
""" this constructs a transition to the "intended_destination(row, col, a)"
and adds it to the transition list (which could be for a different action b).
"""
newrow, newcol = intended_destination(row, col, a)
newstate = convert_rc_to_s(newrow, newcol)
newletter = desc[newrow, newcol]
done = bytes(newletter) in b'G'
rew = REWARD if newletter == b'G' else SLEEP_DEPRIVATION_PENALTY
li.append( (prob, newstate, rew, done) )
#THIS IS WHERE THE MATRIX OF TRANSITION PROBABILITIES IS COMPUTED.
for row in range(nrow):
for col in range(ncol):
# specify transitions for s=(row, col)
s = convert_rc_to_s(row, col)
letter = desc[row, col]
for a in range(4):
# specify transitions for action a
li = P[s][a]
if letter in b'G':
# We are at the goal ('G')....
# This is a strange case:
# - conceptually, we can think of this as:
# always transition to a 'terminated' state where we will get 0 reward.
#
# - But in gym, in practice, this case should not be happening at all!!!
# Gym will already have returned 'done' when transitioning TO the goal state (not from it).
# So we will never use the transition probabilities *from* the goal state.
# So, from gym's perspective we could specify anything we like here. E.g.,:
# li.append((1.0, 59, 42000000, True))
#
# However, if we want to be able to use the transition matrix to do value iteration, it is important
# that we get 0 reward ever after.
li.append((1.0, s, 0, True))
elif letter in b'H':
#We are at a pothole ('H')
#when we are at a pothole, we trip with prob. POTHOLE_PROB
li.append((POTHOLE_PROB, s, BROKEN_LEG_PENALTY, True))
construct_transition_for_intended(row, col, a, 1.0 - POTHOLE_PROB, li)
else:
# We are at normal pavement (.)
# with prob. 0.8 we move as intended:
construct_transition_for_intended(row, col, a, 0.8, li)
# but with prob. 0.1 we move sideways to intended:
for b in [(a-1)%4, (a+1)%4]:
construct_transition_for_intended(row, col, b, 0.1, li)
super(DrunkenWalkEnv, self).__init__(nS, nA, P, isd)
def action_to_string(self, action_index):
s ="{}".format(["Left","Down","Right","Up"][action_index])
return s
def render(self, mode='human'):
outfile = StringIO() if mode == 'ansi' else sys.stdout
row, col = self.s // self.ncol, self.s % self.ncol
desc = self.desc.tolist()
desc = [[c.decode('utf-8') for c in line] for line in desc]
desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
if self.lastaction is not None:
outfile.write(" (last action was '{action}')\n".format( action=self.action_to_string(self.lastaction) ) )
else:
outfile.write("\n")
outfile.write("\n".join(''.join(line) for line in desc)+"\n")
if mode != 'human':
with closing(outfile):
return outfile.getvalue()
if __name__ == "__main__":
# env = DrunkenWalkEnv(map_name="walkInThePark")
env = DrunkenWalkEnv(map_name="theAlley")
n_states = env.observation_space.n
n_actions = env.action_space.n
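    # A short random rollout as a sanity check (illustrative only).
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        total_reward += reward
    print("episode return:", total_reward)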

15
notebooks/envs/track.txt Normal file

@@ -0,0 +1,15 @@
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Binary file not shown (image, 121 KiB).


@@ -0,0 +1,11 @@
pyyaml==6.0
ipykernel==6.15.1
jupyter==1.0.0
matplotlib==3.5.3
seaborn==0.12.1
dill==0.3.5.1
argparse==1.4.0
pandas==1.3.5
pyglet==1.5.26
importlib-metadata<5.0
setuptools==65.2.0