Update the companion code for 蘑菇书 (the Mushroom Book)

This commit is contained in:
johnjim0816
2022-12-04 20:54:36 +08:00
parent f030fe283d
commit dc8d13a13e
23 changed files with 10784 additions and 0 deletions

243
notebooks/envs/racetrack.py Normal file

@@ -0,0 +1,243 @@
import time
import random
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete,Box
from gym import Env
from matplotlib import colors
class RacetrackEnv(Env) :
"""
Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111).
Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking.
The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have
included rather verbose comments here for those of you who are interested in how the environment has been
    implemented (though this should not impact your solution code).
"""
ACTIONS_DICT = {
0 : (1, -1), # Acc Vert., Brake Horiz.
1 : (1, 0), # Acc Vert., Hold Horiz.
2 : (1, 1), # Acc Vert., Acc Horiz.
3 : (0, -1), # Hold Vert., Brake Horiz.
4 : (0, 0), # Hold Vert., Hold Horiz.
5 : (0, 1), # Hold Vert., Acc Horiz.
6 : (-1, -1), # Brake Vert., Brake Horiz.
7 : (-1, 0), # Brake Vert., Hold Horiz.
8 : (-1, 1) # Brake Vert., Acc Horiz.
}
CELL_TYPES_DICT = {
0 : "track",
1 : "wall",
2 : "start",
3 : "goal"
}
metadata = {'render_modes': ['human'],
"render_fps": 4,}
def __init__(self,render_mode = 'human') :
# Load racetrack map from file.
self.track = np.flip(np.loadtxt(os.path.dirname(__file__)+"/track.txt", dtype = int), axis = 0)
# Discover start grid squares.
self.initial_states = []
for y in range(self.track.shape[0]) :
for x in range(self.track.shape[1]) :
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
self.initial_states.append((y, x))
high= np.array([np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max])
self.observation_space = Box(low=-high, high=high, shape=(4,), dtype=np.float32)
self.action_space = Discrete(9)
self.is_reset = False
def step(self, action : int) :
"""
Takes a given action in the environment's current state, and returns a next state,
reward, and whether the next state is done or not.
Arguments:
action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8].
Raises:
RuntimeError: Raised when the environment needs resetting.\n
TypeError: Raised when an action of an invalid type is given.\n
ValueError: Raised when an action outside the range [0-8] is given.\n
Returns:
A tuple of:\n
{(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n
{int} -- The reward earned by taking the given action in the current environment state.\n
{bool} -- Whether the environment's next state is done or not.\n
"""
# Check whether a reset is needed.
if (not self.is_reset) :
raise RuntimeError(".step() has been called when .reset() is needed.\n" +
"You need to call .reset() before using .step() for the first time, and after an episode ends.\n" +
".reset() initialises the environment at the start of an episode, then returns an initial state.")
# Check that action is the correct type (either a python integer or a numpy integer).
if (not (isinstance(action, int) or isinstance(action, np.integer))) :
raise TypeError("action should be an integer.\n" +
"action value {} of type {} was supplied.".format(action, type(action)))
# Check that action is an allowed value.
if (action < 0 or action > 8) :
raise ValueError("action must be an integer in the range [0-8] corresponding to one of the legal actions.\n" +
"action value {} was supplied.".format(action))
# Update Velocity.
        # With probability 0.8, update the velocity components as intended.
        if (np.random.uniform() < 0.8) :
            (d_y, d_x) = self.ACTIONS_DICT[action]
        # With probability 0.2, do not change the velocity components.
        else :
            (d_y, d_x) = (0, 0)
        self.velocity = (self.velocity[0] + d_y, self.velocity[1] + d_x)
        # Keep each velocity component within the bounds [-10, 10].
        # self.velocity is a tuple, so it is rebuilt rather than assigned to in place.
        self.velocity = (max(-10, min(10, self.velocity[0])),
                         max(-10, min(10, self.velocity[1])))
# Update Position.
new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1])
reward = 0
done = False
# If position is out-of-bounds, return to start and set velocity components to zero.
if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
# If position is in a wall grid-square, return to start and set velocity components to zero.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "wall") :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
        # If position is in a track grid-square or a start grid-square, update position.
elif (self.CELL_TYPES_DICT[self.track[new_position]] in ["track", "start"]) :
self.position = new_position
# If position is in a goal grid-square, end episode.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") :
self.position = new_position
reward += 10
done = True
# If this gets reached, then the student has touched something they shouldn't have. Naughty!
else :
raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!")
# Penalise every timestep.
reward -= 1
# Require a reset if the current state is done.
if (done) :
self.is_reset = False
# Return next state, reward, and whether the episode has ended.
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]]), reward, done,{}
def reset(self,seed=None) :
"""
Resets the environment, ready for a new episode to begin, then returns an initial state.
The initial state will be a starting grid square randomly chosen using a uniform distribution,
with both components of the velocity being zero.
Returns:
{(int, int, int, int)} -- an initial state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).
"""
# Pick random starting grid-square.
self.position = random.choice(self.initial_states)
# Set both velocity components to zero.
self.velocity = (0, 0)
self.is_reset = True
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]])
def render(self, render_mode = 'human') :
"""
Renders a pretty matplotlib plot representing the current state of the environment.
Calling this method on subsequent timesteps will update the plot.
        This is VERY VERY SLOW and will slow down training a lot. Only use for debugging/testing.
        Arguments:
            render_mode {str} -- The render mode; only 'human' is supported. Each call pauses on the rendered frame for 0.1 seconds.
"""
# Turn interactive render_mode on.
plt.ion()
fig = plt.figure(num = "env_render")
ax = plt.gca()
ax.clear()
clear_output(wait = True)
# Prepare the environment plot and mark the car's position.
env_plot = np.copy(self.track)
env_plot[self.position] = 4
env_plot = np.flip(env_plot, axis = 0)
# Plot the gridworld.
cmap = colors.ListedColormap(["white", "black", "green", "red", "yellow"])
bounds = list(range(6))
norm = colors.BoundaryNorm(bounds, cmap.N)
ax.imshow(env_plot, cmap = cmap, norm = norm, zorder = 0)
# Plot the velocity.
if (not self.velocity == (0, 0)) :
ax.arrow(self.position[1], self.track.shape[0] - 1 - self.position[0], self.velocity[1], -self.velocity[0],
path_effects=[pe.Stroke(linewidth=1, foreground='black')], color = "yellow", width = 0.1, length_includes_head = True, zorder = 2)
# Set up axes.
ax.grid(which = 'major', axis = 'both', linestyle = '-', color = 'k', linewidth = 2, zorder = 1)
ax.set_xticks(np.arange(-0.5, self.track.shape[1] , 1));
ax.set_xticklabels([])
ax.set_yticks(np.arange(-0.5, self.track.shape[0], 1));
ax.set_yticklabels([])
# Draw everything.
#fig.canvas.draw()
#fig.canvas.flush_events()
plt.show()
# time sleep
time.sleep(0.1)
def get_actions(self) :
"""
Returns the available actions in the current state - will always be a list
of integers in the range [0-8].
"""
return [*self.ACTIONS_DICT]
if __name__ == "__main__":
num_steps = 1000000
env = RacetrackEnv()
state = env.reset()
print(state)
for _ in range(num_steps) :
next_state, reward, done,_ = env.step(random.choice(env.get_actions()))
print(next_state)
env.render()
if (done) :
_ = env.reset()
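# ---------------------------------------------------------------------------
# A minimal sketch (illustrative only, not part of the original coursework code):
# tabular epsilon-greedy Q-learning on RacetrackEnv. Its main point is that the
# np.array state returned by reset()/step() has to be converted to a hashable
# tuple before it can index a Q-table. The hyper-parameters n_episodes, alpha,
# gamma and epsilon below are assumed values chosen for the example.
from collections import defaultdict

def q_learning_demo(n_episodes=50, alpha=0.1, gamma=0.99, epsilon=0.1):
    env = RacetrackEnv()
    n_actions = len(env.get_actions())
    Q = defaultdict(lambda: np.zeros(n_actions))
    for _ in range(n_episodes):
        state = tuple(env.reset())
        done = False
        while not done:
            # Epsilon-greedy action selection over the tabular Q-values.
            if random.random() < epsilon:
                action = random.choice(env.get_actions())
            else:
                action = int(np.argmax(Q[state]))
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            # One-step Q-learning update; bootstrap only from non-terminal states.
            target = reward + (0.0 if done else gamma * np.max(Q[next_state]))
            Q[state][action] += alpha * (target - Q[state][action])
            state = next_state
    return Q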

303
notebooks/envs/simple_grid.py Normal file

@@ -0,0 +1,303 @@
#!/usr/bin/env python
# simple_grid.py
# based on frozen_lake.py
# adapted by Frans Oliehoek.
#
import sys
from contextlib import closing
import numpy as np
from io import StringIO
#from six import StringIO, b
import gym
from gym import utils
from gym import Env, spaces
from gym.utils import seeding
def categorical_sample(prob_n, np_random):
"""
Sample from categorical distribution
Each row specifies class probabilities
"""
prob_n = np.asarray(prob_n)
csprob_n = np.cumsum(prob_n)
return (csprob_n > np_random.rand()).argmax()
class DiscreteEnv(Env):
"""
Has the following members
- nS: number of states
- nA: number of actions
- P: transitions (*)
- isd: initial state distribution (**)
(*) dictionary of lists, where
P[s][a] == [(probability, nextstate, reward, done), ...]
(**) list or array of length nS
"""
def __init__(self, nS, nA, P, isd):
self.P = P
self.isd = isd
self.lastaction = None # for rendering
self.nS = nS
self.nA = nA
self.action_space = spaces.Discrete(self.nA)
self.observation_space = spaces.Discrete(self.nS)
self.seed()
self.s = categorical_sample(self.isd, self.np_random)
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self.s = categorical_sample(self.isd, self.np_random)
self.lastaction = None
return int(self.s)
def step(self, a):
transitions = self.P[self.s][a]
i = categorical_sample([t[0] for t in transitions], self.np_random)
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
MAPS = {
"theAlley": [
"S...H...H...G"
],
"walkInThePark": [
"S.......",
".....H..",
"........",
"......H.",
"........",
"...H...G"
],
"1Dtest": [
],
"4x4": [
"S...",
".H.H",
"...H",
"H..G"
],
"8x8": [
"S.......",
"........",
"...H....",
".....H..",
"...H....",
".HH...H.",
".H..H.H.",
"...H...G"
],
}
POTHOLE_PROB = 0.2
BROKEN_LEG_PENALTY = -5
SLEEP_DEPRIVATION_PENALTY = -0.0
REWARD = 10
def generate_random_map(size=8, p=0.8):
"""Generates a random valid map (one that has a path from start to goal)
:param size: size of each side of the grid
:param p: probability that a tile is frozen
"""
valid = False
# DFS to check that it's a valid path.
def is_valid(res):
frontier, discovered = [], set()
frontier.append((0,0))
while frontier:
r, c = frontier.pop()
if not (r,c) in discovered:
discovered.add((r,c))
directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
for x, y in directions:
r_new = r + x
c_new = c + y
if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
continue
if res[r_new][c_new] == 'G':
return True
if (res[r_new][c_new] not in '#H'):
frontier.append((r_new, c_new))
return False
while not valid:
p = min(1, p)
res = np.random.choice(['.', 'H'], (size, size), p=[p, 1-p])
res[0][0] = 'S'
res[-1][-1] = 'G'
valid = is_valid(res)
return ["".join(x) for x in res]
class DrunkenWalkEnv(DiscreteEnv):
"""
A simple grid environment, completely based on the code of 'FrozenLake', credits to
the original authors.
You're finding your way home (G) after a great party which was happening at (S).
    Unfortunately, due to recreational intoxication you find yourself moving in the
    intended direction only 80% of the time, and perpendicular to it the other 20%.
    To make matters worse, the local community has been cutting the budget for pavement
    maintenance, which means that the way home is full of potholes, which are very likely
to make you trip. If you fall, you are obviously magically transported back to the party,
without getting some of that hard-earned sleep.
S...
.H.H
...H
H..G
S : starting point
. : normal pavement
H : pothole, you have a POTHOLE_PROB chance of tripping
G : goal, time for bed
The episode ends when you reach the goal or trip.
    You receive a reward of +10 if you reach the goal, a BROKEN_LEG_PENALTY if you trip
    in a pothole, and a SLEEP_DEPRIVATION_PENALTY for every other step.
"""
metadata = {'render.modes': ['human', 'ansi']}
def __init__(self, desc=None, map_name="4x4",is_slippery=True):
""" This generates a map and sets all transition probabilities.
(by passing constructed nS, nA, P, isd to DiscreteEnv)
"""
if desc is None and map_name is None:
desc = generate_random_map()
elif desc is None:
desc = MAPS[map_name]
self.desc = desc = np.asarray(desc,dtype='c')
self.nrow, self.ncol = nrow, ncol = desc.shape
self.reward_range = (0, 1)
nA = 4
nS = nrow * ncol
isd = np.array(desc == b'S').astype('float64').ravel()
isd /= isd.sum()
# We need to pass 'P' to DiscreteEnv:
# P dictionary dict of dicts of lists, where
# P[s][a] == [(probability, nextstate, reward, done), ...]
P = {s : {a : [] for a in range(nA)} for s in range(nS)}
def convert_rc_to_s(row, col):
return row*ncol + col
#def inc(row, col, a):
def intended_destination(row, col, a):
if a == LEFT:
col = max(col-1,0)
elif a == DOWN:
row = min(row+1,nrow-1)
elif a == RIGHT:
col = min(col+1,ncol-1)
elif a == UP:
row = max(row-1,0)
return (row, col)
def construct_transition_for_intended(row, col, a, prob, li):
""" this constructs a transition to the "intended_destination(row, col, a)"
and adds it to the transition list (which could be for a different action b).
"""
newrow, newcol = intended_destination(row, col, a)
newstate = convert_rc_to_s(newrow, newcol)
newletter = desc[newrow, newcol]
done = bytes(newletter) in b'G'
rew = REWARD if newletter == b'G' else SLEEP_DEPRIVATION_PENALTY
li.append( (prob, newstate, rew, done) )
#THIS IS WHERE THE MATRIX OF TRANSITION PROBABILITIES IS COMPUTED.
for row in range(nrow):
for col in range(ncol):
# specify transitions for s=(row, col)
s = convert_rc_to_s(row, col)
letter = desc[row, col]
for a in range(4):
# specify transitions for action a
li = P[s][a]
if letter in b'G':
# We are at the goal ('G')....
# This is a strange case:
# - conceptually, we can think of this as:
                        #   always transition to a 'terminated' state where we will get 0 reward.
                        #
                        # - But in gym, in practice, this case should not be happening at all!!!
                        #   Gym will already have returned 'done' when transitioning TO the goal state (not from it).
# So we will never use the transition probabilities *from* the goal state.
# So, from gym's perspective we could specify anything we like here. E.g.,:
# li.append((1.0, 59, 42000000, True))
#
# However, if we want to be able to use the transition matrix to do value iteration, it is important
# that we get 0 reward ever after.
li.append((1.0, s, 0, True))
                    elif letter in b'H':
#We are at a pothole ('H')
#when we are at a pothole, we trip with prob. POTHOLE_PROB
li.append((POTHOLE_PROB, s, BROKEN_LEG_PENALTY, True))
construct_transition_for_intended(row, col, a, 1.0 - POTHOLE_PROB, li)
else:
# We are at normal pavement (.)
# with prob. 0.8 we move as intended:
construct_transition_for_intended(row, col, a, 0.8, li)
# but with prob. 0.1 we move sideways to intended:
for b in [(a-1)%4, (a+1)%4]:
construct_transition_for_intended(row, col, b, 0.1, li)
super(DrunkenWalkEnv, self).__init__(nS, nA, P, isd)
def action_to_string(self, action_index):
s ="{}".format(["Left","Down","Right","Up"][action_index])
return s
def render(self, mode='human'):
outfile = StringIO() if mode == 'ansi' else sys.stdout
row, col = self.s // self.ncol, self.s % self.ncol
desc = self.desc.tolist()
desc = [[c.decode('utf-8') for c in line] for line in desc]
desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
if self.lastaction is not None:
outfile.write(" (last action was '{action}')\n".format( action=self.action_to_string(self.lastaction) ) )
else:
outfile.write("\n")
outfile.write("\n".join(''.join(line) for line in desc)+"\n")
if mode != 'human':
with closing(outfile):
return outfile.getvalue()
if __name__ == "__main__":
# env = DrunkenWalkEnv(map_name="walkInThePark")
env = DrunkenWalkEnv(map_name="theAlley")
n_states = env.observation_space.n
n_actions = env.action_space.n
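    # The lines below are a minimal sketch (illustrative only, not from the original
    # file) of value iteration over the transition dictionary env.P, following the
    # comment in __init__ about using the transition matrix for value iteration.
    # gamma and the convergence threshold theta are assumed values for the example.
    gamma, theta = 0.99, 1e-8
    V = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            # Backup: expected return of the best action in state s.
            q_sa = [sum(p * (r + (0.0 if done else gamma * V[s_next]))
                        for p, s_next, r, done in env.P[s][a])
                    for a in range(n_actions)]
            best = max(q_sa)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            break
    # Greedy policy with respect to the converged value function.
    policy = []
    for s in range(n_states):
        q_sa = [sum(p * (r + (0.0 if done else gamma * V[s_next]))
                    for p, s_next, r, done in env.P[s][a])
                for a in range(n_actions)]
        policy.append(int(np.argmax(q_sa)))
    print("Value of the start state:", V[int(np.argmax(env.isd))])
    print("Greedy policy:", [env.action_to_string(a) for a in policy])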

15
notebooks/envs/track.txt Normal file

@@ -0,0 +1,15 @@
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1