easy-rl/notebooks/envs/racetrack.py

import time
import random
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete,Box
from gym import Env
from matplotlib import colors

class RacetrackEnv(Env) :
    """
    Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111).
    Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking.

    The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have
    included rather verbose comments here  for those of you who are interested in how the environment has been
    implemented (though this should not impact your solution code).ss
    """

    ACTIONS_DICT = {
        0 : (1, -1),  # Acc Vert., Brake Horiz.
        1 : (1, 0),   # Acc Vert., Hold Horiz.
        2 : (1, 1),   # Acc Vert., Acc Horiz.
        3 : (0, -1),  # Hold Vert., Brake Horiz.
        4 : (0, 0),   # Hold Vert., Hold Horiz.
        5 : (0, 1),   # Hold Vert., Acc Horiz.
        6 : (-1, -1), # Brake Vert., Brake Horiz.
        7 : (-1, 0),  # Brake Vert., Hold Horiz.
        8 : (-1, 1)   # Brake Vert., Acc Horiz.
    }


    CELL_TYPES_DICT = {
        0 : "track",
        1 : "wall",
        2 : "start",
        3 : "goal"
    }
    metadata = {'render_modes': ['human'],
     "render_fps": 4,}

    def __init__(self,render_mode = 'human') :
        # Load racetrack map from file.
        self.track = np.flip(np.loadtxt(os.path.dirname(__file__)+"/track.txt", dtype = int), axis = 0)


        # Discover start grid squares.
        self.initial_states = []
        for y in range(self.track.shape[0]) :
            for x in range(self.track.shape[1]) :
                if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
                    self.initial_states.append((y, x))
        high= np.array([np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max])
        self.observation_space = Box(low=-high, high=high, shape=(4,), dtype=np.float32)
        self.action_space = Discrete(9)
        self.is_reset = False

    def step(self, action : int) :
        """
        Takes a given action in the environment's current state, and returns a next state,
        reward, and whether the next state is done or not.

        Arguments:
            action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8].

        Raises:
            RuntimeError: Raised when the environment needs resetting.\n
            TypeError: Raised when an action of an invalid type is given.\n
            ValueError: Raised when an action outside the range [0-8] is given.\n

        Returns:
            A tuple of:\n
                {(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n
                {int} -- The reward earned by taking the given action in the current environment state.\n
                {bool} -- Whether the environment's next state is done or not.\n

        """

        # Check whether a reset is needed.
        if (not self.is_reset) :
            raise RuntimeError(".step() has been called when .reset() is needed.\n" +
                               "You need to call .reset() before using .step() for the first time, and after an episode ends.\n" +
                               ".reset() initialises the environment at the start of an episode, then returns an initial state.")

        # Check that action is the correct type (either a python integer or a numpy integer).
        if (not (isinstance(action, int) or isinstance(action, np.integer))) :
            raise TypeError("action should be an integer.\n" +
                            "action value {} of type {} was supplied.".format(action, type(action)))

        # Check that action is an allowed value.
        if (action < 0 or action > 8) :
            raise ValueError("action must be an integer in the range [0-8] corresponding to one of the legal actions.\n" +
                             "action value {} was supplied.".format(action))


        # Update Velocity.
        # With probability, 0.85 update velocity components as intended.
        if (np.random.uniform() < 0.8) :
            (d_y, d_x) = self.ACTIONS_DICT[action]
        # With probability, 0.15 Do not change velocity components.
        else :
            (d_y, d_x) = (0, 0)

        self.velocity = (self.velocity[0] + d_y, self.velocity[1] + d_x)

		# Keep velocity within bounds (-10, 10).
        if (self.velocity[0] > 10) :
            self.velocity[0] = 10
        elif (self.velocity[0] < -10) :
            self.velocity[0] = -10
        if (self.velocity[1] > 10) :
            self.velocity[1] = 10
        elif (self.velocity[1] < -10) :
            self.velocity[1] = -10

        # Update Position.
        new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1])

        reward = 0
        done = False

        # If position is out-of-bounds, return to start and set velocity components to zero.
        if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) :
            self.position = random.choice(self.initial_states)
            self.velocity = (0, 0)
            reward -= 10
        # If position is in a wall grid-square, return to start and set velocity components to zero.
        elif (self.CELL_TYPES_DICT[self.track[new_position]] == "wall") :
            self.position = random.choice(self.initial_states)
            self.velocity = (0, 0)
            reward -= 10
        # If position is in a track grid-squre or a start-square, update position.
        elif (self.CELL_TYPES_DICT[self.track[new_position]] in ["track", "start"]) :
            self.position = new_position
        # If position is in a goal grid-square, end episode.
        elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") :
            self.position = new_position
            reward += 10
            done = True
        # If this gets reached, then the student has touched something they shouldn't have. Naughty!
        else :
            raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!")

        # Penalise every timestep.
        reward -= 1

        # Require a reset if the current state is done.
        if (done) :
            self.is_reset = False

        # Return next state, reward, and whether the episode has ended.
        return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]]), reward, done,{}


    def reset(self,seed=None) :
        """
        Resets the environment, ready for a new episode to begin, then returns an initial state.
        The initial state will be a starting grid square randomly chosen using a uniform distribution,
        with both components of the velocity being zero.

        Returns:
            {(int, int, int, int)} -- an initial state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).
        """

        # Pick random starting grid-square.
        self.position = random.choice(self.initial_states)

        # Set both velocity components to zero.
        self.velocity = (0, 0)

        self.is_reset = True

        return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]])


    def render(self, render_mode = 'human') :
        """
        Renders a pretty matplotlib plot representing the current state of the environment.
        Calling this method on subsequent timesteps will update the plot.
        This is VERY VERY SLOW and wil slow down training a lot. Only use for debugging/testing.

        Arguments:
            sleep_time {float} -- How many seconds (or partial seconds) you want to wait on this rendered frame.

        """
        # Turn interactive render_mode on.
        plt.ion()
        fig = plt.figure(num = "env_render")
        ax = plt.gca()
        ax.clear()
        clear_output(wait = True)

        # Prepare the environment plot and mark the car's position.
        env_plot = np.copy(self.track)
        env_plot[self.position] = 4
        env_plot = np.flip(env_plot, axis = 0)

        # Plot the gridworld.
        cmap = colors.ListedColormap(["white", "black", "green", "red", "yellow"])
        bounds = list(range(6))
        norm = colors.BoundaryNorm(bounds, cmap.N)
        ax.imshow(env_plot, cmap = cmap, norm = norm, zorder = 0)

        # Plot the velocity.
        if (not self.velocity == (0, 0)) :
            ax.arrow(self.position[1], self.track.shape[0] - 1 - self.position[0], self.velocity[1], -self.velocity[0],
                     path_effects=[pe.Stroke(linewidth=1, foreground='black')], color = "yellow", width = 0.1, length_includes_head = True, zorder = 2)

        # Set up axes.
        ax.grid(which = 'major', axis = 'both', linestyle = '-', color = 'k', linewidth = 2, zorder = 1)
        ax.set_xticks(np.arange(-0.5, self.track.shape[1] , 1));
        ax.set_xticklabels([])
        ax.set_yticks(np.arange(-0.5, self.track.shape[0], 1));
        ax.set_yticklabels([])

        # Draw everything.
        #fig.canvas.draw()
        #fig.canvas.flush_events()
        plt.show()
        # time sleep
        time.sleep(0.1)

    def get_actions(self) :
        """
        Returns the available actions in the current state - will always be a list
        of integers in the range [0-8].
        """
        return [*self.ACTIONS_DICT]
if __name__ == "__main__":
    num_steps = 1000000
    env = RacetrackEnv()
    state = env.reset()
    print(state)
    for _ in range(num_steps) :

        next_state, reward, done,_ = env.step(random.choice(env.get_actions()))
        print(next_state)
        env.render()

        if (done) :
            _ = env.reset()