Update the companion code for 蘑菇书 (the Mushroom Book)

This commit is contained in:
johnjim0816
2022-12-04 20:54:36 +08:00
parent f030fe283d
commit dc8d13a13e
23 changed files with 10784 additions and 0 deletions

243
notebooks/envs/racetrack.py Normal file

@@ -0,0 +1,243 @@
import time
import random
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from IPython.display import clear_output
from gym.spaces import Discrete,Box
from gym import Env
from matplotlib import colors
class RacetrackEnv(Env) :
"""
Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111).
Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking.
The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have
included rather verbose comments here for those of you who are interested in how the environment has been
    implemented (though this should not impact your solution code).
"""
ACTIONS_DICT = {
0 : (1, -1), # Acc Vert., Brake Horiz.
1 : (1, 0), # Acc Vert., Hold Horiz.
2 : (1, 1), # Acc Vert., Acc Horiz.
3 : (0, -1), # Hold Vert., Brake Horiz.
4 : (0, 0), # Hold Vert., Hold Horiz.
5 : (0, 1), # Hold Vert., Acc Horiz.
6 : (-1, -1), # Brake Vert., Brake Horiz.
7 : (-1, 0), # Brake Vert., Hold Horiz.
8 : (-1, 1) # Brake Vert., Acc Horiz.
}
CELL_TYPES_DICT = {
0 : "track",
1 : "wall",
2 : "start",
3 : "goal"
}
metadata = {'render_modes': ['human'],
"render_fps": 4,}
def __init__(self,render_mode = 'human') :
# Load racetrack map from file.
self.track = np.flip(np.loadtxt(os.path.dirname(__file__)+"/track.txt", dtype = int), axis = 0)
# Discover start grid squares.
self.initial_states = []
for y in range(self.track.shape[0]) :
for x in range(self.track.shape[1]) :
if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") :
self.initial_states.append((y, x))
high= np.array([np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max, np.finfo(np.float32).max])
self.observation_space = Box(low=-high, high=high, shape=(4,), dtype=np.float32)
self.action_space = Discrete(9)
self.is_reset = False
def step(self, action : int) :
"""
Takes a given action in the environment's current state, and returns a next state,
reward, and whether the next state is done or not.
Arguments:
action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8].
Raises:
RuntimeError: Raised when the environment needs resetting.\n
TypeError: Raised when an action of an invalid type is given.\n
ValueError: Raised when an action outside the range [0-8] is given.\n
Returns:
A tuple of:\n
{(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n
{int} -- The reward earned by taking the given action in the current environment state.\n
{bool} -- Whether the environment's next state is done or not.\n
"""
# Check whether a reset is needed.
if (not self.is_reset) :
raise RuntimeError(".step() has been called when .reset() is needed.\n" +
"You need to call .reset() before using .step() for the first time, and after an episode ends.\n" +
".reset() initialises the environment at the start of an episode, then returns an initial state.")
# Check that action is the correct type (either a python integer or a numpy integer).
if (not (isinstance(action, int) or isinstance(action, np.integer))) :
raise TypeError("action should be an integer.\n" +
"action value {} of type {} was supplied.".format(action, type(action)))
# Check that action is an allowed value.
if (action < 0 or action > 8) :
raise ValueError("action must be an integer in the range [0-8] corresponding to one of the legal actions.\n" +
"action value {} was supplied.".format(action))
# Update Velocity.
        # With probability 0.8, update the velocity components as intended.
        if (np.random.uniform() < 0.8) :
            (d_y, d_x) = self.ACTIONS_DICT[action]
        # With probability 0.2, do not change the velocity components.
        else :
            (d_y, d_x) = (0, 0)
        self.velocity = (self.velocity[0] + d_y, self.velocity[1] + d_x)
        # Keep each velocity component within the bounds [-10, 10].
        # self.velocity is a tuple, so it is rebuilt rather than assigned to in place.
        self.velocity = (max(-10, min(10, self.velocity[0])),
                         max(-10, min(10, self.velocity[1])))
# Update Position.
new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1])
reward = 0
done = False
# If position is out-of-bounds, return to start and set velocity components to zero.
if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
# If position is in a wall grid-square, return to start and set velocity components to zero.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "wall") :
self.position = random.choice(self.initial_states)
self.velocity = (0, 0)
reward -= 10
        # If position is in a track grid-square or a start grid-square, update position.
elif (self.CELL_TYPES_DICT[self.track[new_position]] in ["track", "start"]) :
self.position = new_position
# If position is in a goal grid-square, end episode.
elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") :
self.position = new_position
reward += 10
done = True
# If this gets reached, then the student has touched something they shouldn't have. Naughty!
else :
raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!")
# Penalise every timestep.
reward -= 1
# Require a reset if the current state is done.
if (done) :
self.is_reset = False
# Return next state, reward, and whether the episode has ended.
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]]), reward, done,{}
def reset(self,seed=None) :
"""
Resets the environment, ready for a new episode to begin, then returns an initial state.
The initial state will be a starting grid square randomly chosen using a uniform distribution,
with both components of the velocity being zero.
Returns:
{(int, int, int, int)} -- an initial state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).
"""
# Pick random starting grid-square.
self.position = random.choice(self.initial_states)
# Set both velocity components to zero.
self.velocity = (0, 0)
self.is_reset = True
return np.array([self.position[0], self.position[1], self.velocity[0], self.velocity[1]])
def render(self, render_mode = 'human') :
"""
Renders a pretty matplotlib plot representing the current state of the environment.
Calling this method on subsequent timesteps will update the plot.
        This is VERY VERY SLOW and will slow down training a lot. Only use for debugging/testing.
        Arguments:
            render_mode {str} -- The render mode; only 'human' is supported. Each call pauses on the rendered frame for 0.1 seconds.
"""
# Turn interactive render_mode on.
plt.ion()
fig = plt.figure(num = "env_render")
ax = plt.gca()
ax.clear()
clear_output(wait = True)
# Prepare the environment plot and mark the car's position.
env_plot = np.copy(self.track)
env_plot[self.position] = 4
env_plot = np.flip(env_plot, axis = 0)
# Plot the gridworld.
cmap = colors.ListedColormap(["white", "black", "green", "red", "yellow"])
bounds = list(range(6))
norm = colors.BoundaryNorm(bounds, cmap.N)
ax.imshow(env_plot, cmap = cmap, norm = norm, zorder = 0)
# Plot the velocity.
if (not self.velocity == (0, 0)) :
ax.arrow(self.position[1], self.track.shape[0] - 1 - self.position[0], self.velocity[1], -self.velocity[0],
path_effects=[pe.Stroke(linewidth=1, foreground='black')], color = "yellow", width = 0.1, length_includes_head = True, zorder = 2)
# Set up axes.
ax.grid(which = 'major', axis = 'both', linestyle = '-', color = 'k', linewidth = 2, zorder = 1)
ax.set_xticks(np.arange(-0.5, self.track.shape[1] , 1));
ax.set_xticklabels([])
ax.set_yticks(np.arange(-0.5, self.track.shape[0], 1));
ax.set_yticklabels([])
# Draw everything.
#fig.canvas.draw()
#fig.canvas.flush_events()
plt.show()
# time sleep
time.sleep(0.1)
def get_actions(self) :
"""
Returns the available actions in the current state - will always be a list
of integers in the range [0-8].
"""
return [*self.ACTIONS_DICT]
if __name__ == "__main__":
num_steps = 1000000
env = RacetrackEnv()
state = env.reset()
print(state)
for _ in range(num_steps) :
next_state, reward, done,_ = env.step(random.choice(env.get_actions()))
print(next_state)
env.render()
if (done) :
_ = env.reset()
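# ---------------------------------------------------------------------------
# A minimal sketch (illustrative only, not part of the original coursework code):
# tabular epsilon-greedy Q-learning on RacetrackEnv. Its main point is that the
# np.array state returned by reset()/step() has to be converted to a hashable
# tuple before it can index a Q-table. The hyper-parameters n_episodes, alpha,
# gamma and epsilon below are assumed values chosen for the example.
from collections import defaultdict

def q_learning_demo(n_episodes=50, alpha=0.1, gamma=0.99, epsilon=0.1):
    env = RacetrackEnv()
    n_actions = len(env.get_actions())
    Q = defaultdict(lambda: np.zeros(n_actions))
    for _ in range(n_episodes):
        state = tuple(env.reset())
        done = False
        while not done:
            # Epsilon-greedy action selection over the tabular Q-values.
            if random.random() < epsilon:
                action = random.choice(env.get_actions())
            else:
                action = int(np.argmax(Q[state]))
            next_state, reward, done, _ = env.step(action)
            next_state = tuple(next_state)
            # One-step Q-learning update; bootstrap only from non-terminal states.
            target = reward + (0.0 if done else gamma * np.max(Q[next_state]))
            Q[state][action] += alpha * (target - Q[state][action])
            state = next_state
    return Q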

303
notebooks/envs/simple_grid.py Normal file

@@ -0,0 +1,303 @@
#!/usr/bin/env python
# simple_grid.py
# based on frozen_lake.py
# adapted by Frans Oliehoek.
#
import sys
from contextlib import closing
import numpy as np
from io import StringIO
#from six import StringIO, b
import gym
from gym import utils
from gym import Env, spaces
from gym.utils import seeding
def categorical_sample(prob_n, np_random):
"""
Sample from categorical distribution
Each row specifies class probabilities
"""
prob_n = np.asarray(prob_n)
csprob_n = np.cumsum(prob_n)
return (csprob_n > np_random.rand()).argmax()
class DiscreteEnv(Env):
"""
Has the following members
- nS: number of states
- nA: number of actions
- P: transitions (*)
- isd: initial state distribution (**)
(*) dictionary of lists, where
P[s][a] == [(probability, nextstate, reward, done), ...]
(**) list or array of length nS
"""
def __init__(self, nS, nA, P, isd):
self.P = P
self.isd = isd
self.lastaction = None # for rendering
self.nS = nS
self.nA = nA
self.action_space = spaces.Discrete(self.nA)
self.observation_space = spaces.Discrete(self.nS)
self.seed()
self.s = categorical_sample(self.isd, self.np_random)
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def reset(self):
self.s = categorical_sample(self.isd, self.np_random)
self.lastaction = None
return int(self.s)
def step(self, a):
transitions = self.P[self.s][a]
i = categorical_sample([t[0] for t in transitions], self.np_random)
p, s, r, d = transitions[i]
self.s = s
self.lastaction = a
return (int(s), r, d, {"prob": p})
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
MAPS = {
"theAlley": [
"S...H...H...G"
],
"walkInThePark": [
"S.......",
".....H..",
"........",
"......H.",
"........",
"...H...G"
],
"1Dtest": [
],
"4x4": [
"S...",
".H.H",
"...H",
"H..G"
],
"8x8": [
"S.......",
"........",
"...H....",
".....H..",
"...H....",
".HH...H.",
".H..H.H.",
"...H...G"
],
}
POTHOLE_PROB = 0.2
BROKEN_LEG_PENALTY = -5
SLEEP_DEPRIVATION_PENALTY = -0.0
REWARD = 10
def generate_random_map(size=8, p=0.8):
"""Generates a random valid map (one that has a path from start to goal)
:param size: size of each side of the grid
:param p: probability that a tile is frozen
"""
valid = False
# DFS to check that it's a valid path.
def is_valid(res):
frontier, discovered = [], set()
frontier.append((0,0))
while frontier:
r, c = frontier.pop()
if not (r,c) in discovered:
discovered.add((r,c))
directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
for x, y in directions:
r_new = r + x
c_new = c + y
if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
continue
if res[r_new][c_new] == 'G':
return True
if (res[r_new][c_new] not in '#H'):
frontier.append((r_new, c_new))
return False
while not valid:
p = min(1, p)
res = np.random.choice(['.', 'H'], (size, size), p=[p, 1-p])
res[0][0] = 'S'
res[-1][-1] = 'G'
valid = is_valid(res)
return ["".join(x) for x in res]
class DrunkenWalkEnv(DiscreteEnv):
"""
A simple grid environment, completely based on the code of 'FrozenLake', credits to
the original authors.
You're finding your way home (G) after a great party which was happening at (S).
    Unfortunately, due to recreational intoxication you find yourself moving in the
    intended direction only 80% of the time, and perpendicular to it the other 20%.
    To make matters worse, the local community has been cutting the budget for pavement
    maintenance, which means that the way home is full of potholes, which are very likely
to make you trip. If you fall, you are obviously magically transported back to the party,
without getting some of that hard-earned sleep.
S...
.H.H
...H
H..G
S : starting point
. : normal pavement
H : pothole, you have a POTHOLE_PROB chance of tripping
G : goal, time for bed
The episode ends when you reach the goal or trip.
    You receive a reward of +10 if you reach the goal, a BROKEN_LEG_PENALTY if you trip
    in a pothole, and a SLEEP_DEPRIVATION_PENALTY for every other step.
"""
metadata = {'render.modes': ['human', 'ansi']}
def __init__(self, desc=None, map_name="4x4",is_slippery=True):
""" This generates a map and sets all transition probabilities.
(by passing constructed nS, nA, P, isd to DiscreteEnv)
"""
if desc is None and map_name is None:
desc = generate_random_map()
elif desc is None:
desc = MAPS[map_name]
self.desc = desc = np.asarray(desc,dtype='c')
self.nrow, self.ncol = nrow, ncol = desc.shape
self.reward_range = (0, 1)
nA = 4
nS = nrow * ncol
isd = np.array(desc == b'S').astype('float64').ravel()
isd /= isd.sum()
# We need to pass 'P' to DiscreteEnv:
# P dictionary dict of dicts of lists, where
# P[s][a] == [(probability, nextstate, reward, done), ...]
P = {s : {a : [] for a in range(nA)} for s in range(nS)}
def convert_rc_to_s(row, col):
return row*ncol + col
#def inc(row, col, a):
def intended_destination(row, col, a):
if a == LEFT:
col = max(col-1,0)
elif a == DOWN:
row = min(row+1,nrow-1)
elif a == RIGHT:
col = min(col+1,ncol-1)
elif a == UP:
row = max(row-1,0)
return (row, col)
def construct_transition_for_intended(row, col, a, prob, li):
""" this constructs a transition to the "intended_destination(row, col, a)"
and adds it to the transition list (which could be for a different action b).
"""
newrow, newcol = intended_destination(row, col, a)
newstate = convert_rc_to_s(newrow, newcol)
newletter = desc[newrow, newcol]
done = bytes(newletter) in b'G'
rew = REWARD if newletter == b'G' else SLEEP_DEPRIVATION_PENALTY
li.append( (prob, newstate, rew, done) )
#THIS IS WHERE THE MATRIX OF TRANSITION PROBABILITIES IS COMPUTED.
for row in range(nrow):
for col in range(ncol):
# specify transitions for s=(row, col)
s = convert_rc_to_s(row, col)
letter = desc[row, col]
for a in range(4):
# specify transitions for action a
li = P[s][a]
if letter in b'G':
# We are at the goal ('G')....
# This is a strange case:
# - conceptually, we can think of this as:
                        #   always transition to a 'terminated' state where we will get 0 reward.
                        #
                        # - But in gym, in practice, this case should not be happening at all!!!
                        #   Gym will already have returned 'done' when transitioning TO the goal state (not from it).
# So we will never use the transition probabilities *from* the goal state.
# So, from gym's perspective we could specify anything we like here. E.g.,:
# li.append((1.0, 59, 42000000, True))
#
# However, if we want to be able to use the transition matrix to do value iteration, it is important
# that we get 0 reward ever after.
li.append((1.0, s, 0, True))
                    elif letter in b'H':
#We are at a pothole ('H')
#when we are at a pothole, we trip with prob. POTHOLE_PROB
li.append((POTHOLE_PROB, s, BROKEN_LEG_PENALTY, True))
construct_transition_for_intended(row, col, a, 1.0 - POTHOLE_PROB, li)
else:
# We are at normal pavement (.)
# with prob. 0.8 we move as intended:
construct_transition_for_intended(row, col, a, 0.8, li)
# but with prob. 0.1 we move sideways to intended:
for b in [(a-1)%4, (a+1)%4]:
construct_transition_for_intended(row, col, b, 0.1, li)
super(DrunkenWalkEnv, self).__init__(nS, nA, P, isd)
def action_to_string(self, action_index):
s ="{}".format(["Left","Down","Right","Up"][action_index])
return s
def render(self, mode='human'):
outfile = StringIO() if mode == 'ansi' else sys.stdout
row, col = self.s // self.ncol, self.s % self.ncol
desc = self.desc.tolist()
desc = [[c.decode('utf-8') for c in line] for line in desc]
desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)
if self.lastaction is not None:
outfile.write(" (last action was '{action}')\n".format( action=self.action_to_string(self.lastaction) ) )
else:
outfile.write("\n")
outfile.write("\n".join(''.join(line) for line in desc)+"\n")
if mode != 'human':
with closing(outfile):
return outfile.getvalue()
if __name__ == "__main__":
# env = DrunkenWalkEnv(map_name="walkInThePark")
env = DrunkenWalkEnv(map_name="theAlley")
n_states = env.observation_space.n
n_actions = env.action_space.n
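    # The lines below are a minimal sketch (illustrative only, not from the original
    # file) of value iteration over the transition dictionary env.P, following the
    # comment in __init__ about using the transition matrix for value iteration.
    # gamma and the convergence threshold theta are assumed values for the example.
    gamma, theta = 0.99, 1e-8
    V = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            # Backup: expected return of the best action in state s.
            q_sa = [sum(p * (r + (0.0 if done else gamma * V[s_next]))
                        for p, s_next, r, done in env.P[s][a])
                    for a in range(n_actions)]
            best = max(q_sa)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            break
    # Greedy policy with respect to the converged value function.
    policy = []
    for s in range(n_states):
        q_sa = [sum(p * (r + (0.0 if done else gamma * V[s_next]))
                    for p, s_next, r, done in env.P[s][a])
                for a in range(n_actions)]
        policy.append(int(np.argmax(q_sa)))
    print("Value of the start state:", V[int(np.argmax(env.isd))])
    print("Greedy policy:", [env.action_to_string(a) for a in policy])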

15
notebooks/envs/track.txt Normal file

@@ -0,0 +1,15 @@
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1
1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1