update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
--- a/codes/envs/blackjack.py
+++ b/codes/envs/blackjack.py
@@ -77,7 +77,7 @@ class BlackjackEnv(gym.Env):
        self.natural = natural
        # Start the first game
        self._reset()        # Number of 
-        self.action_dim = 2
+        self.n_actions = 2

    def reset(self):
        return self._reset()
--- a/codes/envs/cliff_walking.py
+++ b/codes/envs/cliff_walking.py
@@ -31,7 +31,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
        self.shape = (4, 12)

        nS = np.prod(self.shape)
-        action_dim = 4
+        n_actions = 4

        # Cliff Location
        self._cliff = np.zeros(self.shape, dtype=np.bool)
@@ -41,7 +41,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
        P = {}
        for s in range(nS):
            position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(action_dim) }
+            P[s] = { a : [] for a in range(n_actions) }
            P[s][UP] = self._calculate_transition_prob(position, [-1, 0])
            P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1])
            P[s][DOWN] = self._calculate_transition_prob(position, [1, 0])
@@ -51,7 +51,7 @@ class CliffWalkingEnv(discrete.DiscreteEnv):
        isd = np.zeros(nS)
        isd[np.ravel_multi_index((3,0), self.shape)] = 1.0

-        super(CliffWalkingEnv, self).__init__(nS, action_dim, P, isd)
+        super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd)

    def render(self, mode='human', close=False):
        self._render(mode, close)
--- a/codes/envs/gridworld.py
+++ b/codes/envs/gridworld.py
@@ -37,7 +37,7 @@ class GridworldEnv(discrete.DiscreteEnv):
        self.shape = shape

        nS = np.prod(shape)
-        action_dim = 4
+        n_actions = 4

        MAX_Y = shape[0]
        MAX_X = shape[1]
@@ -51,7 +51,7 @@ class GridworldEnv(discrete.DiscreteEnv):
            y, x = it.multi_index

            # P[s][a] = (prob, next_state, reward, is_done)
-            P[s] = {a : [] for a in range(action_dim)}
+            P[s] = {a : [] for a in range(n_actions)}

            is_done = lambda s: s == 0 or s == (nS - 1)
            reward = 0.0 if is_done(s) else -1.0
@@ -82,7 +82,7 @@ class GridworldEnv(discrete.DiscreteEnv):
        # This should not be used in any model-free learning algorithm
        self.P = P

-        super(GridworldEnv, self).__init__(nS, action_dim, P, isd)
+        super(GridworldEnv, self).__init__(nS, n_actions, P, isd)

    def _render(self, mode='human', close=False):
        """ Renders the current gridworld layout
--- a/codes/envs/gridworld_env.py
+++ b/codes/envs/gridworld_env.py
@@ -1,195 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# -*- coding: utf-8 -*-
-
-import gym
-import turtle
-import numpy as np
-
-# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
-
-
-def GridWorld(gridmap=None, is_slippery=False):
-    if gridmap is None:
-        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
-    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
-    env = FrozenLakeWapper(env)
-    return env
-
-
-class FrozenLakeWapper(gym.Wrapper):
-    def __init__(self, env):
-        gym.Wrapper.__init__(self, env)
-        self.max_y = env.desc.shape[0]
-        self.max_x = env.desc.shape[1]
-        self.t = None
-        self.unit = 50
-
-    def draw_box(self, x, y, fillcolor='', line_color='gray'):
-        self.t.up()
-        self.t.goto(x * self.unit, y * self.unit)
-        self.t.color(line_color)
-        self.t.fillcolor(fillcolor)
-        self.t.setheading(90)
-        self.t.down()
-        self.t.begin_fill()
-        for _ in range(4):
-            self.t.forward(self.unit)
-            self.t.right(90)
-        self.t.end_fill()
-
-    def move_player(self, x, y):
-        self.t.up()
-        self.t.setheading(90)
-        self.t.fillcolor('red')
-        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
-
-    def render(self):
-        if self.t == None:
-            self.t = turtle.Turtle()
-            self.wn = turtle.Screen()
-            self.wn.setup(self.unit * self.max_x + 100,
-                          self.unit * self.max_y + 100)
-            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
-                                        self.unit * self.max_y)
-            self.t.shape('circle')
-            self.t.width(2)
-            self.t.speed(0)
-            self.t.color('gray')
-            for i in range(self.desc.shape[0]):
-                for j in range(self.desc.shape[1]):
-                    x = j
-                    y = self.max_y - 1 - i
-                    if self.desc[i][j] == b'S':  # Start
-                        self.draw_box(x, y, 'white')
-                    elif self.desc[i][j] == b'F':  # Frozen ice
-                        self.draw_box(x, y, 'white')
-                    elif self.desc[i][j] == b'G':  # Goal
-                        self.draw_box(x, y, 'yellow')
-                    elif self.desc[i][j] == b'H':  # Hole
-                        self.draw_box(x, y, 'black')
-                    else:
-                        self.draw_box(x, y, 'white')
-            self.t.shape('turtle')
-
-        x_pos = self.s % self.max_x
-        y_pos = self.max_y - 1 - int(self.s / self.max_x)
-        self.move_player(x_pos, y_pos)
-
-
-class CliffWalkingWapper(gym.Wrapper):
-    def __init__(self, env):
-        gym.Wrapper.__init__(self, env)
-        self.t = None
-        self.unit = 50
-        self.max_x = 12
-        self.max_y = 4
-
-    def draw_x_line(self, y, x0, x1, color='gray'):
-        assert x1 > x0
-        self.t.color(color)
-        self.t.setheading(0)
-        self.t.up()
-        self.t.goto(x0, y)
-        self.t.down()
-        self.t.forward(x1 - x0)
-
-    def draw_y_line(self, x, y0, y1, color='gray'):
-        assert y1 > y0
-        self.t.color(color)
-        self.t.setheading(90)
-        self.t.up()
-        self.t.goto(x, y0)
-        self.t.down()
-        self.t.forward(y1 - y0)
-
-    def draw_box(self, x, y, fillcolor='', line_color='gray'):
-        self.t.up()
-        self.t.goto(x * self.unit, y * self.unit)
-        self.t.color(line_color)
-        self.t.fillcolor(fillcolor)
-        self.t.setheading(90)
-        self.t.down()
-        self.t.begin_fill()
-        for i in range(4):
-            self.t.forward(self.unit)
-            self.t.right(90)
-        self.t.end_fill()
-
-    def move_player(self, x, y):
-        self.t.up()
-        self.t.setheading(90)
-        self.t.fillcolor('red')
-        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
-
-    def render(self):
-        if self.t == None:
-            self.t = turtle.Turtle()
-            self.wn = turtle.Screen()
-            self.wn.setup(self.unit * self.max_x + 100,
-                          self.unit * self.max_y + 100)
-            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
-                                        self.unit * self.max_y)
-            self.t.shape('circle')
-            self.t.width(2)
-            self.t.speed(0)
-            self.t.color('gray')
-            for _ in range(2):
-                self.t.forward(self.max_x * self.unit)
-                self.t.left(90)
-                self.t.forward(self.max_y * self.unit)
-                self.t.left(90)
-            for i in range(1, self.max_y):
-                self.draw_x_line(
-                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
-            for i in range(1, self.max_x):
-                self.draw_y_line(
-                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)
-
-            for i in range(1, self.max_x - 1):
-                self.draw_box(i, 0, 'black')
-            self.draw_box(self.max_x - 1, 0, 'yellow')
-            self.t.shape('turtle')
-
-        x_pos = self.s % self.max_x
-        y_pos = self.max_y - 1 - int(self.s / self.max_x)
-        self.move_player(x_pos, y_pos)
-
-
-if __name__ == '__main__':
-    # 环境1：FrozenLake, 可以配置冰面是否是滑的
-    # 0 left, 1 down, 2 right, 3 up
-    env = gym.make("FrozenLake-v0", is_slippery=False)
-    env = FrozenLakeWapper(env)
-
-    # 环境2：CliffWalking, 悬崖环境
-    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
-    # env = CliffWalkingWapper(env)
-
-    # 环境3：自定义格子世界，可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal
-    # gridmap = [
-    #         'SFFF',
-    #         'FHFF',
-    #         'FFFF',
-    #         'HFGF' ]
-    # env = GridWorld(gridmap)
-
-    env.reset()
-    for step in range(10):
-        action = np.random.randint(0, 4)
-        obs, reward, done, info = env.step(action)
-        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
-                step, action, obs, reward, done, info))
-        # env.render() # 渲染一帧图像
--- a/codes/envs/snake/checkpoint.npy
+++ b/codes/envs/snake/checkpoint.npy
--- a/codes/envs/snake/checkpoint1.npy
+++ b/codes/envs/snake/checkpoint1.npy
--- a/codes/envs/snake/checkpoint2.npy
+++ b/codes/envs/snake/checkpoint2.npy
--- a/codes/envs/snake/checkpoint3.npy
+++ b/codes/envs/snake/checkpoint3.npy
--- a/codes/envs/snake/q_agent.npy
+++ b/codes/envs/snake/q_agent.npy
--- a/codes/envs/stochastic_mdp.py
+++ b/codes/envs/stochastic_mdp.py
@@ -17,31 +17,31 @@ class StochasticMDP:
    def __init__(self):
        self.end = False
        self.curr_state = 2
-        self.action_dim = 2
-        self.state_dim = 6
+        self.n_actions = 2
+        self.n_states = 6
        self.p_right = 0.5

    def reset(self):
        self.end = False
        self.curr_state = 2
-        state = np.zeros(self.state_dim)
+        state = np.zeros(self.n_states)
        state[self.curr_state - 1] = 1.
        return state

    def step(self, action):
        if self.curr_state != 1:
            if action == 1:
-                if random.random() < self.p_right and self.curr_state < self.state_dim:
+                if random.random() < self.p_right and self.curr_state < self.n_states:
                    self.curr_state += 1
                else:
                    self.curr_state -= 1

            if action == 0:
                self.curr_state -= 1
-        if self.curr_state == self.state_dim:
+        if self.curr_state == self.n_states:
            self.end = True

-        state = np.zeros(self.state_dim)
+        state = np.zeros(self.n_states)
        state[self.curr_state - 1] = 1.

        if self.curr_state == 1:
--- a/codes/envs/windy_gridworld.py
+++ b/codes/envs/windy_gridworld.py
@@ -30,7 +30,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
        self.shape = (7, 10)

        nS = np.prod(self.shape)
-        action_dim = 4
+        n_actions = 4

        # Wind strength
        winds = np.zeros(self.shape)
@@ -41,7 +41,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
        P = {}
        for s in range(nS):
            position = np.unravel_index(s, self.shape)
-            P[s] = { a : [] for a in range(action_dim) }
+            P[s] = { a : [] for a in range(n_actions) }
            P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds)
            P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds)
            P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds)
@@ -51,7 +51,7 @@ class WindyGridworldEnv(discrete.DiscreteEnv):
        isd = np.zeros(nS)
        isd[np.ravel_multi_index((3,0), self.shape)] = 1.0

-        super(WindyGridworldEnv, self).__init__(nS, action_dim, P, isd)
+        super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd)

    def render(self, mode='human', close=False):
        self._render(mode, close)