update

2021-03-23 17:05:29 +08:00
parent bf0f2990cf
commit 5d8bf4802a
15 changed files with 117 additions and 74 deletions
--- a/codes/MonteCarlo/README.md
+++ b/codes/MonteCarlo/README.md
@@ -1,11 +1,5 @@
 # *On-Policy First-Visit MC Control*

-## 环境说明
-
-见[环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)中的The Racetrack
-
-## First-Visit MC 介绍
-
 ### 伪代码

 ![mc_control_algo](assets/mc_control_algo.png)
--- a/codes/PolicyGradient/agent.py
+++ b/codes/PolicyGradient/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:27:44
 LastEditor: John
-LastEditTime: 2021-03-13 11:50:16
+LastEditTime: 2021-03-23 16:37:14
 Discription: 
 Environment: 
 '''
@@ -13,14 +13,13 @@ import torch
 from torch.distributions import Bernoulli
 from torch.autograd import Variable
 import numpy as np
-
-from common.model import MLP1
+from PolicyGradient.model import MLP

 class PolicyGradient:
    
    def __init__(self, n_states,cfg):
        self.gamma = cfg.gamma
-        self.policy_net = MLP1(n_states,hidden_dim=cfg.hidden_dim)
+        self.policy_net = MLP(n_states,hidden_dim=cfg.hidden_dim)
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr)
        self.batch_size = cfg.batch_size

@@ -66,6 +65,6 @@ class PolicyGradient:
            loss.backward()
        self.optimizer.step()
    def save_model(self,path):
-        torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pth')
+        torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pt')
    def load_model(self,path):
-        self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pth')) 
+        self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pt')) 
--- a/codes/PolicyGradient/main.py
+++ b/codes/PolicyGradient/main.py
@@ -5,12 +5,12 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:21:53
 LastEditor: John
-LastEditTime: 2021-03-13 11:50:32
+LastEditTime: 2021-03-23 16:38:54
 Discription: 
 Environment: 
 '''
 import sys,os
-sys.path.append(os.getcwd()) # 添加当前终端路径
+sys.path.append(os.getcwd()) # add the current working directory to sys.path
 from itertools import count
 import datetime
 import gym
@@ -18,25 +18,25 @@ from PolicyGradient.agent import PolicyGradient
 from common.plot import plot_rewards
 from common.utils import save_results

-SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'  # path to save model
+if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): 
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
-if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹
+if not os.path.exists(SAVED_MODEL_PATH):
    os.mkdir(SAVED_MODEL_PATH)
-RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径
-if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # path to save rewards
+if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): 
    os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
-if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹
+if not os.path.exists(RESULT_PATH): 
    os.mkdir(RESULT_PATH)

 class PGConfig:
    def __init__(self):
        self.train_eps = 300 # 训练的episode数目
        self.batch_size = 8
-        self.lr = 0.01 # 学习率
+        self.lr = 0.01 # learning rate
        self.gamma = 0.99
-        self.hidden_dim = 36 # 隐藏层维度
+        self.hidden_dim = 36 # dimension of hidden layer
        
 def train(cfg,env,agent):
    '''下面带pool都是存放的transition序列用于gradient'''
--- a/codes/PolicyGradient/model.py
+++ b/codes/PolicyGradient/model.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2021-03-23 16:35:58
+LastEditor: John
+LastEditTime: 2021-03-23 16:36:20
+Discription: 
+Environment: 
+'''
+import torch.nn as nn
+import torch.nn.functional as F
+class MLP(nn.Module):
+    ''' Multi-layer perceptron
+        Input: state dimension
+        Output: probability
+    '''
+    def __init__(self,n_states,hidden_dim = 36):
+        super(MLP, self).__init__()
+        # hidden_dim is the hidden layer width (e.g. 24 or 36), not a layer count; adjust it according to state_dim and n_actions
+        self.fc1 = nn.Linear(n_states, hidden_dim)
+        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
+        self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = F.sigmoid(self.fc3(x))
+        return x
--- a/codes/QLearning/README.md
+++ b/codes/QLearning/README.md
@@ -1,19 +0,0 @@
-## CliffWalking-v0环境简介
-
-悬崖寻路问题（CliffWalking）是指在一个4 x 12的网格中，智能体以网格的左下角位置为起点，以网格的下角位置为终点，目标是移动智能体到达终点位置，智能体每次可以在上、下、左、右这4个方向中移动一步，每移动一步会得到-1单位的奖励。
-
-<img src="assets/image-20201007211441036.png" alt="image-20201007211441036" style="zoom:50%;" />
-
-如图，红色部分表示悬崖，数字代表智能体能够观测到的位置信息，即observation，总共会有0-47等48个不同的值，智能体再移动中会有以下限制：
-
-* 智能体不能移出网格，如果智能体想执行某个动作移出网格，那么这一步智能体不会移动，但是这个操作依然会得到-1单位的奖励
-
-* 如果智能体“掉入悬崖” ，会立即回到起点位置，并得到-100单位的奖励
-
-* 当智能体移动到终点时，该回合结束，该回合总奖励为各步奖励之和
-
-实际的仿真界面如下：
-
-<img src="assets/image-20201007211858925.png" alt="image-20201007211858925" style="zoom:50%;" />
-
-由于从起点到终点最少需要13步，每步得到-1的reward，因此最佳训练算法下，每个episode下reward总和应该为-13。
--- a/codes/common/model.py
+++ b/codes/common/model.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 21:14:12
 LastEditor: John
-LastEditTime: 2021-03-20 16:44:00
+LastEditTime: 2021-03-23 16:35:46
 Discription: 
 Environment: 
 '''
@@ -13,24 +13,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributions import Categorical
-class MLP1(nn.Module):
-    ''' 多层感知机
-        输入：state维度
-        输出：概率
-    '''
-    def __init__(self,n_states,hidden_dim = 36):
-        super(MLP1, self).__init__()
-        # 24和36为hidden layer的层数，可根据state_dim, n_actions的情况来改变
-        self.fc1 = nn.Linear(n_states, hidden_dim)
-        self.fc2 = nn.Linear(hidden_dim,hidden_dim)
-        self.fc3 = nn.Linear(hidden_dim, 1)  # Prob of Left
-
-    def forward(self, x):
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        x = F.sigmoid(self.fc3(x))
-        return x
-

 class MLP2(nn.Module):
    def __init__(self, n_states,n_actions,hidden_dim=128):
--- a/codes/env_info.md
+++ b/codes/env_info.md
@@ -1,13 +0,0 @@
-## 环境说明
-
-### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0)
-
-<img src="assets/image-20200820174307301.png" alt="image-20200820174307301" style="zoom:50%;" />
-
-通过向左或向右推车能够实现平衡，所以动作空间由两个动作组成。每进行一个step就会给一个reward，如果无法保持平衡那么done等于true，本次episode失败。理想状态下，每个episode至少能进行200个step，也就是说每个episode的reward总和至少为200，step数目至少为200
-
-### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0)
-
-<img src="assets/image-20200820174814084.png" alt="image-20200820174814084" style="zoom:50%;" />
-
-钟摆以随机位置开始，目标是将其摆动，使其保持向上直立。动作空间是连续的，值的区间为[-2,2]。每个step给的reward最低为-16.27，最高为0。目前最好的成绩是100个episode的reward之和为-123.11 ± 6.86。
--- a/codes/envs/assets/action_grid.png
+++ b/codes/envs/assets/action_grid.png
--- a/codes/envs/assets/image-20200820174307301.png
+++ b/codes/envs/assets/image-20200820174307301.png
--- a/codes/envs/assets/image-20200820174814084.png
+++ b/codes/envs/assets/image-20200820174814084.png
--- a/codes/QLearning/assets/image-20201007211441036.png
+++ b/codes/QLearning/assets/image-20201007211441036.png
--- a/codes/QLearning/assets/image-20201007211858925.png
+++ b/codes/QLearning/assets/image-20201007211858925.png
--- a/codes/envs/assets/track_big.png
+++ b/codes/envs/assets/track_big.png
--- a/codes/envs/gym_info.md
+++ b/codes/envs/gym_info.md
@@ -0,0 +1,33 @@
+## 环境说明
+
+### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0)
+
+<img src="assets/image-20200820174307301.png" alt="image-20200820174307301" style="zoom:50%;" />
+
+通过向左或向右推车能够实现平衡，所以动作空间由两个动作组成。每进行一个step就会给一个reward，如果无法保持平衡那么done等于true，本次episode失败。理想状态下，每个episode至少能进行200个step，也就是说每个episode的reward总和至少为200，step数目至少为200
+
+### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0)
+
+<img src="assets/image-20200820174814084.png" alt="image-20200820174814084" style="zoom:50%;" />
+
+钟摆以随机位置开始，目标是将其摆动，使其保持向上直立。动作空间是连续的，值的区间为[-2,2]。每个step给的reward最低为-16.27，最高为0。目前最好的成绩是100个episode的reward之和为-123.11 ± 6.86。
+
+### CliffWalking-v0
+
+悬崖寻路问题（CliffWalking）是指在一个4 x 12的网格中，智能体以网格的左下角位置为起点，以网格的右下角位置为终点，目标是移动智能体到达终点位置，智能体每次可以在上、下、左、右这4个方向中移动一步，每移动一步会得到-1单位的奖励。
+
+<img src="./assets/image-20201007211441036.png" alt="image-20201007211441036" style="zoom:50%;" />
+
+如图，红色部分表示悬崖，数字代表智能体能够观测到的位置信息，即observation，总共会有0-47等48个不同的值，智能体在移动中会有以下限制：
+
+* 智能体不能移出网格，如果智能体想执行某个动作移出网格，那么这一步智能体不会移动，但是这个操作依然会得到-1单位的奖励
+
+* 如果智能体“掉入悬崖” ，会立即回到起点位置，并得到-100单位的奖励
+
+* 当智能体移动到终点时，该回合结束，该回合总奖励为各步奖励之和
+
+实际的仿真界面如下：
+
+<img src="./assets/image-20201007211858925.png" alt="image-20201007211858925" style="zoom:50%;" />
+
+由于从起点到终点最少需要13步，每步得到-1的reward，因此最佳训练算法下，每个episode下reward总和应该为-13。
--- a/codes/envs/racetrack_env.md
+++ b/codes/envs/racetrack_env.md
@@ -0,0 +1,37 @@
+## The Racetrack Environment
+We have implemented a custom environment called "Racetrack" for you to use during this piece of coursework. It is inspired by the environment described in the course textbook (Reinforcement Learning, Sutton & Barto, 2018, Exercise 5.12), but is not exactly the same.
+
+### Environment Description
+
+Consider driving a race car around a turn on a racetrack. In order to complete the race as quickly as possible, you would want to drive as fast as you can but, to avoid running off the track, you must slow down while turning.
+
+In our simplified racetrack environment, the agent is at one of a discrete set of grid positions. The agent also has a discrete speed in two directions, $x$ and $y$. So the state is represented as follows:
+$$(\text{position}_y, \text{position}_x, \text{velocity}_y, \text{velocity}_x)$$
+
+The agent collects a reward of -1 at each time step, an additional -10 for leaving the track (i.e., ending up on a black grid square in the figure below), and an additional +10 for reaching the finish line (any of the red grid squares). The agent starts each episode in a randomly selected grid-square on the starting line (green grid squares) with a speed of zero in both directions. At each time step, the agent can change its speed in both directions. Each speed can be changed by +1, -1 or 0, giving a total of nine actions. For example, the agent may change its speed in the $x$ direction by -1 and its speed in the $y$ direction by +1. The agent's speed cannot be greater than +10 or less than -10 in either direction.
+
+<img src="assets/track_big.png" style="width: 600px;"/>
+
+
+The agent's next state is determined by its current grid square, its current speed in two directions, and the changes it makes to its speed in the two directions. This environment is stochastic. When the agent tries to change its speed, no change occurs (in either direction) with probability 0.2. In other words, 20% of the time, the agent's action is ignored and the car's speed remains the same in both directions.
+
+If the agent leaves the track, it is returned to a random start grid-square and has its speed set to zero in both directions; the episode continues. An episode ends only when the agent transitions to a goal grid-square.
+
+
+
+### Environment Implementation
+We have implemented the above environment in the `racetrack_env.py` file, for you to use in this coursework. Please use this implementation instead of writing your own, and please do not modify the environment.
+
+We provide a `RacetrackEnv` class for your agents to interact with. The class has the following methods:
+- **`reset()`** - this method initialises the environment, chooses a random starting state, and returns it. This method should be called before the start of every episode.
+- **`step(action)`** - this method takes an integer action (more on this later), and executes one time-step in the environment. It returns a tuple containing the next state, the reward collected, and whether the next state is a terminal state.
+- **`render(sleep_time)`** - this method renders a matplotlib graph representing the environment. It takes an optional float parameter giving the number of seconds to display each time-step. This method is useful for testing and debugging, but should not be used during training since it is *very* slow. **Do not use this method in your final submission**.
+- **`get_actions()`** - a simple method that returns the available actions in the current state. Always returns a list containing integers in the range [0-8] (more on this later).
+
+In our code, states are represented as Python tuples - specifically a tuple of four integers. For example, if the agent is in a grid square with coordinates ($Y = 2$, $X = 3$), and is moving zero cells vertically and one cell horizontally per time-step, the state is represented as `(2, 3, 0, 1)`. Tuples of this kind will be returned by the `reset()` and `step(action)` methods.
+
+There are nine actions available to the agent in each state, as described above. However, to simplify your code, we have represented each of the nine actions as an integer in the range [0-8]. The table below shows the index of each action, along with the corresponding changes it will cause to the agent's speed in each direction.
+
+<img src="assets/action_grid.png" style="width: 250px;"/>
+
+For example, taking action 8 will increase the agent's speed in the $x$ direction, but decrease its speed in the $y$ direction.