add some code

2020-07-19 17:39:59 +08:00
parent 12fd207331
commit 58c4797676
10 changed files with 735 additions and 15 deletions
--- a/docs/code/Sarsa/train.py
+++ b/docs/code/Sarsa/train.py
@@ -0,0 +1,92 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -*- coding: utf-8 -*-
+
+import gym
+from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from agent import SarsaAgent
+import time
+
+
+def run_episode(env, agent, render=False):
+    total_steps = 0  # 记录每个episode走了多少step
+    total_reward = 0
+
+    obs = env.reset()  # 重置环境, 重新开一局（即开始新的一个episode）
+    action = agent.sample(obs)  # 根据算法选择一个动作
+
+    while True:
+        next_obs, reward, done, _ = env.step(action)  # 与环境进行一个交互
+        next_action = agent.sample(next_obs)  # 根据算法选择一个动作
+        # 训练 Sarsa 算法
+        agent.learn(obs, action, reward, next_obs, next_action, done)
+
+        action = next_action
+        obs = next_obs  # 存储上一个观察值
+        total_reward += reward
+        total_steps += 1  # 计算step数
+        if render:
+            env.render()  #渲染新的一帧图形
+        if done:
+            break
+    return total_reward, total_steps
+
+
+def test_episode(env, agent):
+    total_reward = 0
+    obs = env.reset()
+    while True:
+        action = agent.predict(obs)  # greedy，只取最优的动作
+        next_obs, reward, done, _ = env.step(action)
+        total_reward += reward
+        obs = next_obs
+        time.sleep(0.5)  # 每个step延迟0.5秒来看看效果
+        env.render()
+        if done:
+            print('test reward = %.1f' % (total_reward))
+            break
+
+
+def main():
+    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    # env = FrozenLakeWapper(env)
+
+    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    env = CliffWalkingWapper(env)  # 这行不加也可以，这个是为了显示效果更好一点
+    
+    agent = SarsaAgent(
+        obs_n=env.observation_space.n,
+        act_n=env.action_space.n,
+        learning_rate=0.1,
+        gamma=0.9,
+        e_greed=0.1)
+
+    is_render = False
+    for episode in range(500):
+        ep_reward, ep_steps = run_episode(env, agent, is_render)
+        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
+                                                          ep_reward))
+
+        # 每隔20个episode渲染一下看看效果（每个episode都渲染的话，时间会比较长）
+        if episode % 20 == 0:
+            is_render = True
+        else:
+            is_render = False
+    # 训练结束，查看算法效果
+    test_episode(env, agent)
+
+
+if __name__ == "__main__":  # run training only when executed as a script
+    main()