update projects

johnjim0816
2022-07-31 23:42:12 +08:00
parent e9b3e92141
commit ffab9e3028
236 changed files with 370 additions and 133 deletions


@@ -0,0 +1,195 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
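        # map encoding: S = start, F = frozen surface (safe), H = hole, G = goal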
    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
env = FrozenLakeWapper(env)
return env
class FrozenLakeWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.max_y = env.desc.shape[0]
self.max_x = env.desc.shape[1]
self.t = None
self.unit = 50
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for _ in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
        if self.t is None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for i in range(self.desc.shape[0]):
for j in range(self.desc.shape[1]):
x = j
y = self.max_y - 1 - i
if self.desc[i][j] == b'S': # Start
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'F': # Frozen ice
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'G': # Goal
self.draw_box(x, y, 'yellow')
elif self.desc[i][j] == b'H': # Hole
self.draw_box(x, y, 'black')
else:
self.draw_box(x, y, 'white')
self.t.shape('turtle')
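        # self.s is the discrete state index (row * max_x + col); convert it to turtle
        # coordinates, flipping y so that row 0 of the map is drawn at the top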
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
        if self.t is None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
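            # bottom row: the cells between start and goal form the cliff (black); the bottom-right cell is the goal (yellow)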
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
    # Environment 1: FrozenLake; whether the ice surface is slippery is configurable
# 0 left, 1 down, 2 right, 3 up
env = gym.make("FrozenLake-v0", is_slippery=False)
env = FrozenLakeWapper(env)
    # Environment 2: CliffWalking, the cliff-walking environment
# env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
# env = CliffWalkingWapper(env)
    # Environment 3: custom grid world; the map is configurable: S = start, F = floor, H = hole, G = goal
# gridmap = [
# 'SFFF',
# 'FHFF',
# 'FFFF',
# 'HFGF' ]
# env = GridWorld(gridmap)
env.reset()
for step in range(10):
action = np.random.randint(0, 4)
obs, reward, done, info = env.step(action)
print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
step, action, obs, reward, done, info))
        # env.render()  # render one frame

(Two binary image files were added, 23 KiB and 34 KiB; contents not shown.)


@@ -0,0 +1,60 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2021-12-22 10:54:57
Description: use defaultdict to define the Q table
Environment:
'''
import numpy as np
import math
import torch
from collections import defaultdict
class QLearning(object):
    def __init__(self, n_states, n_actions, cfg):
self.n_actions = n_actions
        self.lr = cfg.lr  # learning rate
        self.gamma = cfg.gamma  # discount factor
self.epsilon = 0
self.sample_count = 0
self.epsilon_start = cfg.epsilon_start
self.epsilon_end = cfg.epsilon_end
self.epsilon_decay = cfg.epsilon_decay
        self.Q_table = defaultdict(lambda: np.zeros(n_actions))  # Q table: a dict mapping state -> array of action values
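        # note: with defaultdict, any state seen for the first time is automatically given a zero value for every action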
def choose_action(self, state):
self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)  # epsilon decays over time; exponential decay is used here
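        # with the defaults used in this repo (epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300),
        # epsilon is roughly 0.36 after 300 samples and roughly 0.06 after 900 samples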
        # epsilon-greedy policy
        if np.random.uniform(0, 1) > self.epsilon:
            action = np.argmax(self.Q_table[str(state)])  # exploit: choose the action with the largest Q(s, a)
        else:
            action = np.random.choice(self.n_actions)  # explore: choose a random action
return action
def predict(self,state):
action = np.argmax(self.Q_table[str(state)])
return action
def update(self, state, action, reward, next_state, done):
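        # TD target: reward if the episode terminates, otherwise reward + gamma * max_a' Q(next_state, a');
        # Q(state, action) is then moved toward the target by a step of size lr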
Q_predict = self.Q_table[str(state)][action]
        if done:  # terminal state: do not bootstrap from next_state
Q_target = reward
else:
Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
    def save(self, path):
        import dill
        torch.save(
            obj=self.Q_table,
            f=path + "Qlearning_model.pkl",
            pickle_module=dill
        )
        print("Model saved successfully!")
    def load(self, path):
        import dill
        self.Q_table = torch.load(f=path + 'Qlearning_model.pkl', pickle_module=dill)
        print("Model loaded successfully!")


@@ -0,0 +1,141 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-06-21 19:36:05
Description:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory containing this file
parent_path = os.path.dirname(curr_path)  # parent directory
sys.path.append(parent_path)  # add the parent directory to sys.path
import gym
import torch
import datetime
from env.gridworld_env import CliffWalkingWapper
from qlearning import QLearning
from common.utils import plot_rewards
from common.utils import save_results,make_dir
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
class Config:
    '''Hyperparameters
    '''
    def __init__(self):
        ################################## environment hyperparameters ##################################
        self.algo_name = 'Q-learning'  # algorithm name
        self.env_name = 'CliffWalking-v0'  # environment name
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # check whether a GPU is available
        self.seed = 10  # random seed; 0 means no seed is set
        self.train_eps = 400  # number of training episodes
        self.test_eps = 30  # number of test episodes
        ################################################################################
        ################################## algorithm hyperparameters ##################################
        self.gamma = 0.90  # discount factor
        self.epsilon_start = 0.95  # initial epsilon for the epsilon-greedy policy
        self.epsilon_end = 0.01  # final epsilon for the epsilon-greedy policy
        self.epsilon_decay = 300  # decay rate (time constant) of epsilon
        self.lr = 0.1  # learning rate
        ################################################################################
        ################################# parameters for saving results ################################
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # path for saving results
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures
################################################################################
def train(cfg,env,agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    ma_rewards = []  # record the moving-average reward
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # total reward of the current episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # choose an action according to the algorithm
            next_state, reward, done, _ = env.step(action)  # take one step in the environment
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # move to the next state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
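        # exponential moving average of the episode reward (smoothing factor 0.1), giving a smoother learning curve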
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
print('完成训练!')
return rewards,ma_rewards
def test(cfg,env,agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    ma_rewards = []  # record the moving-average reward
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # total reward of the current episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.predict(state)  # choose a greedy action from the learned Q table
            next_state, reward, done, _ = env.step(action)  # take one step in the environment
            state = next_state  # move to the next state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
else:
ma_rewards.append(ep_reward)
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards,ma_rewards
def env_agent_config(cfg,seed=1):
    '''Create the environment and the agent
    Args:
        cfg ([type]): [description]
        seed (int, optional): random seed. Defaults to 1.
    Returns:
        env [type]: the environment
        agent : the agent
    '''
    env = gym.make(cfg.env_name)
    env = CliffWalkingWapper(env)
    env.seed(seed)  # set the random seed
    n_states = env.observation_space.n  # number of states
    n_actions = env.action_space.n  # number of actions
agent = QLearning(n_states,n_actions,cfg)
return env,agent
if __name__ == "__main__":
cfg = Config()
    # training
    env, agent = env_agent_config(cfg, seed=1)
    rewards, ma_rewards = train(cfg, env, agent)
    make_dir(cfg.result_path, cfg.model_path)  # create folders for saving results and models
    agent.save(path=cfg.model_path)  # save the model
    save_results(rewards, ma_rewards, tag='train',
                 path=cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot results
    # testing
    env, agent = env_agent_config(cfg, seed=10)
    agent.load(path=cfg.model_path)  # load the model
    rewards, ma_rewards = test(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='test', path=cfg.result_path)  # save results
    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot results