diff --git a/codes/QLearning/README.md b/codes/QLearning/README.md
new file mode 100644
index 0000000..0f6a87c
--- /dev/null
+++ b/codes/QLearning/README.md
@@ -0,0 +1,19 @@
+## The CliffWalking-v0 Environment
+
+In the cliff walking problem (CliffWalking) the agent acts in a 4 x 12 grid, starting at the bottom-left corner with the bottom-right corner as the goal. The task is to move the agent to the goal; at each step it moves one cell up, down, left or right and receives a reward of -1 per move.
+
+![image-20201007211441036](assets/image-20201007211441036.png)
+
+As shown in the figure, the red cells are the cliff, and the numbers are the position information the agent can observe (the observation), which takes 48 distinct values from 0 to 47. The agent's movement is constrained as follows:
+
+* The agent cannot leave the grid. If an action would move it off the grid, the agent stays where it is, but the step still yields a reward of -1.
+
+* If the agent "falls off the cliff", it is immediately sent back to the start and receives a reward of -100.
+
+* When the agent reaches the goal, the episode ends, and the episode return is the sum of the per-step rewards.
+
+The actual rendered interface looks like this:
+
+![image-20201007211858925](assets/image-20201007211858925.png)
+
+Since the shortest path from start to goal takes 13 steps and every step yields a reward of -1, a well-trained agent should reach a total reward of -13 per episode.
\ No newline at end of file
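To make the reward structure described in the README concrete, here is a minimal interaction sketch (not part of this diff) that rolls out one random-action episode in CliffWalking-v0. It assumes the classic gym step/reset API that the code below also uses (`step` returning `obs, reward, done, info`); the step cap is an addition of this sketch so that a purely random policy cannot loop for too long.

```python
import gym

# Minimal sketch: one episode of CliffWalking-v0 with random actions.
# Assumes the classic gym API (reset() -> obs, step() -> obs, reward, done, info),
# which is the API main.py in this diff relies on.
env = gym.make("CliffWalking-v0")
obs = env.reset()                       # observation is an integer in [0, 47]
total_reward, steps = 0, 0
while True:
    action = env.action_space.sample()  # 0 up, 1 right, 2 down, 3 left
    obs, reward, done, info = env.step(action)
    total_reward += reward              # -1 per move, -100 for falling off the cliff
    steps += 1
    if done or steps >= 1000:           # done is only True at the goal state
        break
print("steps: {}, return: {}".format(steps, total_reward))
```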
diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py
new file mode 100644
index 0000000..3e9fb2d
--- /dev/null
+++ b/codes/QLearning/agent.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-09-11 23:03:00
+LastEditor: John
+LastEditTime: 2021-03-11 19:16:27
+Description:
+Environment:
+'''
+import numpy as np
+import math
+import torch
+from collections import defaultdict
+
+class QLearning(object):
+    def __init__(self,
+                 n_actions, cfg):
+        self.n_actions = n_actions  # number of actions
+        self.lr = cfg.lr  # learning rate
+        self.gamma = cfg.gamma
+        self.epsilon = 0
+        self.sample_count = 0  # epsilon decays with the number of samples drawn during training, so keep a counter
+        self.epsilon_start = cfg.epsilon_start
+        self.epsilon_end = cfg.epsilon_end
+        self.epsilon_decay = cfg.epsilon_decay
+        self.Q_table = defaultdict(lambda: np.zeros(n_actions))  # store the Q table as a dict; the 2-D array below also works but needs extra code changes
+        # self.Q_table = np.zeros((n_states, n_actions))  # Q table
+    def choose_action(self, state):
+        self.sample_count += 1
+        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
+            math.exp(-1. * self.sample_count / self.epsilon_decay)
+        # draw a value in [0, 1); if it exceeds epsilon act greedily, otherwise act randomly
+        if np.random.uniform(0, 1) > self.epsilon:
+            action = np.argmax(self.Q_table[state])
+        else:
+            action = np.random.choice(self.n_actions)  # explore: pick a random action with probability epsilon
+        return action
+
+    def update(self, state, action, reward, next_state, done):
+        Q_predict = self.Q_table[state][action]
+        if done:
+            Q_target = reward  # terminal state
+        else:
+            Q_target = reward + self.gamma * np.max(
+                self.Q_table[next_state])  # Q-learning target
+        self.Q_table[state][action] += self.lr * (Q_target - Q_predict)
+    def save(self, path):
+        '''save the Q table to a file
+        '''
+        import dill
+        torch.save(
+            obj=self.Q_table,
+            f=path,
+            pickle_module=dill
+        )
+
+    def load(self, path):
+        '''load the Q table from a file
+        '''
+        import dill
+        self.Q_table = torch.load(f=path, pickle_module=dill)
\ No newline at end of file
diff --git a/codes/QLearning/assets/image-20201007211441036.png b/codes/QLearning/assets/image-20201007211441036.png
new file mode 100644
index 0000000..ae5b0f8
Binary files /dev/null and b/codes/QLearning/assets/image-20201007211441036.png differ
diff --git a/codes/QLearning/assets/image-20201007211858925.png b/codes/QLearning/assets/image-20201007211858925.png
new file mode 100644
index 0000000..0bbb5b2
Binary files /dev/null and b/codes/QLearning/assets/image-20201007211858925.png differ
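The epsilon schedule in `choose_action` decays exploration exponentially with the number of action selections. The standalone sketch below (illustrative only, not part of the repository; the helper name `epsilon_at` is made up) evaluates that schedule with the default hyperparameters defined in main.py further down in this diff (epsilon_start=0.99, epsilon_end=0.01, epsilon_decay=200), to show how quickly the agent shifts from random exploration to mostly greedy behaviour.

```python
import math

# Defaults taken from get_args() in main.py below.
epsilon_start, epsilon_end, epsilon_decay = 0.99, 0.01, 200

def epsilon_at(sample_count: int) -> float:
    """Exponentially decay epsilon from epsilon_start towards epsilon_end."""
    return epsilon_end + (epsilon_start - epsilon_end) * \
        math.exp(-1. * sample_count / epsilon_decay)

for t in (1, 100, 200, 500, 1000, 2000):
    print("sample_count={:5d}  epsilon={:.3f}".format(t, epsilon_at(t)))
# epsilon starts near 0.99 (mostly random exploration) and approaches 0.01
# (mostly greedy) after roughly a thousand action selections.
```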
diff --git a/codes/QLearning/main.py b/codes/QLearning/main.py
new file mode 100644
index 0000000..aa1e3b2
--- /dev/null
+++ b/codes/QLearning/main.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-09-11 23:03:00
+LastEditor: John
+LastEditTime: 2021-03-11 19:22:50
+Description:
+Environment:
+'''
+
+import sys,os
+sys.path.append(os.getcwd())  # add the current working directory to the module search path
+import argparse
+import gym
+import datetime
+from QLearning.plot import plot
+from QLearning.utils import save_results
+from envs.gridworld_env import CliffWalkingWapper, FrozenLakeWapper
+from QLearning.agent import QLearning
+
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
+
+def get_args():
+    '''training and algorithm hyperparameters
+    '''
+    parser = argparse.ArgumentParser()
+    '''training parameters'''
+    parser.add_argument("--n_episodes", default=500,
+                        type=int, help="maximum number of training episodes")
+    '''algorithm parameters'''
+    parser.add_argument("--gamma", default=0.9,
+                        type=float, help="discount factor for rewards")
+    parser.add_argument("--epsilon_start", default=0.99,
+                        type=float, help="initial epsilon of the e-greedy policy")
+    parser.add_argument("--epsilon_end", default=0.01,
+                        type=float, help="final epsilon of the e-greedy policy")
+    parser.add_argument("--epsilon_decay", default=200,
+                        type=float, help="decay rate of epsilon in the e-greedy policy")
+    parser.add_argument("--lr", default=0.1, type=float, help="learning rate")
+    config = parser.parse_args()
+    return config
+def train(cfg, env, agent):
+    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    # env = FrozenLakeWapper(env)
+    rewards = []  # rewards of all episodes
+    steps = []  # steps of all episodes
+    for i_episode in range(cfg.n_episodes):
+        ep_reward = 0  # reward of the current episode
+        ep_steps = 0  # number of steps in the current episode
+        obs = env.reset()  # reset the environment, i.e. start a new episode
+        while True:
+            action = agent.choose_action(obs)  # select an action with the algorithm
+            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
+            # train the Q-learning agent
+            agent.update(obs, action, reward, next_obs, done)  # Q-learning does not need the next action (unlike Sarsa)
+            obs = next_obs  # move to the next observation
+            ep_reward += reward
+            ep_steps += 1  # count steps
+            if done:
+                break
+        steps.append(ep_steps)
+        # moving average of the reward
+        if rewards:
+            rewards.append(rewards[-1]*0.9+ep_reward*0.1)
+        else:
+            rewards.append(ep_reward)
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, cfg.n_episodes, ep_reward))
+    plot(rewards)
+    if not os.path.exists(SAVED_MODEL_PATH):
+        os.makedirs(SAVED_MODEL_PATH)
+    agent.save(SAVED_MODEL_PATH+'Q_table.pkl')  # training finished, save the model
+    '''save rewards and related results'''
+    save_results(rewards, tag='train', result_path=RESULT_PATH)
+
+def eval(cfg, env, agent):
+    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    # env = FrozenLakeWapper(env)
+    rewards = []  # rewards of all episodes
+    steps = []  # steps of all episodes
+    for i_episode in range(20):
+        ep_reward = 0  # reward of the current episode
+        ep_steps = 0  # number of steps in the current episode
+        obs = env.reset()  # reset the environment, i.e. start a new episode
+        while True:
+            action = agent.choose_action(obs)  # select an action with the algorithm
+            next_obs, reward, done, _ = env.step(action)  # take one step in the environment
+            obs = next_obs  # move to the next observation
+            ep_reward += reward
+            ep_steps += 1  # count steps
+            if done:
+                break
+        steps.append(ep_steps)
+        # moving average of the reward
+        if rewards:
+            rewards.append(rewards[-1]*0.9+ep_reward*0.1)
+        else:
+            rewards.append(ep_reward)
+        print("Episode:{}/{}: reward:{:.1f}".format(i_episode+1, 20, ep_reward))
+    plot(rewards)
+    '''save rewards and related results'''
+    save_results(rewards, tag='eval', result_path=RESULT_PATH)
+
+if __name__ == "__main__":
+    cfg = get_args()
+    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    env = CliffWalkingWapper(env)
+    n_actions = env.action_space.n
+    agent = QLearning(n_actions, cfg)
+    train(cfg, env, agent)
+    eval(cfg, env, agent)
+
diff --git a/codes/QLearning/plot.py b/codes/QLearning/plot.py
new file mode 100644
index 0000000..e64ceba
--- /dev/null
+++ b/codes/QLearning/plot.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-10-07 20:57:11
+LastEditor: John
+LastEditTime: 2020-10-07 21:00:29
+Description:
+Environment:
+'''
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import os
+
+def plot(item, ylabel='rewards'):
+    sns.set()
+    plt.figure()
+    plt.plot(np.arange(len(item)), item)
+    plt.title(ylabel+' of Q-learning')
+    plt.ylabel(ylabel)
+    plt.xlabel('episodes')
+    plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
+    plt.show()
+
+if __name__ == "__main__":
+
+    # standalone entry point: plot the training rewards saved by utils.save_results
+    output_path = os.path.dirname(__file__)+"/result/"
+    rewards = np.load(output_path+"rewards_train.npy")
+    plot(rewards)
\ No newline at end of file
diff --git a/codes/QLearning/result/20210311-192256/rewards_eval.npy b/codes/QLearning/result/20210311-192256/rewards_eval.npy
new file mode 100644
index 0000000..9bee5e4
Binary files /dev/null and b/codes/QLearning/result/20210311-192256/rewards_eval.npy differ
diff --git a/codes/QLearning/result/20210311-192256/rewards_train.npy b/codes/QLearning/result/20210311-192256/rewards_train.npy
new file mode 100644
index 0000000..9395542
Binary files /dev/null and b/codes/QLearning/result/20210311-192256/rewards_train.npy differ
diff --git a/codes/QLearning/result/rewards.png b/codes/QLearning/result/rewards.png
new file mode 100644
index 0000000..4acca82
Binary files /dev/null and b/codes/QLearning/result/rewards.png differ
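Both train() and eval() log an exponentially smoothed return rather than the raw episode return: each value appended to `rewards` is 0.9 times the previous entry plus 0.1 times the new episode reward. The short sketch below is illustrative only and not part of the repository; the helper name `moving_average` and the sample numbers are made up, chosen to drift towards the optimal -13 return mentioned in the README.

```python
import numpy as np

def moving_average(ep_rewards, alpha=0.1):
    """Smooth episode returns the same way train()/eval() in main.py do:
    new = (1 - alpha) * previous + alpha * current, starting from the first value."""
    smoothed = []
    for r in ep_rewards:
        if smoothed:
            smoothed.append((1 - alpha) * smoothed[-1] + alpha * r)
        else:
            smoothed.append(r)
    return smoothed

# Made-up returns converging towards the optimal -13.
raw = [-120, -80, -40, -25, -17, -13, -13, -13]
print(np.round(moving_average(raw), 1))
```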
diff --git a/codes/QLearning/saved_model/20210311-192256/Q_table.pkl b/codes/QLearning/saved_model/20210311-192256/Q_table.pkl
new file mode 100644
index 0000000..159318d
Binary files /dev/null and b/codes/QLearning/saved_model/20210311-192256/Q_table.pkl differ
diff --git a/codes/QLearning/utils.py b/codes/QLearning/utils.py
new file mode 100644
index 0000000..d777986
--- /dev/null
+++ b/codes/QLearning/utils.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-23 13:44:52
+LastEditor: John
+LastEditTime: 2021-03-11 19:18:34
+Description:
+Environment:
+'''
+import os
+import numpy as np
+
+
+def save_results(rewards, tag='train', result_path='./result'):
+    '''save rewards and related results
+    '''
+    if not os.path.exists(result_path):  # create the result folder if it does not exist
+        os.makedirs(result_path)
+    np.save(os.path.join(result_path, 'rewards_'+tag+'.npy'), rewards)
+    print('results saved!')
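As a quick sanity check of the artifacts checked in under saved_model/ and result/, the hedged sketch below loads the pickled Q table the same way `QLearning.load` does (torch.load with dill as the pickle module) together with the saved training rewards array. The timestamped folder name is simply the one that appears in this diff and will differ on every run (it comes from SEQUENCE in main.py), and the sketch assumes torch/dill versions compatible with those used by the repository.

```python
import numpy as np
import torch
import dill

# Hedged sketch: inspect the artifacts added in this diff. The timestamped
# directory (20210311-192256) is run-specific and will differ on your machine.
q_path = "codes/QLearning/saved_model/20210311-192256/Q_table.pkl"
r_path = "codes/QLearning/result/20210311-192256/rewards_train.npy"

# Mirrors QLearning.load: the Q table is a defaultdict pickled via dill.
q_table = torch.load(f=q_path, pickle_module=dill)
rewards = np.load(r_path)

print("number of visited states:", len(q_table))
some_state = next(iter(q_table))
print("Q values for state {}: {}".format(some_state, q_table[some_state]))
print("final smoothed training reward: {:.1f}".format(rewards[-1]))
```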