diff --git a/codes/Q-learning/README.md b/codes/Q-learning/README.md
index 4fb7748..be11391 100644
--- a/codes/Q-learning/README.md
+++ b/codes/Q-learning/README.md
@@ -16,4 +16,23 @@
 
 ![](assets/cliffwalking_2.png)
 
-由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。
\ No newline at end of file
+由于从起点到终点最少需要13步,每步得到-1的reward,因此最佳训练算法下,每个episode下reward总和应该为-13。
+
+
+## 使用
+
+train:
+
+```python
+python main.py
+```
+
+eval:
+
+```python
+python main.py --train 0
+```
+tensorboard:
+```python
+tensorboard --logdir logs
+```
\ No newline at end of file
diff --git a/codes/Q-learning/agent.py b/codes/Q-learning/agent.py
index 6c21251..cbad64e 100644
--- a/codes/Q-learning/agent.py
+++ b/codes/Q-learning/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2020-10-07 20:48:29
+LastEditTime: 2020-11-24 20:22:03
 Discription:
 Environment:
 '''
@@ -81,14 +81,11 @@ class QLearning(object):
             self.Q_table[next_obs, :])  # Q_table-learning
         self.Q_table[obs, action] += self.lr * (Q_target - Q_predict)  # 修正q
 
-    def save(self):
+    def save_model(self,path):
         '''把 Q表格 的数据保存到文件中
         '''
-        npy_file = './result/Q_table.npy'
-        np.save(npy_file, self.Q_table)
-        print(npy_file + ' saved.')
-    def load(self, npy_file='./result/Q_table.npy'):
+        np.save(path, self.Q_table)
+    def load_model(self, path):
         '''从文件中读取数据到 Q表格
         '''
-        self.Q_table = np.load(npy_file)
-        print(npy_file + 'loaded.')
+        self.Q_table = np.load(path)
diff --git a/codes/Q-learning/gridworld.py b/codes/Q-learning/env.py
similarity index 96%
rename from codes/Q-learning/gridworld.py
rename to codes/Q-learning/env.py
index 31d968f..373e3f4 100644
--- a/codes/Q-learning/gridworld.py
+++ b/codes/Q-learning/env.py
@@ -18,10 +18,14 @@ import gym
 import turtle
 import numpy as np
 
-# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
+def env_init_1():
+    ''' 初始化CliffWalking-v0环境
+    '''
+    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
+    env = CliffWalkingWapper(env)
+    return env
 
-
-def GridWorld(gridmap=None, is_slippery=False):
+def env_init_2(gridmap=None, is_slippery=False):
     if gridmap is None:
         gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
     env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
diff --git a/codes/Q-learning/main.py b/codes/Q-learning/main.py
index 431ed09..9f85d2f 100644
--- a/codes/Q-learning/main.py
+++ b/codes/Q-learning/main.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2020-10-07 21:05:33
+LastEditTime: 2020-11-24 19:56:23
 Discription:
 Environment:
 '''
@@ -26,35 +26,23 @@ Environment:
 
 # -*- coding: utf-8 -*-
 import gym
-from gridworld import CliffWalkingWapper, FrozenLakeWapper
+from env import CliffWalkingWapper, FrozenLakeWapper
 from agent import QLearning
 import os
 import numpy as np
 import argparse
 import time
 import matplotlib.pyplot as plt
-def get_args():
-    '''训练的模型参数
-    '''
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--gamma", default=0.9,
-                        type=float, help="reward 的衰减率")
-    parser.add_argument("--epsilon_start", default=0.9,
-                        type=float,help="e-greedy策略中初始epsilon")
-    parser.add_argument("--epsilon_end", default=0.1, type=float,help="e-greedy策略中的结束epsilon")
-    parser.add_argument("--epsilon_decay", default=200, type=float,help="e-greedy策略中epsilon的衰减率")
-    parser.add_argument("--policy_lr", default=0.1, type=float,help="学习率")
-    parser.add_argument("--max_episodes", default=500, type=int,help="训练的最大episode数目")
-
-    config = parser.parse_args()
-
-    return config
+from env import env_init_1
+from params import get_args
+from params import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
+from utils import save_results,save_model
+from plot import plot
 
 def train(cfg):
-    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
-    # env = FrozenLakeWapper(env)
-    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
-    env = CliffWalkingWapper(env)
+    '''# env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
+    # env = FrozenLakeWapper(env)'''
+    env = env_init_1()
     agent = QLearning(
         obs_dim=env.observation_space.n,
         action_dim=env.action_space.n,
@@ -84,7 +72,7 @@
             break
         steps.append(ep_steps)
        rewards.append(ep_reward)
-        # 计算滑动平均的reward
+        '''计算滑动平均的reward'''
         if i_episode == 1:
             MA_rewards.append(ep_reward)
         else:
@@ -92,20 +80,17 @@
                 0.9*MA_rewards[-1]+0.1*ep_reward)
         print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
               (i_episode, ep_steps, ep_reward,agent.epsilon))
-        # 每隔20个episode渲染一下看看效果
+        '''每隔20个episode渲染一下看看效果'''
         if i_episode % 20 == 0:
             render = True
         else:
             render = False
-    agent.save()  # 训练结束,保存模型
-
-    output_path = os.path.dirname(__file__)+"/result/"
-    # 检测是否存在文件夹
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    np.save(output_path+"rewards_train.npy", rewards)
-    np.save(output_path+"MA_rewards_train.npy", MA_rewards)
-    np.save(output_path+"steps_train.npy", steps)
+    print('Complete training!')
+    save_model(agent,model_path=SAVED_MODEL_PATH)
+    '''存储reward等相关结果'''
+    save_results(rewards,MA_rewards,tag='train',result_path=RESULT_PATH)
+    plot(rewards)
+    plot(MA_rewards,ylabel='moving_average_rewards_train')
 
 
 def test(cfg):
@@ -144,12 +129,23 @@
             MA_rewards.append(
                 0.9*MA_rewards[-1]+0.1*ep_reward)
         print('Episode %s: steps = %s , reward = %.1f' % (i_episode, ep_steps, ep_reward))
-    plt.plot(MA_rewards)
-    plt.show()
+    print('Complete training!')
+    save_model(agent,model_path=SAVED_MODEL_PATH)
+    '''存储reward等相关结果'''
+    save_results(rewards,MA_rewards,tag='train',result_path=RESULT_PATH)
+    plot(rewards)
+    plot(MA_rewards,ylabel='moving_average_rewards_train')
+
 def main():
     cfg = get_args()
     # train(cfg)
     test(cfg)
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    cfg = get_args()
+    if cfg.train:
+        train(cfg)
+        eval(cfg)
+    else:
+        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
+        eval(cfg,saved_model_path=model_path)
\ No newline at end of file
diff --git a/codes/Q-learning/params.py b/codes/Q-learning/params.py
new file mode 100644
index 0000000..89cbee7
--- /dev/null
+++ b/codes/Q-learning/params.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-24 19:45:58
+LastEditor: John
+LastEditTime: 2020-11-24 19:53:13
+Discription:
+Environment:
+'''
+import argparse
+import datetime
+import os
+
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
+
+def get_args():
+    '''训练的模型参数
+    '''
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train", default=1, type=int)  # 1 表示训练,0表示只进行eval
+    parser.add_argument("--gamma", default=0.9,
+                        type=float, help="reward 的衰减率")
+    parser.add_argument("--epsilon_start", default=0.9,
+                        type=float,help="e-greedy策略中初始epsilon")
+    parser.add_argument("--epsilon_end", default=0.1, type=float,help="e-greedy策略中的结束epsilon")
+    parser.add_argument("--epsilon_decay", default=200, type=float,help="e-greedy策略中epsilon的衰减率")
+    parser.add_argument("--policy_lr", default=0.1, type=float,help="学习率")
+    parser.add_argument("--max_episodes", default=500, type=int,help="训练的最大episode数目")
+
+    config = parser.parse_args()
+
+    return config
\ No newline at end of file
diff --git a/codes/Q-learning/result/20201124-201903/moving_average_rewards_train.npy b/codes/Q-learning/result/20201124-201903/moving_average_rewards_train.npy
new file mode 100644
index 0000000..3804bd9
Binary files /dev/null and b/codes/Q-learning/result/20201124-201903/moving_average_rewards_train.npy differ
diff --git a/codes/Q-learning/result/20201124-201903/rewards_train.npy b/codes/Q-learning/result/20201124-201903/rewards_train.npy
new file mode 100644
index 0000000..3029df3
Binary files /dev/null and b/codes/Q-learning/result/20201124-201903/rewards_train.npy differ
diff --git a/codes/Q-learning/result/Q_table.npy b/codes/Q-learning/result/Q_table.npy
index 63662d0..efc9546 100644
Binary files a/codes/Q-learning/result/Q_table.npy and b/codes/Q-learning/result/Q_table.npy differ
diff --git a/codes/Q-learning/result/moving_average_rewards_train.png b/codes/Q-learning/result/moving_average_rewards_train.png
new file mode 100644
index 0000000..f72ef4d
Binary files /dev/null and b/codes/Q-learning/result/moving_average_rewards_train.png differ
diff --git a/codes/Q-learning/result/rewards.png b/codes/Q-learning/result/rewards.png
index b82663d..3a1005f 100644
Binary files a/codes/Q-learning/result/rewards.png and b/codes/Q-learning/result/rewards.png differ
diff --git a/codes/Q-learning/saved_model/20201124-201903/checkpoint.npy b/codes/Q-learning/saved_model/20201124-201903/checkpoint.npy
new file mode 100644
index 0000000..0a49f57
Binary files /dev/null and b/codes/Q-learning/saved_model/20201124-201903/checkpoint.npy differ
diff --git a/codes/Q-learning/utils.py b/codes/Q-learning/utils.py
new file mode 100644
index 0000000..f6fe640
--- /dev/null
+++ b/codes/Q-learning/utils.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-24 19:50:18
+LastEditor: John
+LastEditTime: 2020-11-24 20:20:46
+Discription:
+Environment:
+'''
+import os
+import numpy as np
+
+
+def save_results(rewards,moving_average_rewards,tag='train',result_path='./result'):
+    '''保存reward等结果
+    '''
+    if not os.path.exists(result_path):  # 检测是否存在文件夹
+        os.mkdir(result_path)
+    np.save(result_path+'rewards_'+tag+'.npy', rewards)
+    np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards)
+    print('results saved!')
+
+def save_model(agent,model_path='./saved_model'):
+    if not os.path.exists(model_path):  # 检测是否存在文件夹
+        os.mkdir(model_path)
+    agent.save_model(model_path+'checkpoint')
+    print('model saved!')
\ No newline at end of file
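For readers skimming the agent.py hunk above, the sketch below shows the tabular Q-learning update it touches together with the new `save_model`/`load_model` interface. Only the update rule and the `np.save`/`np.load` calls mirror the patch; the class name `TabularQ`, the `choose_action` helper, the `update` method name, and the constructor defaults are illustrative assumptions, not code from the repository.

```python
import numpy as np

class TabularQ:
    """Minimal tabular Q-learning sketch mirroring the update and the
    save_model/load_model(path) interface introduced in agent.py."""

    def __init__(self, obs_dim, action_dim, lr=0.1, gamma=0.9, epsilon=0.1):
        self.lr, self.gamma, self.epsilon = lr, gamma, epsilon
        self.action_dim = action_dim
        self.Q_table = np.zeros((obs_dim, action_dim))

    def choose_action(self, obs):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.action_dim)
        return int(np.argmax(self.Q_table[obs, :]))

    def update(self, obs, action, reward, next_obs, done):
        Q_predict = self.Q_table[obs, action]
        # off-policy target: bootstrap from the greedy value of the next state
        Q_target = reward if done else reward + self.gamma * np.max(self.Q_table[next_obs, :])
        self.Q_table[obs, action] += self.lr * (Q_target - Q_predict)

    def save_model(self, path):
        np.save(path, self.Q_table)  # np.save appends '.npy' if the filename lacks it

    def load_model(self, path):
        self.Q_table = np.load(path)
```

With this interface, the `agent.save_model(model_path+'checkpoint')` call in utils.py produces the `checkpoint.npy` file that appears under `saved_model/20201124-201903/` in the patch.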
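The main.py changes are spread across several hunks, so the overall flow is easy to lose. The sketch below shows, under stated assumptions, how the pieces are meant to fit together: a CliffWalking-v0 environment, an episode loop that records per-episode reward and the 0.9/0.1 moving average, and results written to a timestamped folder the way params.py and utils.py do. It substitutes a random placeholder policy for the repository's QLearning agent and wrappers, and it assumes the pre-0.26 gym step API the repo targets, so it will not approach the optimal total reward of -13 noted in the README; treat it as a shape-of-the-loop sketch, not the project's main.py.

```python
import datetime
import os

import gym
import numpy as np

# same timestamp scheme as params.py: one result folder per run
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
RESULT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "result", SEQUENCE) + "/"


def train_sketch(max_episodes=10):
    env = gym.make("CliffWalking-v0")  # the repo additionally wraps this for turtle rendering
    rewards, ma_rewards = [], []
    for i_episode in range(1, max_episodes + 1):
        obs = env.reset()
        ep_reward, done = 0, False
        while not done:
            action = env.action_space.sample()  # placeholder policy; the repo uses QLearning
            obs, reward, done, _ = env.step(action)  # pre-0.26 gym API (4-tuple)
            ep_reward += reward
        rewards.append(ep_reward)
        # 0.9/0.1 exponential moving average, as in train()/test()
        ma_rewards.append(ep_reward if not ma_rewards else 0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: reward = %.1f' % (i_episode, ep_reward))
    os.makedirs(RESULT_PATH, exist_ok=True)  # create the timestamped folder and its parents
    np.save(RESULT_PATH + "rewards_train.npy", rewards)
    np.save(RESULT_PATH + "moving_average_rewards_train.npy", ma_rewards)


if __name__ == "__main__":
    train_sketch()
```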
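params.py defines `epsilon_start`, `epsilon_end` and `epsilon_decay` arguments, but the agent code that consumes them falls outside the hunks shown here. A schedule commonly paired with these three names is an exponential decay from `epsilon_start` toward `epsilon_end`; the formula below is an assumption about intent, not code taken from the patch.

```python
import math

def epsilon_by_step(step, epsilon_start=0.9, epsilon_end=0.1, epsilon_decay=200):
    """Assumed epsilon-greedy schedule: starts at epsilon_start and decays
    toward epsilon_end with time constant epsilon_decay (measured in steps)."""
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-step / epsilon_decay)

# epsilon_by_step(0) == 0.9, epsilon_by_step(200) ≈ 0.39, and it approaches 0.1 for large step
```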