update q-learning

JohnJim0816
2020-11-24 20:29:23 +08:00
parent 4cc12bf97f
commit cfe5a89fa7
12 changed files with 129 additions and 48 deletions

View File

@@ -16,4 +16,23 @@
![](assets/cliffwalking_2.png)
Since the shortest path from the start to the goal takes 13 steps and each step yields a reward of -1, an optimal policy should give a total reward of -13 per episode.
## Usage
Train:
```bash
python main.py
```
Eval:
```bash
python main.py --train 0
```
TensorBoard:
```bash
tensorboard --logdir logs
```
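The -13 optimum is easy to check by hand. Below is a minimal sketch (not part of this commit, and assuming the 2020-era gym API where `step` returns a 4-tuple) that walks the shortest path and prints the episode return:
```python
import gym

# Walk the shortest CliffWalking-v0 path by hand: up once, right 11 times, down once.
# Each of the 13 steps gives reward -1, so the episode return should be -13.
env = gym.make("CliffWalking-v0")  # actions: 0 up, 1 right, 2 down, 3 left
env.reset()
total_reward = 0
for action in [0] + [1] * 11 + [2]:
    _, reward, done, _ = env.step(action)
    total_reward += reward
print(total_reward, done)  # expected: -13 True
```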

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2020-10-07 20:48:29
LastEditTime: 2020-11-24 20:22:03
Description:
Environment:
'''
@@ -81,14 +81,11 @@ class QLearning(object):
self.Q_table[next_obs, :]) # Q-learning
self.Q_table[obs, action] += self.lr * (Q_target - Q_predict) # update the Q-value
def save(self):
def save_model(self,path):
'''Save the Q-table data to a file
'''
npy_file = './result/Q_table.npy'
np.save(npy_file, self.Q_table)
print(npy_file + ' saved.')
def load(self, npy_file='./result/Q_table.npy'):
np.save(path, self.Q_table)
def load_model(self, path):
'''Load data from a file into the Q-table
'''
self.Q_table = np.load(npy_file)
print(npy_file + 'loaded.')
self.Q_table = np.load(path)
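The renamed `save_model`/`load_model` now take an explicit path and simply persist the Q-table with NumPy. A self-contained sketch of the same pattern (the checkpoint path here is illustrative, not the repo's default):
```python
import numpy as np

Q_table = np.zeros((48, 4))                    # 48 states x 4 actions, as in CliffWalking-v0
np.save("Q_table_checkpoint.npy", Q_table)     # save_model: np.save(path, self.Q_table)
restored = np.load("Q_table_checkpoint.npy")   # load_model: self.Q_table = np.load(path)
assert restored.shape == (48, 4)
```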

View File

@@ -18,10 +18,14 @@ import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def env_init_1():
''' Initialize the CliffWalking-v0 environment
'''
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)
return env
def GridWorld(gridmap=None, is_slippery=False):
def env_init_2(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2020-10-07 21:05:33
LastEditTime: 2020-11-24 19:56:23
Description:
Environment:
'''
@@ -26,35 +26,23 @@ Environment:
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from env import CliffWalkingWapper, FrozenLakeWapper
from agent import QLearning
import os
import numpy as np
import argparse
import time
import matplotlib.pyplot as plt
def get_args():
'''Parameters for training the model
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.9,
type=float, help="reward discount factor")
parser.add_argument("--epsilon_start", default=0.9,
type=float,help="initial epsilon for the e-greedy policy")
parser.add_argument("--epsilon_end", default=0.1, type=float,help="final epsilon for the e-greedy policy")
parser.add_argument("--epsilon_decay", default=200, type=float,help="decay rate of epsilon in the e-greedy policy")
parser.add_argument("--policy_lr", default=0.1, type=float,help="learning rate")
parser.add_argument("--max_episodes", default=500, type=int,help="maximum number of training episodes")
config = parser.parse_args()
return config
from env import env_init_1
from params import get_args
from params import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
from utils import save_results,save_model
from plot import plot
def train(cfg):
# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)
'''# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)'''
env = env_init_1()
agent = QLearning(
obs_dim=env.observation_space.n,
action_dim=env.action_space.n,
@@ -84,7 +72,7 @@ def train(cfg):
break
steps.append(ep_steps)
rewards.append(ep_reward)
# compute the moving average of the reward
'''compute the moving average of the reward'''
if i_episode == 1:
MA_rewards.append(ep_reward)
else:
@@ -92,20 +80,17 @@ def train(cfg):
0.9*MA_rewards[-1]+0.1*ep_reward)
print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (i_episode, ep_steps,
ep_reward,agent.epsilon))
# render every 20 episodes to check the result
'''render every 20 episodes to check the result'''
if i_episode % 20 == 0:
render = True
else:
render = False
agent.save() # training finished, save the model
output_path = os.path.dirname(__file__)+"/result/"
# check whether the folder exists
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards_train.npy", rewards)
np.save(output_path+"MA_rewards_train.npy", MA_rewards)
np.save(output_path+"steps_train.npy", steps)
print('Complete training')
save_model(agent,model_path=SAVED_MODEL_PATH)
'''save the rewards and related results'''
save_results(rewards,MA_rewards,tag='train',result_path=RESULT_PATH)
plot(rewards)
plot(MA_rewards,ylabel='moving_average_rewards_train')
def test(cfg):
@@ -144,12 +129,23 @@ def test(cfg):
MA_rewards.append(
0.9*MA_rewards[-1]+0.1*ep_reward)
print('Episode %s: steps = %s , reward = %.1f' % (i_episode, ep_steps, ep_reward))
plt.plot(MA_rewards)
plt.show()
print('Complete training')
save_model(agent,model_path=SAVED_MODEL_PATH)
'''save the rewards and related results'''
save_results(rewards,MA_rewards,tag='train',result_path=RESULT_PATH)
plot(rewards)
plot(MA_rewards,ylabel='moving_average_rewards_train')
def main():
cfg = get_args()
# train(cfg)
test(cfg)
if __name__ == "__main__":
main()
cfg = get_args()
if cfg.train:
train(cfg)
eval(cfg)
else:
model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
eval(cfg,saved_model_path=model_path)
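The `MA_rewards` curve above is an exponential moving average of the episode returns (0.9 times the previous value plus 0.1 times the new return). A self-contained sketch of that recurrence:
```python
def moving_average(rewards, beta=0.9):
    # ma[0] = r[0]; ma[t] = beta * ma[t-1] + (1 - beta) * r[t]
    ma = []
    for r in rewards:
        ma.append(r if not ma else beta * ma[-1] + (1 - beta) * r)
    return ma

print(moving_average([-120, -60, -25, -13]))  # approximately [-120, -114.0, -105.1, -95.9]
```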

View File

@@ -0,0 +1,36 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-24 19:45:58
LastEditor: John
LastEditTime: 2020-11-24 19:53:13
Description:
Environment:
'''
import argparse
import datetime
import os
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
def get_args():
'''Parameters for training the model
'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 means train, 0 means eval only
parser.add_argument("--gamma", default=0.9,
type=float, help="reward discount factor")
parser.add_argument("--epsilon_start", default=0.9,
type=float,help="initial epsilon for the e-greedy policy")
parser.add_argument("--epsilon_end", default=0.1, type=float,help="final epsilon for the e-greedy policy")
parser.add_argument("--epsilon_decay", default=200, type=float,help="decay rate of epsilon in the e-greedy policy")
parser.add_argument("--policy_lr", default=0.1, type=float,help="learning rate")
parser.add_argument("--max_episodes", default=500, type=int,help="maximum number of training episodes")
config = parser.parse_args()
return config
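A self-contained sketch of how the new `--train` switch behaves (parsing a simulated command line, the way `python main.py --train 0` would be seen by `get_args`):
```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int)   # 1 = train then eval, 0 = eval only
cfg = parser.parse_args(["--train", "0"])             # simulates `python main.py --train 0`
print("train" if cfg.train else "eval only")          # -> eval only
```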

Binary file not shown.

Binary file not shown.

Binary file not shown.

codes/Q-learning/utils.py Normal file
View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-24 19:50:18
LastEditor: John
LastEditTime: 2020-11-24 20:20:46
Description:
Environment:
'''
import os
import numpy as np
def save_results(rewards,moving_average_rewards,tag='train',result_path='./result'):
'''Save the rewards and related results
'''
if not os.path.exists(result_path): # check whether the folder exists
os.mkdir(result_path)
np.save(result_path+'rewards_'+tag+'.npy', rewards)
np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards)
print('results saved!')
def save_model(agent,model_path='./saved_model'):
if not os.path.exists(model_path): # check whether the folder exists
os.mkdir(model_path)
agent.save_model(model_path+'checkpoint')
print('model saved')
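A hedged usage sketch for `save_results` above (the `result_path` here is illustrative; in this commit it normally comes from `params.RESULT_PATH`):
```python
from utils import save_results

rewards = [-120.0, -60.0, -25.0, -13.0]
moving_average_rewards = [-120.0, -114.0, -105.1, -95.9]
save_results(rewards, moving_average_rewards, tag='train', result_path='./result/')
# writes ./result/rewards_train.npy and ./result/moving_average_rewards_train.npy
```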