Merge branch 'master' of https://github.com/datawhalechina/easy-rl
codes/.gitignore
@@ -1,5 +0,0 @@
-.DS_Store
-.ipynb_checkpoints
-__pycache__
-.vscode
-test.py
@@ -2,13 +2,13 @@

## Algorithm Overview

-DQN is an optimization and extension of Q-learning: Q-learning stores value information in a finite Q-table, while DQN replaces the Q-table with a neural network, which suits high-dimensional problems much better. For the background, see [EasyRL-DQN](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).
+DQN is an optimization and extension of Q-learning: Q-learning stores value information in a finite Q-table, while DQN replaces the Q-table with a neural network, which suits high-dimensional problems much better. For the background, see [Datawhale notes on Hung-yi Lee's lectures: Q-learning](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).

On the paper side there are two main references: the 2013 Google DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the later paper the same team published in Nature, [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf). The latter adds a target Q-network at the algorithm level and is therefore also called Nature DQN.

Nature DQN uses two Q-networks: a current Q-network 𝑄 that selects actions and whose parameters are updated, and a target Q-network 𝑄′ that computes the target Q-values. The parameters of the target network are not updated at every iteration; instead they are copied from the current network 𝑄 every so often (a delayed update), which reduces the correlation between the target Q-values and the current Q-values.

Note that the two Q-networks have exactly the same architecture; only then can the parameters be copied across. Compared with [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), Nature DQN only adds a target Q-network of the same structure for computing target Q-values, and is otherwise essentially identical. For more detail see [Reinforcement Learning (9): from Deep Q-Learning to Nature DQN](https://www.cnblogs.com/pinard/p/9756075.html).

https://blog.csdn.net/JohnJim0/article/details/109557173)
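The delayed target update described above is the only structural addition of Nature DQN. As a rough illustration (this is not the repository's code; the layer sizes and loop are arbitrary), the target network is simply a same-architecture copy that is refreshed every C optimization steps:

```python
import copy
import torch.nn as nn

# Illustrative sketch of the Nature DQN delayed target update (layer sizes are arbitrary).
policy_net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))  # current Q-network
target_net = copy.deepcopy(policy_net)  # identical architecture, so the weights can be copied

TARGET_UPDATE = 4  # copy interval C (cf. target_update in the training script below)

for step in range(1, 101):
    # ... compute TD targets with target_net and take a gradient step on policy_net ...
    if step % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())  # delayed (hard) parameter copy
```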
@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
-LastEditTime: 2022-02-10 06:17:41
+LastEditTime: 2022-06-18 20:12:20
Discription: train CartPole-v1 with Nature DQN
'''
import sys
@@ -17,6 +17,9 @@ sys.path.append(parent_path)  # add the parent path to sys.path
import gym
import torch
import datetime
+import torch.nn as nn
+import torch.nn.functional as F

from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from dqn import DQN
@@ -33,18 +36,18 @@ class DQNConfig:
        self.env_name = env_name  # environment name
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
-        self.train_eps = 200  # number of training episodes
+        self.train_eps = 300  # number of training episodes
-        self.test_eps = 30  # number of test episodes
+        self.test_eps = 20  # number of test episodes
        # hyperparameters
-        self.gamma = 0.95  # discount factor
+        self.gamma = 0.99  # discount factor
-        self.epsilon_start = 0.90  # initial epsilon of the e-greedy policy
+        self.epsilon_start = 0.99  # initial epsilon of the e-greedy policy
-        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
+        self.epsilon_end = 0.005  # final epsilon of the e-greedy policy
        self.epsilon_decay = 500  # decay rate of epsilon in the e-greedy policy
        self.lr = 0.0001  # learning rate
        self.memory_capacity = 100000  # capacity of the replay buffer
-        self.batch_size = 64  # batch size for mini-batch SGD
+        self.batch_size = 128  # batch size for mini-batch SGD
        self.target_update = 4  # update frequency of the target network
-        self.hidden_dim = 256  # hidden dimension of the network
+        self.hidden_dim = 512  # hidden dimension of the network
class PlotConfig:
    ''' plotting-related settings
    '''
@@ -60,6 +63,22 @@ class PlotConfig:
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures

+class MLP(nn.Module):
+    def __init__(self, n_states, n_actions, hidden_dim=128):
+        """ Initialize the Q-network as a fully connected network
+            n_states: number of input features, i.e. the state dimension
+            n_actions: the output action dimension
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(n_states, hidden_dim)   # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim) # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer
+
+    def forward(self, x):
+        # activation function for each layer
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+
def env_agent_config(cfg, seed=1):
    ''' create the environment and the agent
    '''
@@ -68,7 +87,8 @@ def env_agent_config(cfg, seed=1):
    env.seed(seed)  # set the random seed
    n_states = env.observation_space.shape[0]  # state dimension
    n_actions = env.action_space.n  # action dimension
-    agent = DQN(n_states, n_actions, cfg)  # create the agent
+    model = MLP(n_states, n_actions)
+    agent = DQN(n_actions, model, cfg)  # create the agent
    return env, agent

def train(cfg, env, agent):
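The configuration hunk above retunes the ε-greedy schedule (epsilon_start, epsilon_end, epsilon_decay). As a quick, illustrative check of what that schedule does, assuming the exponential decay epsilon_end + (epsilon_start - epsilon_end) * exp(-t / epsilon_decay), which is the form used in the `choose_action` shown later on this page:

```python
import math

epsilon_start, epsilon_end, epsilon_decay = 0.99, 0.005, 500

def epsilon(frame_idx: int) -> float:
    # Exponential decay from epsilon_start toward epsilon_end.
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-frame_idx / epsilon_decay)

for t in [0, 500, 1000, 2000, 5000]:
    print(t, round(epsilon(t), 3))  # 0.99 -> ~0.37 -> ~0.14 -> ~0.02 -> ~0.005
```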
@@ -1,184 +0,0 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
import matplotlib.pyplot as plt


class DQN:
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=400000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = self.epsilon_min / 500000

        self.batch_size = 32
        self.train_start = 1000
        self.state_size = self.env.observation_space.shape[0]*4
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.00025

        self.evaluation_model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        model = Sequential()
        model.add(Dense(128*2, input_dim=self.state_size,activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mean_squared_error', optimizer=optimizers.RMSprop(lr=self.learning_rate,decay=0.99,epsilon=1e-6))
        return model

    def choose_action(self, state, steps):
        if steps > 50000:
            if self.epsilon > self.epsilon_min:
                self.epsilon -= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.evaluation_model.predict(state)[0])

    def remember(self, cur_state, action, reward, new_state, done):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = (cur_state, action, reward, new_state, done)
        self.memory.extend([transition])

        self.memory_counter += 1

    def replay(self):
        if len(self.memory) < self.train_start:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        update_input = np.zeros((self.batch_size, self.state_size))
        update_target = np.zeros((self.batch_size, self.action_size))

        for i in range(self.batch_size):
            state, action, reward, new_state, done = mini_batch[i]
            target = self.evaluation_model.predict(state)[0]

            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])

            update_input[i] = state
            update_target[i] = target

        self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)

    def target_train(self):
        self.target_model.set_weights(self.evaluation_model.get_weights())
        return

    def visualize(self, reward, episode):
        plt.plot(episode, reward, 'ob-')
        plt.title('Average reward each 100 episode')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()

    def transform(self,state):
        if state.shape[1]==512:
            return state
        a=[np.binary_repr(x,width=8) for x in state[0]]
        res=[]
        for x in a:
            res.extend([x[:2],x[2:4],x[4:6],x[6:]])
        res=[int(x,2) for x in res]
        return np.array(res)

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def main():
    # env = gym.make('Breakout-ram-v0')
    env = gym.make('Breakout-ram-v0')
    env = env.unwrapped

    print(env.action_space)
    print(env.observation_space.shape[0])
    print(env.observation_space.high)
    print(env.observation_space.low)

    #print(env.observation_space.shape)

    episodes = 5000
    trial_len = 10000

    tmp_reward=0
    sum_rewards = 0
    n_success = 0
    total_steps = 0

    graph_reward = []
    graph_episodes = []
    time_record = []

    dqn_agent = DQN(env=env)
    for i_episode in range(episodes):
        start_time = time.time()
        total_reward = 0
        cur_state = env.reset().reshape(1,128)
        cur_state = dqn_agent.transform(cur_state).reshape(1,128*4)/4
        i_step = 0
        for step in range(trial_len):
            #env.render()
            i_step += 1
            action = dqn_agent.choose_action(cur_state, total_steps)
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, 128)
            new_state = dqn_agent.transform(new_state).reshape(1,128*4)/4
            total_reward += reward
            sum_rewards += reward
            tmp_reward += reward
            if reward > 0: #Testing whether it is good.
                reward = 1

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            if total_steps > 10000:
                if total_steps % 4 == 0:
                    dqn_agent.replay()
                if total_steps % 5000 == 0:
                    dqn_agent.target_train()

            cur_state = new_state
            total_steps += 1
            if done:
                env.reset()
                break
        if (i_episode+1) % 100 == 0:
            graph_reward.append(sum_rewards/100)
            graph_episodes.append(i_episode+1)
            sum_rewards = 0
            print("Episode ", i_episode+1, " Reward: ")
            print(graph_reward[-1])
        end_time = time.time()
        time_record.append(end_time-start_time)
        print("NOW in episode: " + str(i_episode))
        print("Time cost: " + str(end_time-start_time))
        print("Reward: ", tmp_reward)
        print("Step:", i_step)
        tmp_reward = 0
    print("Reward: ")
    print(graph_reward)
    print("Episode: ")
    print(graph_episodes)
    print("Average_time: ")
    print(sum(time_record)/5000)
    dqn_agent.visualize(graph_reward, graph_episodes)

if __name__ == '__main__':
    main()
@@ -1,175 +0,0 @@
The environments in the previous projects all had discrete actions, but in practice many environments have continuous actions, for example [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) in OpenAI Gym, which is an inverted-pendulum problem. We first give a brief description of this environment.

## Pendulum-v0 in Brief

If CartPole-v0 is the classic introductory environment for discrete actions, then Pendulum-v0 is its continuous-action counterpart. As shown below, we apply a torque to swing the pendulum up and keep it upright.

<img src="../../easy_rl_book/res/ch12/assets/pendulum_1.png" alt="image-20210915161550713" style="zoom:50%;" />

The state of this environment has three dimensions. Let $\theta$ be the clockwise angle of the pendulum from the upright direction, with $\theta \in [-\pi,\pi]$; the state is then $[\cos\theta,\sin\theta,\dot{\theta}]$, i.e. the angle and the angular velocity. The action is a torque between -2 and 2, a continuous quantity, so this environment cannot be solved with discrete-action algorithms such as DQN. The reward follows from the underlying physics and is given by:
$$
-\left(\theta^{2}+0.1\,\dot{\theta}^{2}+0.001\,\text{action}^{2}\right)
$$
For each step the lowest possible reward is $-\left(\pi^{2}+0.1 \times 8^{2}+0.001 \times 2^{2}\right)= -16.2736044$ and the highest is 0. As with CartPole-v0, an optimal policy could keep an episode going forever, so to make training manageable the maximum number of steps per episode is set to 200.
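A quick arithmetic check of that per-step bound (Pendulum-v0 caps the angular velocity at 8 and the torque at 2; this snippet is only a sanity check, not part of the project code):

```python
import math

# Worst case: theta = pi, |theta_dot| = 8, |action| = 2
worst_reward = -(math.pi ** 2 + 0.1 * 8 ** 2 + 0.001 * 2 ** 2)
print(worst_reward)  # -16.2736044...
```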
## The Basic DDPG Interface

We again use the idea of a common interface: we work through the pseudocode and implement the DDPG training pattern, as follows:

> Initialize the critic network $Q\left(s, a \mid \theta^{Q}\right)$ and the actor network $\mu\left(s \mid \theta^{\mu}\right)$ with weights $\theta^{Q}$ and $\theta^{\mu}$
>
> Initialize the target networks $Q'$ and $\mu'$ and copy the weights: $\theta^{Q^{\prime}} \leftarrow \theta^{Q}, \theta^{\mu^{\prime}} \leftarrow \theta^{\mu}$
>
> Initialize the replay buffer $R$
>
> For each of $M$ episodes:
>
> * Initialize a random process (noise) $\mathcal{N}$ for action exploration
>
> * Get the initial state $s_1$
>
> Loop over $T$ time steps; for each step $t$:
>
> * Select an action according to the current policy plus exploration noise: $a_{t}=\mu\left(s_{t} \mid \theta^{\mu}\right)+\mathcal{N}_{t}$
> * Execute $a_t$ and observe the reward $r_t$ and the next state $s_{t+1}$
> * Store the transition $\left(s_{t}, a_{t}, r_{t}, s_{t+1}\right)$ in the replay buffer $R$
> * (Policy update) Sample a random mini-batch of transitions from $R$
> * (Policy update) Compute the target Q-values $y_{i}=r_{i}+\gamma Q^{\prime}\left(s_{i+1}, \mu^{\prime}\left(s_{i+1} \mid \theta^{\mu^{\prime}}\right) \mid \theta^{Q^{\prime}}\right)$
> * (Policy update) Update the critic by gradient descent on the loss $L=\frac{1}{N} \sum_{i}\left(y_{i}-Q\left(s_{i}, a_{i} \mid \theta^{Q}\right)\right)^{2}$ with respect to $\theta^{Q}$
> * (Policy update) Update the actor using the sampled policy gradient: $\left.\left.\nabla_{\theta^{\mu}} J \approx \frac{1}{N} \sum_{i} \nabla_{a} Q\left(s, a \mid \theta^{Q}\right)\right|_{s=s_{i}, a=\mu\left(s_{i}\right)} \nabla_{\theta^{\mu}} \mu\left(s \mid \theta^{\mu}\right)\right|_{s_{i}}$
> * (Policy update) Update the target networks: $\theta^{Q^{\prime}} \leftarrow \tau \theta^{Q}+(1-\tau) \theta^{Q^{\prime}}$, $\theta^{\mu^{\prime}} \leftarrow \tau \theta^{\mu}+(1-\tau) \theta^{\mu^{\prime}}$

The code is as follows:

```python
ou_noise = OUNoise(env.action_space)  # action noise
rewards = []  # record rewards
ma_rewards = []  # record moving-average rewards
for i_ep in range(cfg.train_eps):
    state = env.reset()
    ou_noise.reset()
    done = False
    ep_reward = 0
    i_step = 0
    while not done:
        i_step += 1
        action = agent.choose_action(state)
        action = ou_noise.get_action(action, i_step)
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward
        agent.memory.push(state, action, reward, next_state, done)
        agent.update()
        state = next_state
    if (i_ep+1) % 10 == 0:
        print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
    rewards.append(ep_reward)
    if ma_rewards:
        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
    else:
        ma_rewards.append(ep_reward)
```

Compared with DQN, DDPG mainly adds two things: noise applied to the actions, and the soft update of the target networks, i.e. the last step above.
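For reference, the soft update mentioned above can be written as a small helper. This is only an illustrative sketch; the project's actual soft-update code appears inside the `update` function later in this section:

```python
import torch

def soft_update(target_net: torch.nn.Module, net: torch.nn.Module, tau: float = 1e-2) -> None:
    # Polyak averaging: target <- tau * online + (1 - tau) * target
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.mul_(1.0 - tau).add_(tau * param)

# Example usage: keep a slowly moving copy of a small network.
net = torch.nn.Linear(3, 2)
target = torch.nn.Linear(3, 2)
target.load_state_dict(net.state_dict())
soft_update(target, net, tau=0.01)
```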
## Ornstein-Uhlenbeck Noise

OU noise suits inertial systems, especially when the time discretization is fine-grained. It is a stochastic process; skipping the derivation, the update is:
$$
x(t+\Delta t)=x(t)-\theta(x(t)-\mu) \Delta t+\sigma W_t
$$
where $W_t$ is normally distributed. The implementation is as follows:

```python
class OUNoise(object):
    '''Ornstein–Uhlenbeck noise
    '''
    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu  # OU noise parameter
        self.theta = theta  # OU noise parameter
        self.sigma = max_sigma  # OU noise parameter
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.n_actions = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()
    def reset(self):
        self.obs = np.ones(self.n_actions) * self.mu
    def evolve_obs(self):
        x = self.obs
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
        self.obs = x + dx
        return self.obs
    def get_action(self, action, t=0):
        ou_obs = self.evolve_obs()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)  # sigma decays gradually
        return np.clip(action + ou_obs, self.low, self.high)  # add the noise to the action, then clip
```

## The DDPG Algorithm

The DDPG agent again provides two main functions, choosing an action and updating the policy. First, action selection:

```python
def choose_action(self, state):
    state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
    action = self.actor(state)
    return action.detach().cpu().numpy()[0, 0]
```

Since DDPG takes its action directly from the actor network, no $\epsilon$-greedy policy is needed here. The update function also differs slightly from DQN's and adds the soft update:

```python
def update(self):
    if len(self.memory) < self.batch_size:  # do not update until the buffer holds at least one batch
        return
    # sample a mini-batch of transitions from the replay memory
    state, action, reward, next_state, done = self.memory.sample(self.batch_size)
    # convert to tensors
    state = torch.FloatTensor(state).to(self.device)
    next_state = torch.FloatTensor(next_state).to(self.device)
    action = torch.FloatTensor(action).to(self.device)
    reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
    done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)

    policy_loss = self.critic(state, self.actor(state))
    policy_loss = -policy_loss.mean()
    next_action = self.target_actor(next_state)
    target_value = self.target_critic(next_state, next_action.detach())
    expected_value = reward + (1.0 - done) * self.gamma * target_value
    expected_value = torch.clamp(expected_value, -np.inf, np.inf)

    value = self.critic(state, action)
    value_loss = nn.MSELoss()(value, expected_value.detach())

    self.actor_optimizer.zero_grad()
    policy_loss.backward()
    self.actor_optimizer.step()
    self.critic_optimizer.zero_grad()
    value_loss.backward()
    self.critic_optimizer.step()
    # soft update of the target networks
    for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - self.soft_tau) +
            param.data * self.soft_tau
        )
    for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - self.soft_tau) +
            param.data * self.soft_tau
        )
```

## Analysis of the Results

With the algorithm implemented, let's first look at the training curve:

![](assets/image-20210916000302845.png)

The algorithm does converge overall, but it still fluctuates quite a lot at steady state, so there is room for improvement; given the author's limited time, this is only meant as a basic working demonstration, and interested readers are encouraged to tune the algorithm further. Now the test results:

![](assets/image-20210916000428743.png)

The average test reward is around -150, whereas the average steady-state reward during training was around -300; the difference comes from dropping the OU noise at test time.
@@ -1,208 +0,0 @@
Before working through this project, it helps to review the previous hands-on project, using Q-learning to solve the cliff-walking problem. This project implements the DQN algorithm to solve the cart-pole problem, using the [CartPole-v0](https://datawhalechina.github.io/easy-rl/#/chapter7/project2?id=cartpole-v0) environment from OpenAI Gym; as before, we start with a brief description of the environment.

## CartPole-v0 in Brief

CartPole-v0 is a classic introductory environment. As shown below, it balances a vertical pole by pushing the cart to the left (action = 0) or to the right (action = 1); after each action, a reward of +1 is given if the pole is still balanced, otherwise the pole falls over and the episode ends.

![cp](assets/cartpole.gif)

Let's inspect some of the environment's parameters by running the following code:

```python
import gym
env = gym.make('CartPole-v0')  # create the environment
env.seed(1)  # random seed
n_states = env.observation_space.shape[0]  # state dimension
n_actions = env.action_space.n  # action dimension
state = env.reset()  # initialize the environment
print(f"state dimension: {n_states}, action dimension: {n_actions}")
print(f"initial state: {state}")
```

which gives:

```bash
state dimension: 4, action dimension: 2
initial state: [ 0.03073904  0.00145001 -0.03088818 -0.03131252]
```

The state has four dimensions: the cart position, the cart velocity, the pole angle and the velocity of the pole tip. There are two discrete actions, left and right. In theory, under an optimal policy the pole stays balanced forever, i.e. an episode would have infinitely many steps, which is inconvenient for training, so the environment caps each episode at 200 steps; ideally, a reward of 200 per episode therefore means training is complete.
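To get a feel for that 200-step cap, here is a short illustrative rollout with a random policy (using the same old gym API as the rest of this page; it is not part of the project code):

```python
import gym

env = gym.make('CartPole-v0')
state = env.reset()
done, ep_reward, steps = False, 0.0, 0
while not done:
    state, reward, done, _ = env.step(env.action_space.sample())  # pick a random action
    ep_reward += reward
    steps += 1
print(steps, ep_reward)  # a random policy typically survives only a few dozen steps, far below the 200-step cap
```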
## The Basic DQN Interface

With the environment covered, we reuse the idea of a common interface and analyse the pseudocode to work out DQN's basic training pattern, as well as the ingredients it needs, such as which networks and modules to build. The commonly used DQN pseudocode is as follows:

> Initialize the replay memory $D$ with capacity $N$
>
> Initialize the state-action value function, i.e. the $Q$ network, with random weights $\theta$
>
> Initialize the target state-action value function, i.e. the $\hat{Q}$ network, with weights $\theta^-$ and set $\theta^-=\theta$
>
> For each of $M$ episodes:
>
> * Initialize the environment and get the initial state $s_1$
> * Loop over $T$ time steps; for each step $t$
> * Select an action $a_t$ with the $\epsilon$-greedy policy
> * The environment returns the current reward $r_t$ and the next state $s_{t+1}$ for $a_t$
> * Update the current state to $s_{t+1}$
> * Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay memory $D$
> * (Policy update) Sample a random mini-batch of transitions from $D$
> * (Policy update) Compute the target Q-values $y_{j}=\left\{\begin{array}{cc}r_{j} & \text{if the episode terminates at step } j+1 \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(\phi_{j+1}, a^{\prime} ; \theta^{-}\right) & \text{otherwise}\end{array}\right.$
> * (Policy update) Perform gradient descent on the loss $\left(y_{j}-Q\left(\phi_{j}, a_{j} ; \theta\right)\right)^{2}$ with respect to the parameters $\theta$
> * (Policy update) Every $C$ steps reset $\hat{Q}=Q$

In code this looks like:

```python
rewards = []  # record rewards
ma_rewards = []  # record moving-average rewards
for i_ep in range(cfg.train_eps):
    state = env.reset()
    done = False
    ep_reward = 0
    while True:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        ep_reward += reward
        agent.memory.push(state, action, reward, next_state, done)
        state = next_state
        agent.update()
        if done:
            break
    if (i_ep+1) % cfg.target_update == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
    if (i_ep+1) % 10 == 0:
        print('Episode: {}/{}, Reward: {}'.format(i_ep+1, cfg.train_eps, ep_reward))
    rewards.append(ep_reward)
    # save ma_rewards
    if ma_rewards:
        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
    else:
        ma_rewards.append(ep_reward)
```

As you can see, DQN's training pattern follows the same recipe as most reinforcement learning algorithms. Compared with classic Q-learning, however, DQN uses a neural network instead of the Q-table so it can store more information, and because a neural network is used we generally optimize the Q-value predictions with stochastic gradient descent. In addition there is a replay memory, and two networks are used: the target network and the current network.
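To make the role of the target network and the terminal-state case concrete, here is a small illustrative computation of the TD target that the `update` function later in this page uses (the numbers are made up, not from a real run):

```python
import torch

# Illustrative TD-target computation for a batch of 3 transitions.
reward     = torch.tensor([1.0, 1.0, 1.0])
next_q_max = torch.tensor([30.0, 25.0, 0.0])  # max_a' Q_target(s', a') from the target network
done       = torch.tensor([0.0, 0.0, 1.0])    # 1 marks a terminal transition
gamma = 0.99

# y = r                         if the episode terminates at the next step
# y = r + gamma * max_a' Q'     otherwise  (implemented with a (1 - done) mask)
expected_q = reward + gamma * next_q_max * (1 - done)
print(expected_q)  # tensor([30.7000, 25.7500,  1.0000])
```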
## The Replay Buffer

As the pseudocode shows, the replay buffer has two jobs: it stores each collected transition (state, action, reward, next state) within a fixed capacity, and when updating the policy it supplies a randomly sampled mini-batch of transitions. So we can define a ReplayBuffer class with two methods, push and sample, for storing and sampling.

```python
import random
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity  # capacity of the replay buffer
        self.buffer = []  # the buffer itself
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        ''' the buffer is a queue: once the capacity is exceeded, the oldest transitions are overwritten
        '''
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  # randomly sample a mini-batch of transitions
        state, action, reward, next_state, done = zip(*batch)  # unzip into states, actions, etc.
        return state, action, reward, next_state, done
    def __len__(self):
        ''' return the number of stored transitions
        '''
        return len(self.buffer)
```

## The Q-Network

In DQN we replace the Q-table with a neural network, which lets us store more Q-values and represent more sophisticated policies for complex environments. Here we use a three-layer perceptron, i.e. a fully connected network:

```python
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=128):
        """ Initialize the Q-network as a fully connected network
            input_dim: number of input features, i.e. the state dimension
            output_dim: the output action dimension
        """
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)   # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, output_dim)  # output layer

    def forward(self, x):
        # activation function for each layer
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
```

Anyone who has studied deep learning will find this network very familiar. In reinforcement learning the input is generally a state and the output an action: if there are two actions in total, the action dimension here is 2 and the possible outputs are 0 or 1, and we usually use ReLU as the activation function. The network architecture can be changed as needed; for example, if the input were an image we could use a convolutional neural network (CNN) here, as sketched below.
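A minimal illustrative sketch of such a CNN Q-network (the 4 x 84 x 84 input, i.e. four stacked grayscale frames, is an assumption for the example and is not specified anywhere in this project):

```python
import torch
import torch.nn as nn

class CNNQNet(nn.Module):
    # Illustrative convolutional Q-network for image inputs (shapes are assumptions).
    def __init__(self, in_channels: int = 4, n_actions: int = 2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        )
        self.head = nn.Sequential(nn.Flatten(), nn.Linear(64 * 7 * 7, 512), nn.ReLU(), nn.Linear(512, n_actions))

    def forward(self, x):
        return self.head(self.conv(x / 255.0))  # scale pixels, then output one Q-value per action

q_net = CNNQNet()
print(q_net(torch.zeros(1, 4, 84, 84)).shape)  # torch.Size([1, 2])
```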
## The DQN Algorithm

As in the earlier hands-on projects, the DQN agent also provides two functions, choosing actions and updating the policy. First, action selection:

```python
def choose_action(self, state):
    '''choose an action
    '''
    self.frame_idx += 1
    if random.random() > self.epsilon(self.frame_idx):
        with torch.no_grad():
            state = torch.tensor([state], device=self.device, dtype=torch.float32)
            q_values = self.policy_net(state)
            action = q_values.max(1)[1].item()  # choose the action with the largest Q-value
    else:
        action = random.randrange(self.n_actions)
    return action
```

As you can see, this is the same as in the Q-learning algorithm: both use the $\epsilon$-greedy policy; the only difference is that with a neural network we need tools such as Torch or TensorFlow to handle the data.

The policy-update step of DQN is a bit more involved and has three main parts: random sampling, computing the expected Q-values, and gradient descent, as follows:

```python
def update(self):
    if len(self.memory) < self.batch_size:  # do not update until the buffer holds at least one batch
        return
    # sample a mini-batch of transitions from the replay memory
    state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
        self.batch_size)
    # convert to tensors
    state_batch = torch.tensor(
        state_batch, device=self.device, dtype=torch.float)
    action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
    reward_batch = torch.tensor(
        reward_batch, device=self.device, dtype=torch.float)
    next_state_batch = torch.tensor(
        next_state_batch, device=self.device, dtype=torch.float)
    done_batch = torch.tensor(np.float32(done_batch), device=self.device)
    q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # compute Q(s_t, a) for the current states
    next_q_values = self.target_net(next_state_batch).max(1)[0].detach()  # compute the Q-values of the next states
    # compute the expected Q-values; for a terminal state done_batch[0]=1 and expected_q_value equals the reward
    expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
    loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
    # optimize the model
    self.optimizer.zero_grad()
    loss.backward()
    for param in self.policy_net.parameters():  # clip to avoid exploding gradients
        param.grad.data.clamp_(-1, 1)
    self.optimizer.step()
```

## Analysis of the Results

With the code complete, let's look at DQN's training performance; the curve is as follows:

![](assets/train_rewards_curve_cn.png)

The figure shows that the algorithm converges at around episode 60 and then stays near the optimal reward of 200, with slight fluctuations because we keep a small exploration rate even after convergence, namely epsilon_end=0.01. Now we can load the model and check the test performance:

![](assets/eval_rewards_curve_cn.png)

We tested for 30 episodes and every episode stayed around 200, which shows the model has learned quite well!
@@ -1,165 +0,0 @@
# Solving Cliff Walking with Q-Learning

Reinforcement learning also has great potential for motion planning, and many suitable simulation environments already exist, from small mazes up to near-realistic autonomous-driving environments such as [CARLA](http://carla.org/). Here we use the CliffWalking-v0 environment from [OpenAI Gym](https://gym.openai.com/) to get hands-on with the Q-learning algorithm.

## The CliffWalking-v0 Environment

Let's briefly introduce the environment. Its name is Cliff Walking, and it is a maze-style problem. As shown below, in a 4 x 12 grid the agent starts from the bottom-left cell and must reach the bottom-right cell; at each step it can move one cell up, down, left or right, and every move yields a reward of -1.

<div align=center>
<img src="assets/cliffwalking_1.png" alt="cliffwalking_1" style="zoom:50%;" />
</div>
Between the start and the goal lies a cliff, the cells numbered 37~46, and the agent's movement is constrained as follows:

* the agent cannot move off the grid; if an action would take it outside, it stays in place but still receives the -1 reward
* if the agent "falls off the cliff", it is sent back to the start immediately and receives a reward of -100
* when the agent reaches the goal the episode ends, and the episode return is the sum of the per-step rewards

Our goal is to reach the goal in as few steps as possible. It is easy to see that the agent needs at least 13 steps to get from start to goal, so with an optimally converged algorithm the return per episode should be -13; working out the expected return by hand like this also makes it easier to judge whether the algorithm has converged and to adjust it accordingly.
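A back-of-the-envelope check of that figure (not part of the original project code):

```python
# Shortest safe path from the bottom-left start: 1 step up, 11 steps right, 1 step down,
# each worth -1, so the optimal return is -13.
optimal_steps = 1 + 11 + 1
print(optimal_steps, -optimal_steps)  # 13 -13
```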
Now we can define the environment in code:

```python
import gym  # import the gym module
from envs.gridworld_env import CliffWalkingWapper  # import the custom wrapper

env = gym.make('CliffWalking-v0')  # create the environment
env = CliffWalkingWapper(env)  # wrap the environment
```

Here we redefine the environment with a wrapper; this does not change how the environment should be understood, and interested readers can look at the corresponding code. Because gym environments are well encapsulated, using this environment only requires calling gym.make with the environment name. We can then check the state and action dimensions:

```python
n_states = env.observation_space.n  # state dimension
n_actions = env.action_space.n  # action dimension
print(f"state dimension: {n_states}, action dimension: {n_actions}")
```

which prints:

```bash
state dimension: 48, action dimension: 4
```

There are 48 states, here encoded as the index of the grid cell the agent currently occupies, and 4 actions, 0, 1, 2, 3, corresponding to up, down, left and right. We can also reset the environment and print the current state:

```python
state = env.reset()
print(state)
```

which shows:

```bash
36
```

In other words, the agent's current state, i.e. the index of the cell it occupies, is 36, which is exactly the start cell described above.

## The Basic Reinforcement Learning Interface

By interface we mean the general training pattern of reinforcement learning, the recipe that most algorithm pseudocode follows:

* initialize the environment and the agent
* for each episode, the agent chooses an action
* the environment receives the action and returns the next state and the reward
* the agent updates its policy (learns)
* after the algorithm converges over many episodes, save the model and do follow-up analysis, plotting, etc.

The code is as follows:

```python
env = gym.make('CliffWalking-v0')  # create the environment
env = CliffWalkingWapper(env)  # wrap the environment
env.seed(1)  # set the random seed
n_states = env.observation_space.n  # state dimension
n_actions = env.action_space.n  # action dimension
agent = QLearning(n_states, n_actions, cfg)  # cfg stores the algorithm's hyperparameters
for i_ep in range(cfg.train_eps):  # cfg.train_eps is the maximum number of training episodes
    ep_reward = 0  # record the reward of each episode
    state = env.reset()  # reset the environment
    while True:
        action = agent.choose_action(state)  # the algorithm chooses an action
        next_state, reward, done, _ = env.step(action)  # the environment returns the reward and the next state
        agent.update(state, action, reward, next_state, done)  # the algorithm updates
        state = next_state  # update the state
        ep_reward += reward
        if done:  # stop early at a terminal state
            break
```

We usually record and analyse how the reward evolves, so on top of this interface we add variables that record each episode's reward; and because the rewards obtained during RL training can oscillate, we also use a moving average to show the trend, as follows:

```python
rewards = []
ma_rewards = []  # moving-average rewards
for i_ep in range(cfg.train_eps):
    ep_reward = 0  # record the reward of each episode
    state = env.reset()  # reset the environment and start a new episode
    while True:
        action = agent.choose_action(state)  # choose an action with the algorithm
        next_state, reward, done, _ = env.step(action)  # one interaction with the environment
        agent.update(state, action, reward, next_state, done)  # Q-learning update
        state = next_state  # store the latest observation
        ep_reward += reward
        if done:
            break
    rewards.append(ep_reward)
    if ma_rewards:
        ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
    else:
        ma_rewards.append(ep_reward)
```

## The Q-Learning Algorithm

With the basic interface in place, let's see how the Q-learning algorithm itself is implemented. As noted above, throughout training the agent really does just two things, choosing actions and updating the policy, so we can define a QLearning class whose two main methods are choose_action and update.

First, the choose_action method:

```python
def choose_action(self, state):
    self.sample_count += 1
    self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        math.exp(-1. * self.sample_count / self.epsilon_decay)  # epsilon decays over time; here an exponential decay is used
    # e-greedy policy
    if np.random.uniform(0, 1) > self.epsilon:
        action = np.argmax(self.Q_table[str(state)])  # choose the action with the largest Q(s,a)
    else:
        action = np.random.choice(self.n_actions)  # choose a random action
    return action
```

We generally use an $\varepsilon$-greedy policy to choose actions: the input is the current state, we draw a random number, and if it is larger than our $\varepsilon$ we take the action with the largest Q-value, otherwise we take a random action. This keeps a certain exploration rate during training and is one of the techniques for balancing exploration and exploitation.

Next is the policy-update method:

```python
def update(self, state, action, reward, next_state, done):
    Q_predict = self.Q_table[str(state)][action]
    if done:  # terminal state
        Q_target = reward
    else:
        Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])
    self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
```

The logic here is exactly the update formula from the pseudocode:

<img src="assets/image-20210911213241605.png" alt="image-20210911213241605" style="zoom:50%;" />
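Written out, this is the standard tabular Q-learning update, with `self.lr` playing the role of $\alpha$ and `self.gamma` the role of $\gamma$:
$$
Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\left[r_{t+1} + \gamma \max_{a} Q(s_{t+1}, a) - Q(s_t, a_t)\right]
$$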
Note that in a terminal state there is no next action, so we simply set the target Q-value (Q_target) to the corresponding reward.

## Analysis of the Results

At this point the Q-learning implementation is essentially complete; see the source code on GitHub for the details. Running the code gives the following result:

![](assets/train_rewards_curve_cn.png)

Since this environment is fairly simple, the algorithm converges quickly. We then test the trained model; testing usually only needs around 20 to 50 episodes:

![](assets/eval_rewards_curve_cn.png)

Here we test for 30 episodes, and the agent reaches the optimal reward in every episode, which shows the algorithm was trained very well!
@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2020 John Jim

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,52 +0,0 @@
中文|[English](./README_en.md)
## Preface

This project is for learning basic RL algorithms, aiming for **detailed comments** and a **clear structure**.

The code is organized into the following scripts:

* ```model.py``` basic models for the RL algorithms, e.g. neural networks, actor, critic
* ```memory.py``` the replay buffer, used for off-policy algorithms
* ```plot.py``` plots the reward curves (including the moving-average reward) with matplotlib or seaborn; results are saved in the result folder
* ```env.py``` builds the RL environment; environments can also be customized, e.g. adding noise to actions
* ```agent.py``` the core RL algorithm, e.g. dqn, mainly containing the update and choose_action methods
* ```train.py``` functions for training and testing

Since ```model.py```, ```memory.py``` and ```plot.py``` are shared by the different algorithms, they are placed in the ```common``` folder.

**Note: in the new version the ```model``` and ```memory``` code has all been moved into ```agent.py```, and ```plot``` has been moved into ```common.utils```.**
## Requirements

python 3.7, pytorch 1.6.0-1.8.1, gym 0.21.0

## Usage

Running the py or ipynb files whose names contain ```train``` trains the default task;
you can also run the py files whose names contain ```task``` to train different tasks.

## Contents

| Algorithm | Papers & Materials | Environment | Notes |
| :----: | :----: | ---- | :----: |
| [On-Policy First-Visit MC](./MonteCarlo) | [medium blog](https://medium.com/analytics-vidhya/monte-carlo-methods-in-reinforcement-learning-part-1-on-policy-methods-1f004d59686a) | [Racetrack](./envs/racetrack_env.md) | |
| [Q-Learning](./QLearning) | [towardsdatascience blog](https://towardsdatascience.com/simple-reinforcement-learning-q-learning-fcddc4b6fe56), [q learning paper](https://ieeexplore.ieee.org/document/8836506) | [CliffWalking-v0](./envs/gym_info.md) | |
| [Sarsa](./Sarsa) | [geeksforgeeks blog](https://www.geeksforgeeks.org/sarsa-reinforcement-learning/) | [Racetrack](./envs/racetrack_env.md) | |
| [DQN](./DQN) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), [Nature DQN Paper](https://www.nature.com/articles/nature14236) | [CartPole-v0](./envs/gym_info.md) | |
| [DQN-cnn](./DQN_cnn) | [DQN Paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) | [CartPole-v0](./envs/gym_info.md) | uses a CNN instead of a fully connected network, compared with DQN |
| [DoubleDQN](./DoubleDQN) | [DoubleDQN Paper](https://arxiv.org/abs/1509.06461) | [CartPole-v0](./envs/gym_info.md) | |
| [Hierarchical DQN](HierarchicalDQN) | [H-DQN Paper](https://arxiv.org/abs/1604.06057) | [CartPole-v0](./envs/gym_info.md) | |
| [PolicyGradient](./PolicyGradient) | [Lil'log](https://lilianweng.github.io/lil-log/2018/04/08/policy-gradient-algorithms.html) | [CartPole-v0](./envs/gym_info.md) | |
| [A2C](./A2C) | [A3C Paper](https://arxiv.org/abs/1602.01783) | [CartPole-v0](./envs/gym_info.md) | |
| [SAC](./SoftActorCritic) | [SAC Paper](https://arxiv.org/abs/1801.01290) | [Pendulum-v0](./envs/gym_info.md) | |
| [PPO](./PPO) | [PPO paper](https://arxiv.org/abs/1707.06347) | [CartPole-v0](./envs/gym_info.md) | |
| [DDPG](./DDPG) | [DDPG Paper](https://arxiv.org/abs/1509.02971) | [Pendulum-v0](./envs/gym_info.md) | |
| [TD3](./TD3) | [TD3 Paper](https://arxiv.org/abs/1802.09477) | [HalfCheetah-v2](./envs/mujoco_info.md) | |

## Refs

[RL-Adventure-2](https://github.com/higgsfield/RL-Adventure-2)

[RL-Adventure](https://github.com/higgsfield/RL-Adventure)

[Google Open Source Project Style Guide (Chinese version)](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)
@@ -1 +0,0 @@
English|[中文](./README.md)
notebooks/DQN.ipynb
@@ -0,0 +1,36 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Analyzing the pseudocode\n",
    "\n",
    "Current DQN implementations basically follow the pseudocode of [Nature DQN](https://www.nature.com/articles/nature14236), shown below:\n",
    "\n",
    "<div align=\"center\">\n",
    "<img src=\"./figs/dqn_pseu.png\" alt=\"\" style=\"zoom:40%;\" /> \n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
notebooks/figs/dqn_pseu.png