Merge branch 'master' of https://github.com/datawhalechina/easy-rl
@@ -2,13 +2,13 @@
## Overview of the Principle

DQN is an optimization and extension of the Q-learning algorithm. Q-learning stores value information in a finite Q-table, whereas DQN replaces the Q-table with a neural network, which makes it better suited to high-dimensional state spaces. For background, see [EasyRL-DQN](https://datawhalechina.github.io/easy-rl/#/chapter6/chapter6).
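To make the contrast concrete, here is a minimal illustrative sketch (not code from this repository; the environment dimensions and all names are assumptions) of a tabular Q-learning update next to the kind of neural-network Q-function that DQN uses:

```python
import torch
import torch.nn as nn
from collections import defaultdict

# Tabular Q-learning: one entry per (state, action) pair -- only feasible for small, discrete spaces.
Q = defaultdict(float)

def q_learning_update(s, a, r, s_next, actions, alpha=0.1, gamma=0.99):
    best_next = max(Q[(s_next, a_next)] for a_next in actions)
    Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])

# DQN: a network maps a (possibly high-dimensional) state vector to one Q-value per action.
q_net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
q_values = q_net(torch.randn(1, 4))  # e.g. CartPole-v1: 4-dimensional state, 2 actions
```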
On the paper side there are two main references: the 2013 DeepMind paper [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), and the team's later Nature paper [Human-level control through deep reinforcement learning](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf). The latter adds a target Q-network at the algorithm level and is often referred to as Nature DQN.

Nature DQN uses two Q-networks: a current Q-network Q, which selects actions and whose parameters are updated, and a target Q-network Q', which computes the target Q-values. The target network's parameters are not updated iteratively; instead they are copied from the current Q-network at fixed intervals (a delayed update), which reduces the correlation between the target Q-values and the current Q-values.

Note that the two Q-networks must have exactly the same architecture; otherwise the parameters could not be copied. Compared with [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), Nature DQN is essentially identical apart from using this separate, identically structured target Q-network to compute the target Q-values. For more detail, see [强化学习(九)Deep Q-Learning进阶之Nature DQN](https://www.cnblogs.com/pinard/p/9756075.html).
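A minimal PyTorch sketch of the target-network idea described above (illustrative only, not this repository's implementation; the network shape, the `policy_net`/`target_net` names and the update interval are assumptions):

```python
import copy
import torch
import torch.nn as nn

# Current Q-network (updated every training step) and its target copy.
policy_net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
target_net = copy.deepcopy(policy_net)  # identical architecture, so parameters can be copied

def td_target(reward, next_state, done, gamma=0.99):
    """Target Q-value computed with the frozen target network."""
    with torch.no_grad():
        next_q = target_net(next_state).max(dim=1).values
    return reward + gamma * next_q * (1.0 - done)

def sync_target(step, target_update=4):
    """Delayed update: copy the current parameters into the target network every target_update steps."""
    if step % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
```

Computing the max term with the frozen copy keeps the regression target from shifting with every gradient step, which is exactly the decorrelation effect described above.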
See also: https://blog.csdn.net/JohnJim0/article/details/109557173
(Binary image files changed, not shown; two newly added images: 55 KiB and 58 KiB.)
@@ -5,7 +5,7 @@ Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-12-22 11:14:17
LastEditor: JiangJi
LastEditTime: 2022-02-10 06:17:41
LastEditTime: 2022-06-18 20:12:20
Description: Train CartPole-v1 with Nature DQN
'''
import sys
@@ -17,6 +17,9 @@ sys.path.append(parent_path)  # add the path to the system path
import gym
import torch
import datetime
import torch.nn as nn
import torch.nn.functional as F

from common.utils import save_results, make_dir
from common.utils import plot_rewards, plot_rewards_cn
from dqn import DQN
@@ -33,18 +36,18 @@ class DQNConfig:
        self.env_name = env_name  # environment name
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # check for GPU
        self.train_eps = 200  # number of training episodes
        self.test_eps = 30  # number of test episodes
        self.train_eps = 300  # number of training episodes
        self.test_eps = 20  # number of test episodes
        # hyperparameters
        self.gamma = 0.95  # discount factor
        self.epsilon_start = 0.90  # initial epsilon for the e-greedy policy
        self.epsilon_end = 0.01  # final epsilon for the e-greedy policy
        self.gamma = 0.99  # discount factor
        self.epsilon_start = 0.99  # initial epsilon for the e-greedy policy
        self.epsilon_end = 0.005  # final epsilon for the e-greedy policy
        self.epsilon_decay = 500  # decay rate of epsilon in the e-greedy policy
        self.lr = 0.0001  # learning rate
        self.memory_capacity = 100000  # capacity of the replay buffer
        self.batch_size = 64  # batch size for mini-batch SGD
        self.batch_size = 128  # batch size for mini-batch SGD
        self.target_update = 4  # update frequency of the target network
        self.hidden_dim = 256  # dimension of the hidden layers
        self.hidden_dim = 512  # dimension of the hidden layers
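        # Illustrative sketch (not part of this diff): epsilon_start, epsilon_end and epsilon_decay
        # are typically combined into an exponential e-greedy schedule inside the agent; the actual
        # formula lives in dqn.py, which this diff does not show. For example:
        #   epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
        #       math.exp(-1.0 * frame_idx / self.epsilon_decay)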
class PlotConfig:
    ''' Parameters for plotting
    '''
@@ -60,7 +63,23 @@ class PlotConfig:
            '/' + curr_time + '/models/'  # path for saving models
        self.save = True  # whether to save figures

class MLP(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=128):
        """ Initialize the Q-network as a fully connected network
        n_states: number of input features, i.e. the state dimension of the environment
        n_actions: output dimension, i.e. the number of actions
        """
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)  # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions)  # output layer

    def forward(self, x):
        # activation functions for each layer
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

def env_agent_config(cfg, seed=1):
    ''' Create the environment and the agent
    '''
@@ -68,7 +87,8 @@ def env_agent_config(cfg, seed=1):
    env.seed(seed)  # set the random seed
    n_states = env.observation_space.shape[0]  # state dimension
    n_actions = env.action_space.n  # action dimension
    agent = DQN(n_states, n_actions, cfg)  # create the agent
    model = MLP(n_states, n_actions)
    agent = DQN(n_actions, model, cfg)  # create the agent
    return env, agent

def train(cfg, env, agent):

@@ -1,184 +0,0 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Activation, Flatten, Conv1D, MaxPooling1D, Reshape
import matplotlib.pyplot as plt

class DQN:
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=400000)  # replay buffer
        self.gamma = 0.99  # discount factor
        self.epsilon = 1.0  # initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = self.epsilon_min / 500000  # linear decrement applied per step

        self.batch_size = 32
        self.train_start = 1000  # start learning once the buffer holds this many transitions
        self.state_size = self.env.observation_space.shape[0] * 4  # each RAM byte is split into four 2-bit values (see transform below)
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.00025

        self.evaluation_model = self.create_model()  # current Q-network
        self.target_model = self.create_model()      # target Q-network (same architecture)

    def create_model(self):
        model = Sequential()
        model.add(Dense(128*2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.RMSprop(lr=self.learning_rate, decay=0.99, epsilon=1e-6))
        return model

    def choose_action(self, state, steps):
        # e-greedy: epsilon only starts decaying after 50000 warm-up steps
        if steps > 50000:
            if self.epsilon > self.epsilon_min:
                self.epsilon -= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.evaluation_model.predict(state)[0])

    def remember(self, cur_state, action, reward, new_state, done):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = (cur_state, action, reward, new_state, done)
        self.memory.extend([transition])

        self.memory_counter += 1

    def replay(self):
        if len(self.memory) < self.train_start:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        update_input = np.zeros((self.batch_size, self.state_size))
        update_target = np.zeros((self.batch_size, self.action_size))

        for i in range(self.batch_size):
            state, action, reward, new_state, done = mini_batch[i]
            target = self.evaluation_model.predict(state)[0]

            # Q-learning target: the target network provides max_a' Q'(s', a')
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])

            update_input[i] = state
            update_target[i] = target

        self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)

    def target_train(self):
        # delayed update: copy the evaluation network's weights into the target network
        self.target_model.set_weights(self.evaluation_model.get_weights())
        return

    def visualize(self, reward, episode):
        plt.plot(episode, reward, 'ob-')
        plt.title('Average reward per 100 episodes')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()

    def transform(self, state):
        # split each 8-bit RAM byte into four 2-bit chunks, expanding 128 bytes to 512 features
        if state.shape[1] == 512:
            return state  # already transformed
        a = [np.binary_repr(x, width=8) for x in state[0]]
        res = []
        for x in a:
            res.extend([x[:2], x[2:4], x[4:6], x[6:]])
        res = [int(x, 2) for x in res]
        return np.array(res)
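        # For example, the RAM byte 0b10110100 (= 180) is split into the 2-bit chunks
        # '10', '11', '01', '00' -> [2, 3, 1, 0]; 128 RAM bytes thus become 512 features,
        # matching state_size = observation_space.shape[0] * 4 above.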

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def main():
    # env = gym.make('Breakout-ram-v0')
    env = gym.make('Breakout-ram-v0')
    env = env.unwrapped

    print(env.action_space)
    print(env.observation_space.shape[0])
    print(env.observation_space.high)
    print(env.observation_space.low)

    # print(env.observation_space.shape)

    episodes = 5000
    trial_len = 10000

    tmp_reward = 0
    sum_rewards = 0
    n_success = 0
    total_steps = 0

    graph_reward = []
    graph_episodes = []
    time_record = []

    dqn_agent = DQN(env=env)
    for i_episode in range(episodes):
        start_time = time.time()
        total_reward = 0
        cur_state = env.reset().reshape(1, 128)
        cur_state = dqn_agent.transform(cur_state).reshape(1, 128*4) / 4  # normalize the 2-bit values
        i_step = 0
        for step in range(trial_len):
            # env.render()
            i_step += 1
            action = dqn_agent.choose_action(cur_state, total_steps)
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, 128)
            new_state = dqn_agent.transform(new_state).reshape(1, 128*4) / 4
            total_reward += reward
            sum_rewards += reward
            tmp_reward += reward
            if reward > 0:  # clip positive rewards to 1 before storing the transition
                reward = 1

            dqn_agent.remember(cur_state, action, reward, new_state, done)
            if total_steps > 10000:
                if total_steps % 4 == 0:
                    dqn_agent.replay()
                if total_steps % 5000 == 0:
                    dqn_agent.target_train()

            cur_state = new_state
            total_steps += 1
            if done:
                env.reset()
                break
        if (i_episode+1) % 100 == 0:
            graph_reward.append(sum_rewards/100)
            graph_episodes.append(i_episode+1)
            sum_rewards = 0
            print("Episode ", i_episode+1, " Reward: ")
            print(graph_reward[-1])
        end_time = time.time()
        time_record.append(end_time-start_time)
        print("NOW in episode: " + str(i_episode))
        print("Time cost: " + str(end_time-start_time))
        print("Reward: ", tmp_reward)
        print("Step:", i_step)
        tmp_reward = 0
    print("Reward: ")
    print(graph_reward)
    print("Episode: ")
    print(graph_episodes)
    print("Average_time: ")
    print(sum(time_record)/5000)
    dqn_agent.visualize(graph_reward, graph_episodes)

if __name__ == '__main__':
    main()