update DQN

JohnJim0816
2020-10-15 22:07:12 +08:00
parent 838088be41
commit cf9887f6d0
38 changed files with 212 additions and 70 deletions

codes/dqn/.vscode/settings.json vendored Normal file

@@ -0,0 +1,3 @@
{
    "python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
}

codes/dqn/README.md Normal file

@@ -0,0 +1,24 @@
python 3.7.9
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0

train:
```bash
python main.py
```
eval:
```bash
python main.py --train 0
```
view the training/eval curves with tensorboard:
```bash
tensorboard --logdir logs
```

codes/dqn/agent.py

@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2020-10-07 17:32:18
+LastEditTime: 2020-10-15 21:56:21
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -13,8 +13,6 @@ LastEditTime: 2020-10-07 17:32:18
 '''
 import torch
 import torch.nn as nn
 import torch.optim as optim
@@ -30,7 +28,7 @@ class DQN:
         self.n_actions = n_actions  # total number of actions
         self.device = device  # device, e.g. cpu or gpu
         self.gamma = gamma
         # parameters related to the e-greedy policy
         self.epsilon = 0
         self.epsilon_start = epsilon_start
         self.epsilon_end = epsilon_end
@@ -46,32 +44,41 @@ class DQN:
         self.loss = 0
         self.memory = ReplayBuffer(memory_capacity)

-    def select_action(self, state):
-        '''Select an action
-        Args:
-            state [array]: [description]
-        Returns:
-            action [array]: [description]
-        '''
-        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
-            math.exp(-1. * self.actions_count / self.epsilon_decay)
-        self.actions_count += 1
-        if random.random() > self.epsilon:
-            with torch.no_grad():
-                # convert to a tensor first so it can be fed to the network; the state elements were originally float64
-                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
-                state = torch.tensor(
-                    [state], device=self.device, dtype=torch.float32)
-                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
-                q_value = self.policy_net(state)
-                # tensor.max(1) returns the max value of each row together with its index,
-                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0]))
-                # so tensor.max(1)[1] returns the index of the max value, i.e. the action
-                action = q_value.max(1)[1].item()
-        else:
-            action = random.randrange(self.n_actions)
-        return action
+    def choose_action(self, state, train=True):
+        '''Select an action
+        '''
+        if train:
+            self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
+                math.exp(-1. * self.actions_count / self.epsilon_decay)
+            self.actions_count += 1
+            if random.random() > self.epsilon:
+                with torch.no_grad():
+                    # convert to a tensor first so it can be fed to the network; the state elements were originally float64
+                    # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
+                    state = torch.tensor(
+                        [state], device=self.device, dtype=torch.float32)
+                    # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
+                    q_value = self.policy_net(state)
+                    # tensor.max(1) returns the max value of each row together with its index,
+                    # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0]))
+                    # so tensor.max(1)[1] returns the index of the max value, i.e. the action
+                    action = q_value.max(1)[1].item()
+            else:
+                action = random.randrange(self.n_actions)
+            return action
+        else:
+            with torch.no_grad():
+                # convert to a tensor first so it can be fed to the network; the state elements were originally float64
+                # note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
+                state = torch.tensor(
+                    [state], device='cpu', dtype=torch.float32)
+                # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
+                q_value = self.target_net(state)
+                # tensor.max(1) returns the max value of each row together with its index,
+                # e.g. torch.return_types.max(values=tensor([10.3587]), indices=tensor([0]))
+                # so tensor.max(1)[1] returns the index of the max value, i.e. the action
+                action = q_value.max(1)[1].item()
+            return action

     def update(self):
         if len(self.memory) < self.batch_size:
@@ -113,8 +120,9 @@ class DQN:
         for param in self.policy_net.parameters():  # clip to prevent exploding gradients
             param.grad.data.clamp_(-1, 1)
         self.optimizer.step()  # update the model
-    def save_model():
-        pass
-    def load_model():
-        pass
+
+    def save_model(self,path):
+        torch.save(self.target_net.state_dict(), path)
+
+    def load_model(self,path):
+        self.target_net.load_state_dict(torch.load(path))
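For reference, a minimal standalone sketch of the exponential epsilon-greedy decay that `choose_action` computes above. The constants below are illustrative assumptions: only `epsilon_start=0.95` is visible as a default in the `main.py` diff, while `epsilon_end` and `epsilon_decay` here are placeholders.

```python
import math

# Assumed illustrative values: epsilon_start matches the default shown in main.py,
# epsilon_end and epsilon_decay are placeholders, not values from this commit.
epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 500

def epsilon_at(actions_count):
    # Same schedule as DQN.choose_action: epsilon decays from epsilon_start
    # toward epsilon_end as more actions are selected.
    return epsilon_end + (epsilon_start - epsilon_end) * \
        math.exp(-1. * actions_count / epsilon_decay)

for t in (0, 100, 500, 2000):
    print(t, round(epsilon_at(t), 3))
```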

codes/dqn/main.py

@@ -5,20 +5,28 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:48:57
 @LastEditor: John
-LastEditTime: 2020-08-22 18:02:56
+LastEditTime: 2020-10-15 22:00:28
 @Discription:
 @Environment: python 3.7.7
 '''
 import gym
 import torch
-from dqn import DQN
-from plot import plot
+from agent import DQN
 import argparse
+from torch.utils.tensorboard import SummaryWriter
+import datetime
+import os
+from utils import save_results
+
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
+
 def get_args():
     '''Model parameters
     '''
     parser = argparse.ArgumentParser()
+    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means eval only
     parser.add_argument("--gamma", default=0.99,
                         type=float)  # gamma in q-learning
     parser.add_argument("--epsilon_start", default=0.95,
@@ -31,20 +39,19 @@ def get_args():
parser.add_argument("--batch_size", default=32, type=int, parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling") help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=200, type=int) # 训练的最大episode数目 parser.add_argument("--train_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--max_steps", default=200, type=int) parser.add_argument("--train_steps", default=200, type=int)
# 将目标网络的更新频率改为1就是普通的dqn大于1就是double dqn parser.add_argument("--target_update", default=2, type=int,
parser.add_argument("--target_update", default=1, type=int, help="when(every default 2 eisodes) to update target net ") # 更新频率
help="when(every default 10 eisodes) to update target net ")
parser.add_argument("--eval_eps", default=100, type=int) # 训练的最大episode数目
parser.add_argument("--eval_steps", default=200,
type=int) # 训练每个episode的长度
config = parser.parse_args() config = parser.parse_args()
return config return config
def train(cfg):
print('Start to train ! \n')
if __name__ == "__main__":
cfg = get_args()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu
env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym此处一般不需要 env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym此处一般不需要
env.seed(1) # 设置env随机种子 env.seed(1) # 设置env随机种子
@@ -55,11 +62,13 @@ if __name__ == "__main__":
     rewards = []
     moving_average_rewards = []
     ep_steps = []
-    for i_episode in range(1, cfg.max_episodes+1):
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
+    for i_episode in range(1, cfg.train_eps+1):
         state = env.reset()  # reset the environment state
         ep_reward = 0
-        for i_step in range(1, cfg.max_steps+1):
-            action = agent.select_action(state)  # choose an action given the current state
+        for i_step in range(1, cfg.train_steps+1):
+            action = agent.choose_action(state)  # choose an action given the current state
             next_state, reward, done, _ = env.step(action)  # step the environment
             ep_reward += reward
             agent.memory.push(state, action, reward, next_state, done)  # store the transition (state etc.) in memory
@@ -80,17 +89,68 @@ if __name__ == "__main__":
         else:
             moving_average_rewards.append(
                 0.9*moving_average_rewards[-1]+0.1*ep_reward)
-    # save rewards and related results
-    import os
-    import numpy as np
-    output_path = os.path.dirname(__file__)+"/result/"
-    # check whether the folder exists
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    np.save(output_path+"rewards.npy", rewards)
-    np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
-    np.save(output_path+"steps.npy", ep_steps)
-    print('Complete')
-    plot(rewards)
-    plot(moving_average_rewards, ylabel="moving_average_rewards")
-    plot(ep_steps, ylabel="steps_of_each_episode")
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    print('Complete training')
+    ''' save the model '''
+    if not os.path.exists(SAVED_MODEL_PATH):  # check whether the folder exists
+        os.mkdir(SAVED_MODEL_PATH)
+    agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
+    print('model saved')
+    '''save rewards and related results'''
+    save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH)
+
+def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
+    print('start to eval ! \n')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect gpu
+    env = gym.make('CartPole-v0').unwrapped  # you can google why unwrapped; it is generally not needed for gym here
+    env.seed(1)  # set the env random seed
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.n
+    agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
+                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
+    agent.load_model(saved_model_path+'checkpoint.pth')
+    rewards = []
+    moving_average_rewards = []
+    ep_steps = []
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
+    for i_episode in range(1, cfg.eval_eps+1):
+        state = env.reset()  # reset the environment state
+        ep_reward = 0
+        for i_step in range(1, cfg.eval_steps+1):
+            action = agent.choose_action(state,train=False)  # choose an action given the current state
+            next_state, reward, done, _ = env.step(action)  # step the environment
+            ep_reward += reward
+            state = next_state  # move to the next state
+            if done:
+                break
+        print('Episode:', i_episode, ' Reward: %i' %
+              int(ep_reward), 'n_steps:', i_step, 'done: ', done)
+        ep_steps.append(i_step)
+        rewards.append(ep_reward)
+        # compute the moving-average reward
+        if i_episode == 1:
+            moving_average_rewards.append(ep_reward)
+        else:
+            moving_average_rewards.append(
+                0.9*moving_average_rewards[-1]+0.1*ep_reward)
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    '''save rewards and related results'''
+    save_results(rewards,moving_average_rewards,ep_steps,tag='eval',result_path=RESULT_PATH)
+    print('Complete evaling')
+
+if __name__ == "__main__":
+    cfg = get_args()
+    if cfg.train:
+        train(cfg)
+        eval(cfg)
+    else:
+        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
+        eval(cfg,saved_model_path=model_path)
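As a quick illustration of the logging added above, here is a minimal sketch of the SummaryWriter pattern that `train`/`eval` use, with synthetic episode rewards; the log directory name is a placeholder rather than the `SEQUENCE`-based path the script builds.

```python
from torch.utils.tensorboard import SummaryWriter

# Placeholder log directory; main.py writes to logs/train/<SEQUENCE> next to the script.
writer = SummaryWriter("logs/train/demo")
moving_average = 0.0
for i_episode, ep_reward in enumerate([10.0, 50.0, 120.0, 200.0], start=1):
    # Same 0.9/0.1 smoothing as in main.py.
    moving_average = ep_reward if i_episode == 1 else 0.9*moving_average + 0.1*ep_reward
    writer.add_scalars('rewards', {'raw': ep_reward, 'moving_average': moving_average}, i_episode)
    writer.add_scalar('steps_of_each_episode', 200, i_episode)
writer.close()
```

The resulting curves can then be viewed with `tensorboard --logdir logs`, as the README describes.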

codes/dqn/plot.py

@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 16:30:09
 @LastEditor: John
-LastEditTime: 2020-10-07 20:57:22
+LastEditTime: 2020-10-15 22:01:50
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -14,19 +14,45 @@ import seaborn as sns
 import numpy as np
 import os

-def plot(item,ylabel='rewards'):
+def plot(item,ylabel='rewards_train', save_fig = True):
+    '''plot using searborn to plot
+    '''
     sns.set()
     plt.figure()
     plt.plot(np.arange(len(item)), item)
     plt.title(ylabel+' of DQN')
     plt.ylabel(ylabel)
     plt.xlabel('episodes')
-    plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
+    if save_fig:
+        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
     plt.show()

+# def plot(item,ylabel='rewards'):
+#
+#     df = pd.DataFrame(dict(time=np.arange(len(item)),value=item))
+#     g = sns.relplot(x="time", y="value", kind="line", data=df)
+#     # g.fig.autofmt_xdate()
+#     # sns.lineplot(time=time, data=item, color="r", condition="behavior_cloning")
+#     # # sns.tsplot(time=time, data=x2, color="b", condition="dagger")
+#     # plt.ylabel("Reward")
+#     # plt.xlabel("Iteration Number")
+#     # plt.title("Imitation Learning")
+#     plt.show()
+
 if __name__ == "__main__":
-    output_path = os.path.dirname(__file__)+"/result/"
-    rewards=np.load(output_path+"rewards.npy", )
-    moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
+    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
+    tag = 'train'
+    rewards=np.load(output_path+"rewards_"+tag+".npy", )
+    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
+    steps=np.load(output_path+"steps_"+tag+".npy")
     plot(rewards)
-    plot(moving_average_rewards,ylabel='moving_average_rewards')
+    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
+    plot(steps,ylabel='steps_'+tag)
+    tag = 'eval'
+    rewards=np.load(output_path+"rewards_"+tag+".npy", )
+    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
+    steps=np.load(output_path+"steps_"+tag+".npy")
+    plot(rewards,ylabel='rewards_'+tag)
+    plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
+    plot(steps,ylabel='steps_'+tag)
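A minimal usage sketch of the updated `plot()` helper with synthetic data; the reward curve is made up for illustration, and `save_fig=False` avoids writing into the `result/` directory.

```python
import numpy as np
from plot import plot  # assumes this runs from codes/dqn

# Synthetic reward curve standing in for the saved .npy results.
rewards = np.clip(np.cumsum(np.random.randn(50)) * 10 + 100, 0, 200)
plot(rewards, ylabel='rewards_train', save_fig=False)
```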

Several binary files changed (result data and plot images); contents not shown.

codes/dqn/utils.py Normal file

@@ -0,0 +1,21 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-15 21:28:00
LastEditor: John
LastEditTime: 2020-10-15 21:50:30
Discription:
Environment:
'''
import os
import numpy as np
def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./result'):
    if not os.path.exists(result_path):  # check whether the folder exists
        os.mkdir(result_path)
    np.save(result_path+'rewards_'+tag+'.npy', rewards)
    np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards)
    np.save(result_path+'steps_'+tag+'.npy',ep_steps )
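A small sketch of how `save_results` pairs with the loading code in `plot.py`. Because the path is concatenated directly with the file name, `result_path` should end with a separator, as `RESULT_PATH` in `main.py` does; the values below are toy numbers for illustration.

```python
import numpy as np
from utils import save_results  # assumes this runs from codes/dqn

# Toy results; result_path ends with '/' because save_results concatenates it directly.
save_results([180.0, 195.0, 200.0], [180.0, 181.5, 183.4], [180, 195, 200],
             tag='train', result_path='./result/')
print(np.load('./result/rewards_train.npy'))
```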