update DDPG

This commit is contained in:
JohnJim0816
2020-10-15 22:07:42 +08:00
parent cf9887f6d0
commit 0ff03c498e
37 changed files with 161 additions and 99 deletions

codes/ddpg/.vscode/settings.json

@@ -0,0 +1,3 @@
{
"python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
}

codes/ddpg/README.md

@@ -0,0 +1,26 @@
Dependencies:

- python 3.7.9
- pytorch 1.6.0
- tensorboard 2.3.0
- torchvision 0.7.0

Train:
```bash
python main.py
```

Evaluate only (skip training):
```bash
python main.py --train 0
```

Open tensorboard:
```bash
tensorboard --logdir logs
```
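
Each run writes its outputs under a timestamped subfolder: tensorboard logs go to `logs/train/` and `logs/eval/`, model checkpoints to `saved_model/`, and reward/step curves (`.npy` files) to `result/`.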

codes/ddpg/main.py

@@ -5,63 +5,76 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2020-10-15 21:23:39
@Discription:
@Environment: python 3.7.7
'''
import torch
import gym
from agent import DDPG
from env import NormalizedActions
from noise import OUNoise
from plot import plot
import os
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
import datetime
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
def get_args():
    '''Once the model is built, hyperparameters only need to be tuned here.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means eval only
    parser.add_argument("--gamma", default=0.99,
                        type=float)  # gamma as in Q-learning
    parser.add_argument("--critic_lr", default=1e-3, type=float)  # critic learning rate
    parser.add_argument("--actor_lr", default=1e-4, type=float)
    parser.add_argument("--memory_capacity", default=10000,
                        type=int, help="capacity of the replay memory")
    parser.add_argument("--batch_size", default=128, type=int,
                        help="batch size for memory sampling")
    parser.add_argument("--train_eps", default=200, type=int)
    parser.add_argument("--train_steps", default=200, type=int)
    parser.add_argument("--eval_eps", default=200, type=int)  # max number of eval episodes
    parser.add_argument("--eval_steps", default=200,
                        type=int)  # max steps per eval episode
    parser.add_argument("--target_update", default=4, type=int,
                        help="how often (in episodes, default 4) to update the target net")
    config = parser.parse_args()
    return config
def train(cfg):
    print('Start to train ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    ou_noise = OUNoise(env.action_space)  # add exploration noise to the actions
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states, n_actions, device=device, critic_lr=1e-3,
                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps+1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps+1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action, i_step)  # the random process from the paper
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
@@ -69,7 +82,8 @@ def train():
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' %
              int(ep_reward), 'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
@@ -77,54 +91,43 @@ def train():
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1],
                                       'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode',
                          ep_steps[-1], i_episode)
    writer.close()
    print('Complete training')
    '''save the model'''
    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
        os.makedirs(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
    '''save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.makedirs(RESULT_PATH)
    np.save(RESULT_PATH+'rewards_train.npy', rewards)
    np.save(RESULT_PATH+'moving_average_rewards_train.npy', moving_average_rewards)
    np.save(RESULT_PATH+'steps_train.npy', ep_steps)
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('Start to eval ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    agent = DDPG(n_states, n_actions, critic_lr=1e-3,
                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
    agent.load_model(saved_model_path+'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps+1):
        state = env.reset()  # reset the environment
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps+1):
            action = agent.select_action(state)  # select an action from the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' %
@@ -137,11 +140,22 @@ def eval():
        else:
            moving_average_rewards.append(
                0.9*moving_average_rewards[-1]+0.1*ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1],
                                       'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode',
                          ep_steps[-1], i_episode)
    writer.close()
    '''save rewards and related results'''
    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
        os.makedirs(RESULT_PATH)
    np.save(RESULT_PATH+'rewards_eval.npy', rewards)
    np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
    np.save(RESULT_PATH+'steps_eval.npy', ep_steps)
if __name__ == "__main__":
    cfg = get_args()
    if cfg.train:
        train(cfg)
        eval(cfg)
    else:
        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
        eval(cfg, saved_model_path=model_path)
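
main.py imports `DDPG` from agent.py, `NormalizedActions` from env.py, and `OUNoise` from noise.py, none of which appear in this commit. For context, here is a minimal sketch of what such an Ornstein-Uhlenbeck action-noise class and action-normalization wrapper commonly look like in DDPG implementations; class structure matches the calls made in main.py (`OUNoise(env.action_space)`, `reset()`, `get_action(action, i_step)`), but parameter names and defaults are illustrative, not taken from this repo.

```python
import gym
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu                      # mean the noise reverts to
        self.theta = theta                # speed of mean reversion
        self.sigma = max_sigma            # current noise scale
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period  # steps over which sigma is annealed
        self.n_actions = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.n_actions) * self.mu

    def get_action(self, action, t=0):
        # dx = theta * (mu - x) + sigma * N(0, 1), then add the noise state to the action
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.n_actions)
        self.state = self.state + dx
        # anneal sigma linearly from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma \
            - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(action + self.state, self.low, self.high)


class NormalizedActions(gym.ActionWrapper):
    """Rescale actions from [-1, 1] (e.g. a tanh actor) to the env's action range."""

    def action(self, action):
        low, high = self.action_space.low, self.action_space.high
        action = low + (action + 1.0) * 0.5 * (high - low)
        return np.clip(action, low, high)

    def reverse_action(self, action):
        low, high = self.action_space.low, self.action_space.high
        action = 2.0 * (action - low) / (high - low) - 1.0
        return np.clip(action, low, high)
```

The `soft_tau` argument passed to `DDPG` corresponds to the soft target update of the DDPG paper, θ_target ← τ·θ + (1−τ)·θ_target, applied to both actor and critic targets.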

codes/ddpg/plot.py

@@ -5,17 +5,16 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-10-15 21:32:05
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
def plot_results(item, ylabel='rewards_train', save_fig=True):
    '''Plot results with seaborn.
    '''
    sns.set()
@@ -24,25 +23,24 @@ def plot(item,ylabel='rewards',save_fig = True):
    plt.title(ylabel+' of DDPG')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
    plt.show()
# def plot(item,ylabel='rewards'):
#
# df = pd.DataFrame(dict(time=np.arange(len(item)),value=item))
# g = sns.relplot(x="time", y="value", kind="line", data=df)
# # g.fig.autofmt_xdate()
# # sns.lineplot(time=time, data=item, color="r", condition="behavior_cloning")
# # # sns.tsplot(time=time, data=x2, color="b", condition="dagger")
# # plt.ylabel("Reward")
# # plt.xlabel("Iteration Number")
# # plt.title("Imitation Learning")
# plt.show()
if __name__ == "__main__":
    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
    tag = 'train'
    rewards = np.load(output_path+"rewards_"+tag+".npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards_"+tag+".npy")
    steps = np.load(output_path+"steps_"+tag+".npy")
    plot_results(rewards, ylabel='rewards_'+tag)
    plot_results(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
    plot_results(steps, ylabel='steps_'+tag)
    tag = 'eval'
    rewards = np.load(output_path+"rewards_"+tag+".npy")
    moving_average_rewards = np.load(output_path+"moving_average_rewards_"+tag+".npy")
    steps = np.load(output_path+"steps_"+tag+".npy")
    plot_results(rewards, ylabel='rewards_'+tag)
    plot_results(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
    plot_results(steps, ylabel='steps_'+tag)
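
The `moving_average_rewards_*` arrays loaded above are produced in main.py by exponential smoothing (0.9 × previous + 0.1 × current reward). If only the raw `rewards_*.npy` files are available, the same curve can be recomputed with a small helper; `smooth` below is hypothetical and not part of the repo:

```python
import numpy as np

def smooth(rewards, weight=0.9):
    """Exponential moving average, matching 0.9*prev + 0.1*current in main.py."""
    smoothed = [rewards[0]]
    for r in rewards[1:]:
        smoothed.append(weight * smoothed[-1] + (1 - weight) * r)
    return np.array(smoothed)
```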

Binary files (model checkpoints and result images) changed; contents not shown.
codes/ddpg/utils.py

@@ -0,0 +1,21 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-15 21:31:19
LastEditor: John
LastEditTime: 2020-10-15 21:31:25
Discription:
Environment:
'''
import os
import datetime
import numpy as np
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp used to name this run's result folder
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
def save_results(rewards, moving_average_rewards, ep_steps, path=RESULT_PATH):
    '''Save rewards and related results of a training run as .npy files.'''
    if not os.path.exists(path):  # create the folder if it does not exist
        os.makedirs(path)
    np.save(path+'rewards_train.npy', rewards)
    np.save(path+'moving_average_rewards_train.npy', moving_average_rewards)
    np.save(path+'steps_train.npy', ep_steps)
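
main.py as shown in this diff still saves its arrays inline with `np.save`. A minimal sketch of how the end of `train()` could use `save_results` instead, assuming the same `rewards`, `moving_average_rewards`, and `ep_steps` lists; this call site is hypothetical, not part of the commit:

```python
from utils import save_results

# hypothetical call at the end of train() in main.py, replacing the
# three inline np.save(...) calls; results go to utils.RESULT_PATH
save_results(rewards, moving_average_rewards, ep_steps)
```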