This commit is contained in:
JohnJim0816
2021-03-23 16:10:11 +08:00
parent d4690c2058
commit bf0f2990cf
198 changed files with 1668 additions and 1545 deletions

View File

@@ -1,3 +0,0 @@
{
"python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
}

View File

@@ -1,26 +1,5 @@
# DDPG
python 3.7.9
## 伪代码
pytorch 1.6.0
tensorboard 2.3.0
torchvision 0.7.0
train:
```python
python main.py
```
eval:
```python
python main.py --train 0
```
open tensorboard:
```python
tensorboard --logdir logs
```
![image-20210320151900695](assets/image-20210320151900695.png)

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-09 20:25:52
@LastEditor: John
LastEditTime: 2020-09-02 01:19:13
LastEditTime: 2021-03-17 20:43:25
@Discription:
@Environment: python 3.7.7
'''
@@ -14,18 +14,17 @@ import torch
import torch.nn as nn
import torch.optim as optim
from model import Actor, Critic
from memory import ReplayBuffer
from common.model import Actor, Critic
from common.memory import ReplayBuffer
class DDPG:
def __init__(self, n_states, n_actions, hidden_dim=30, device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128):
self.device = device
self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)
def __init__(self, n_states, n_actions, cfg):
self.device = cfg.device
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(param.data)
@@ -33,14 +32,14 @@ class DDPG:
target_param.data.copy_(param.data)
self.critic_optimizer = optim.Adam(
self.critic.parameters(), lr=critic_lr)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
self.memory = ReplayBuffer(memory_capacity)
self.batch_size = batch_size
self.soft_tau = soft_tau
self.gamma = gamma
self.critic.parameters(), lr=cfg.critic_lr)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.memory = ReplayBuffer(cfg.memory_capacity)
self.batch_size = cfg.batch_size
self.soft_tau = cfg.soft_tau
self.gamma = cfg.gamma
def select_action(self, state):
def choose_action(self, state):
state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
action = self.actor(state)
# torch.detach()用于切断反向传播
@@ -87,8 +86,8 @@ class DDPG:
target_param.data * (1.0 - self.soft_tau) +
param.data * self.soft_tau
)
def save_model(self,path):
torch.save(self.target_actor.state_dict(), path)
def save(self,path):
    '''Write the actor's weights to ``path + 'DDPG_checkpoint.pth'``.

    The class defines ``actor``, ``critic``, ``target_actor`` and
    ``target_critic`` but no ``target_net``, so looking that attribute up
    raised AttributeError. ``load`` restores into ``self.actor``, so that is
    the network persisted here.

    ``path`` is used as a plain string prefix, so it is expected to end with a
    path separator.
    '''
    torch.save(self.actor.state_dict(), path+'DDPG_checkpoint.pth')
def load_model(self,path):
self.actor.load_state_dict(torch.load(path))
def load(self,path):
    '''Restore the actor's weights from ``path + 'DDPG_checkpoint.pth'``.

    ``map_location`` remaps the stored tensors onto ``self.device`` so that a
    checkpoint written on a GPU machine can still be loaded on a CPU-only one.

    ``path`` is used as a plain string prefix, so it is expected to end with a
    path separator.
    '''
    state_dict = torch.load(path+'DDPG_checkpoint.pth', map_location=self.device)
    self.actor.load_state_dict(state_dict)

Binary file not shown.

After

Width:  |  Height:  |  Size: 259 KiB

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:28:30
@LastEditor: John
LastEditTime: 2020-09-01 10:57:36
LastEditTime: 2021-03-19 19:56:46
@Discription:
@Environment: python 3.7.7
'''
@@ -29,4 +29,33 @@ class NormalizedActions(gym.ActionWrapper):
upper_bound = self.action_space.high
action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
action = np.clip(action, low_bound, upper_bound)
return action
return action
class OUNoise(object):
    '''Ornstein-Uhlenbeck exploration noise added to an action.

    The internal state ``obs`` is pulled towards ``mu`` with strength ``theta``
    and perturbed by Gaussian noise scaled by ``sigma``. ``sigma`` moves
    linearly from ``max_sigma`` to ``min_sigma`` over ``decay_period`` steps.
    '''

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        '''Store the process parameters and start from the resting state.

        ``action_space`` must expose ``shape``, ``low`` and ``high``; the first
        entry of ``shape`` is the number of noise components.
        '''
        # Geometry of the action space.
        self.n_actions = action_space.shape[0]
        self.low = action_space.low
        self.high = action_space.high
        # Parameters of the process itself.
        self.mu = mu
        self.theta = theta
        # Noise-scale schedule; sigma starts at its maximum.
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.sigma = max_sigma
        self.reset()

    def reset(self):
        '''Set the noise state back to ``mu`` in every component.'''
        self.obs = self.mu * np.ones(self.n_actions)

    def evolve_obs(self):
        '''Advance the noise state by one step and return the new state.'''
        previous = self.obs
        drift = self.theta * (self.mu - previous)
        diffusion = self.sigma * np.random.randn(self.n_actions)
        self.obs = previous + (drift + diffusion)
        return self.obs

    def get_action(self, action, t=0):
        '''Return ``action`` plus noise, clipped to ``[low, high]``.

        The noise is drawn with the current ``sigma``; ``sigma`` is then
        updated for step ``t`` and takes effect on the next call.
        '''
        noise = self.evolve_obs()
        progress = min(1.0, t / self.decay_period)
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * progress
        noisy_action = action + noise
        return np.clip(noisy_action, self.low, self.high)

View File

@@ -5,74 +5,60 @@
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2020-10-15 21:23:39
LastEditTime: 2021-03-19 19:57:00
@Discription:
@Environment: python 3.7.7
'''
from token import NUMBER
from typing import Sequence
import sys,os
sys.path.append(os.getcwd()) # 添加当前终端路径
import torch
import gym
from agent import DDPG
from env import NormalizedActions
from noise import OUNoise
import os
import numpy as np
import argparse
from torch.utils.tensorboard import SummaryWriter
import datetime
from DDPG.agent import DDPG
from DDPG.env import NormalizedActions,OUNoise
from common.plot import plot_rewards
from common.utils import save_results
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/")
if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹
os.mkdir(SAVED_MODEL_PATH)
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径
if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹
os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/")
if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹
os.mkdir(RESULT_PATH)
def get_args():
'''模型建立好之后只需要在这里调参
'''
parser = argparse.ArgumentParser()
parser.add_argument("--train", default=1, type=int) # 1 表示训练0表示只进行eval
parser.add_argument("--gamma", default=0.99,
type=float) # q-learning中的gamma
parser.add_argument("--critic_lr", default=1e-3, type=float) # critic学习率
parser.add_argument("--actor_lr", default=1e-4, type=float)
parser.add_argument("--memory_capacity", default=10000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,
help="batch size of memory sampling")
parser.add_argument("--train_eps", default=200, type=int)
parser.add_argument("--train_steps", default=200, type=int)
parser.add_argument("--eval_eps", default=200, type=int) # 训练的最大episode数目
parser.add_argument("--eval_steps", default=200,
type=int) # 训练每个episode的长度
parser.add_argument("--target_update", default=4, type=int,
help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
def train(cfg):
print('Start to train ! \n')
env = NormalizedActions(gym.make("Pendulum-v0"))
# 增加action噪声
ou_noise = OUNoise(env.action_space)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DDPG(n_states, n_actions, device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
class DDPGConfig:
    '''Hyper-parameters and runtime settings for a DDPG run.

    An instance is handed to both the agent and the training loop, which read
    the attributes below directly.
    '''
    def __init__(self):
        # Read by the entry point when calling plot_rewards(..., algo=cfg.algo);
        # without it that call raised AttributeError after training finished.
        # NOTE(review): presumably only used as a label in the plot - confirm
        # against common.plot.
        self.algo = 'DDPG'
        self.gamma = 0.99 # stored by the agent as self.gamma
        self.critic_lr = 1e-3 # learning rate of the critic's Adam optimizer
        self.actor_lr = 1e-4 # learning rate of the actor's Adam optimizer
        self.memory_capacity = 10000 # capacity passed to ReplayBuffer
        self.batch_size = 128 # stored by the agent as self.batch_size
        self.train_eps =300 # number of training episodes
        self.train_steps = 200 # upper bound on steps per training episode
        # NOTE(review): the next three are not read anywhere in the visible code.
        self.eval_eps = 200
        self.eval_steps = 200
        self.target_update = 4
        self.hidden_dim = 30 # hidden width passed to Actor and Critic
        self.soft_tau=1e-2 # interpolation weight of the soft target update
        # Use the GPU when one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def train(cfg,env,agent):
print('Start to train ! ')
ou_noise = OUNoise(env.action_space) # action noise
rewards = []
moving_average_rewards = []
ma_rewards = [] # moving average rewards
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.train_eps+1):
for i_episode in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
ep_reward = 0
for i_step in range(1, cfg.train_steps+1):
action = agent.select_action(state)
for i_step in range(cfg.train_steps):
action = agent.choose_action(state)
action = ou_noise.get_action(
action, i_step) # 即paper中的random process
next_state, reward, done, _ = env.step(action)
@@ -82,80 +68,25 @@ def train(cfg):
state = next_state
if done:
break
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'n_steps:', i_step)
print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done))
ep_steps.append(i_step)
rewards.append(ep_reward)
if i_episode == 1:
moving_average_rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
ma_rewards.append(ep_reward)
print('Complete training')
''' 保存模型 '''
if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹
os.mkdir(SAVED_MODEL_PATH)
agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
'''存储reward等相关结果'''
if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹
os.mkdir(RESULT_PATH)
np.save(RESULT_PATH+'rewards_train.npy', rewards)
np.save(RESULT_PATH+'moving_average_rewards_train.npy', moving_average_rewards)
np.save(RESULT_PATH+'steps_train.npy', ep_steps)
def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
print('start to eval ! \n')
env = NormalizedActions(gym.make("Pendulum-v0"))
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states, n_actions, critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
agent.load_model(saved_model_path+'checkpoint.pth')
rewards = []
moving_average_rewards = []
ep_steps = []
log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
writer = SummaryWriter(log_dir)
for i_episode in range(1, cfg.eval_eps+1):
state = env.reset() # reset环境状态
ep_reward = 0
for i_step in range(1, cfg.eval_steps+1):
action = agent.select_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action) # 更新环境参数
ep_reward += reward
state = next_state # 跳转到下一个状态
if done:
break
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'n_steps:', i_step, 'done: ', done)
ep_steps.append(i_step)
rewards.append(ep_reward)
# 计算滑动窗口的reward
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
writer.add_scalar('steps_of_each_episode',
ep_steps[-1], i_episode)
writer.close()
'''存储reward等相关结果'''
if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹
os.mkdir(RESULT_PATH)
np.save(RESULT_PATH+'rewards_eval.npy', rewards)
np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
np.save(RESULT_PATH+'steps_eval.npy', ep_steps)
return rewards,ma_rewards
if __name__ == "__main__":
cfg = get_args()
if cfg.train:
train(cfg)
eval(cfg)
else:
model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
eval(cfg,saved_model_path=model_path)
cfg = DDPGConfig()
env = NormalizedActions(gym.make("Pendulum-v0"))
env.seed(1) # 设置env随机种子
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
rewards,ma_rewards = train(cfg,env,agent)
agent.save(path=SAVED_MODEL_PATH)
save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH)
plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH)

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-13 00:29:45
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:
    '''Fixed-capacity store of transitions with uniform random sampling.

    Once ``capacity`` entries are held, each new transition overwrites the
    oldest one.
    '''

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # index the next transition is written to

    def push(self, state, action, reward, next_state, done):
        '''Store one transition, overwriting the oldest entry when full.'''
        transition = (state, action, reward, next_state, done)
        slot = self.position
        if len(self.buffer) < self.capacity:
            # Still growing: add a placeholder so that ``slot`` is a valid index.
            self.buffer.append(None)
        self.buffer[slot] = transition
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        '''Draw ``batch_size`` distinct transitions and stack them field by field.

        Returns a 5-tuple of arrays (states, actions, rewards, next states,
        dones). ``random.sample`` raises ValueError when ``batch_size`` exceeds
        the number of stored transitions.
        '''
        chosen = random.sample(self.buffer, batch_size)
        columns = zip(*chosen)
        states, actions, rewards, next_states, dones = (np.stack(column) for column in columns)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        '''Number of transitions currently stored.'''
        return len(self.buffer)

View File

@@ -1,50 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:03:59
@LastEditor: John
LastEditTime: 2020-08-22 19:09:54
@Discription:
@Environment: python 3.7.7
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Critic(nn.Module):
    '''Three-layer MLP that maps a (state, action) pair to one scalar per row.'''

    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
        '''Build the layers.

        The first layer takes ``n_obs + n_actions`` inputs because state and
        action are concatenated in ``forward``. The output layer's weights and
        bias are re-drawn uniformly from ``[-init_w, init_w]``.
        '''
        super().__init__()
        self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)
        # Start the output layer from small random values.
        for tensor in (self.linear3.weight, self.linear3.bias):
            tensor.data.uniform_(-init_w, init_w)

    def forward(self, state, action):
        '''Return the network output for ``state`` and ``action``.

        The two inputs are concatenated along dimension 1 before the first
        layer; both hidden layers use ReLU and the output layer is linear.
        '''
        joint = torch.cat((state, action), dim=1)
        hidden = F.relu(self.linear1(joint))
        hidden = F.relu(self.linear2(hidden))
        return self.linear3(hidden)
class Actor(nn.Module):
    '''Three-layer MLP that maps a state to an action in ``[-1, 1]``.'''

    def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
        '''Build the layers.

        The output layer's weights and bias are re-drawn uniformly from
        ``[-init_w, init_w]``.
        '''
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(n_obs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, n_actions)

        # Start the output layer from small random values.
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)

    def forward(self, x):
        '''Return the action for state ``x``; tanh bounds each component to [-1, 1].'''
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        # torch.tanh replaces the deprecated F.tanh, which emits a warning on
        # every call in the PyTorch release this project targets.
        x = torch.tanh(self.linear3(x))
        return x

View File

@@ -1,39 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:59
@LastEditor: John
@LastEditTime: 2020-06-11 20:59:20
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
class OUNoise(object):
    '''Ornstein-Uhlenbeck noise process used to perturb actions.

    Keeps a state vector ``obs`` with one component per action dimension.
    Each step moves it towards ``mu`` (scaled by ``theta``) and adds Gaussian
    noise (scaled by ``sigma``).
    '''

    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        '''Record the parameters, read the bounds of ``action_space`` and reset.'''
        self.mu, self.theta = mu, theta
        # sigma starts at max_sigma and is rescheduled inside get_action.
        self.sigma = max_sigma
        self.max_sigma, self.min_sigma = max_sigma, min_sigma
        self.decay_period = decay_period
        self.n_actions = action_space.shape[0]
        self.low, self.high = action_space.low, action_space.high
        self.reset()

    def reset(self):
        '''Return the state to ``mu`` in every component.'''
        baseline = np.ones(self.n_actions)
        self.obs = baseline * self.mu

    def evolve_obs(self):
        '''Take one step of the process and return the updated state.'''
        current = self.obs
        gaussian = np.random.randn(self.n_actions)
        step = self.theta * (self.mu - current) + self.sigma * gaussian
        self.obs = current + step
        return self.obs

    def get_action(self, action, t=0):
        '''Add the next noise sample to ``action`` and clip to the action bounds.

        After the sample is drawn, ``sigma`` is set for step ``t`` by linear
        interpolation from ``max_sigma`` to ``min_sigma`` over ``decay_period``.
        '''
        sample = self.evolve_obs()
        fraction = min(1.0, t / self.decay_period)
        sigma_span = self.max_sigma - self.min_sigma
        self.sigma = self.max_sigma - sigma_span * fraction
        return np.clip(action + sample, self.low, self.high)

View File

@@ -1,46 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
LastEditTime: 2020-10-15 21:32:05
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
def plot_results(item, ylabel='rewards_train', save_fig=True):
    '''Plot a per-episode series with seaborn styling and optionally save it.

    Args:
        item: sequence of values, one per episode; plotted against its index.
        ylabel: y-axis label; also used in the title and as the file name
            (``<ylabel>.png``) of the saved figure.
        save_fig: when true, write the figure into the ``result`` directory
            next to this file before showing it.
    '''
    sns.set()
    plt.figure()
    plt.plot(np.arange(len(item)), item)
    plt.title(ylabel+' of DDPG')
    plt.ylabel(ylabel)
    plt.xlabel('episodes')
    if save_fig:
        # abspath first: when the file is run as a script __file__ can be a
        # bare relative name, dirname() is then '' and the target would become
        # '/result/...' at the filesystem root.
        result_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "result")
        # savefig does not create missing directories.
        os.makedirs(result_dir, exist_ok=True)
        plt.savefig(os.path.join(result_dir, ylabel+".png"))
    plt.show()
if __name__ == "__main__":
    # Re-plot the arrays stored in the result/ directory next to this file:
    # rewards, moving-average rewards and steps, first for 'train', then 'eval'.
    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
    for tag in ('train', 'eval'):
        # Load all three series for this tag before drawing any of them.
        rewards = np.load(output_path+"rewards_"+tag+".npy")
        moving_average_rewards = np.load(output_path+"moving_average_rewards_"+tag+".npy")
        steps = np.load(output_path+"steps_"+tag+".npy")
        plot_results(rewards, ylabel='rewards_'+tag)
        plot_results(moving_average_rewards, ylabel='moving_average_rewards_'+tag)
        plot_results(steps, ylabel='steps_'+tag)

Binary file not shown.

Before

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 74 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 56 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

View File

@@ -1,21 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-10-15 21:31:19
LastEditor: John
LastEditTime: 2020-10-15 21:31:25
Discription:
Environment:
'''
import datetime
import os

import numpy as np
# Timestamp identifying this run. RESULT_PATH used SEQUENCE without it ever
# being defined in this module, so importing the module raised NameError.
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'


def save_results(rewards,moving_average_rewards,ep_steps,path=RESULT_PATH):
    '''Save the training curves as ``.npy`` files inside ``path``.

    Args:
        rewards: per-episode rewards, written to ``rewards_train.npy``.
        moving_average_rewards: smoothed rewards, written to
            ``moving_average_rewards_train.npy``.
        ep_steps: per-episode step counts, written to ``steps_train.npy``.
        path: target directory; created together with any missing parents.
            Defaults to a timestamped directory under ``result/`` next to
            this file.
    '''
    # makedirs instead of mkdir: the default path nests a timestamp directory
    # under result/, which may not exist yet either.
    os.makedirs(path, exist_ok=True)
    # Write into the requested directory; previously the files always went to
    # RESULT_PATH and the ``path`` argument was ignored.
    np.save(os.path.join(path, 'rewards_train.npy'), rewards)
    np.save(os.path.join(path, 'moving_average_rewards_train.npy'), moving_average_rewards)
    np.save(os.path.join(path, 'steps_train.npy'), ep_steps)