update codes

This commit is contained in:
johnjim0816
2021-12-21 20:14:13 +08:00
parent 64c319cab4
commit 3b712e8815
71 changed files with 1097 additions and 1340 deletions

View File

@@ -9,22 +9,75 @@ LastEditTime: 2021-09-16 00:55:30
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from common.model import Actor, Critic
from common.memory import ReplayBuffer
import torch.nn.functional as F
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity # 经验回放的容量
self.buffer = [] # 缓冲区
self.position = 0
def push(self, state, action, reward, next_state, done):
''' 缓冲区是一个队列,容量超出时去掉开始存入的转移(transition)
'''
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
return state, action, reward, next_state, done
def __len__(self):
''' 返回当前存储的量
'''
return len(self.buffer)
class Actor(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_states, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = torch.tanh(self.linear3(x))
return x
class Critic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_states + n_actions, hidden_dim)
self.linear2 = nn.Linear(hidden_dim, hidden_dim)
self.linear3 = nn.Linear(hidden_dim, 1)
# 随机初始化为较小的值
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state, action):
# 按维数1拼接
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class DDPG:
def __init__(self, state_dim, action_dim, cfg):
def __init__(self, n_states, n_actions, cfg):
self.device = cfg.device
self.critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
# 复制参数到目标网络
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):

View File

@@ -16,12 +16,10 @@ class NormalizedActions(gym.ActionWrapper):
''' 将action范围重定在[0.1]之间
'''
def action(self, action):
low_bound = self.action_space.low
upper_bound = self.action_space.high
action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
action = np.clip(action, low_bound, upper_bound)
return action
def reverse_action(self, action):

81
codes/DDPG/task0.py Normal file
View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2021-09-16 01:31:33
@Discription:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径sys.path
import datetime
import gym
import torch
from DDPG.env import NormalizedActions
from DDPG.agent import DDPG
from DDPG.train import train,test
from common.utils import save_results,make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
algo_name = 'DDPG' # 算法名称
env_name = 'Pendulum-v1' # 环境名称gym新版本约0.21.0之后中Pendulum-v0改为Pendulum-v1
class DDPGConfig:
def __init__(self):
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 300 # 训练的回合数
self.eval_eps = 50 # 测试的回合数
self.gamma = 0.99 # 折扣因子
self.critic_lr = 1e-3 # 评论家网络的学习率
self.actor_lr = 1e-4 # 演员网络的学习率
self.memory_capacity = 8000 # 经验回放的容量
self.batch_size = 128 # mini-batch SGD中的批量大小
self.target_update = 2 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层维度
self.soft_tau = 1e-2 # 软更新参数
class PlotConfig:
def __init__(self) -> None:
self.algo_name = algo_name # 算法名称
self.env_name = env_name # 环境名称
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
env.seed(seed) # 随机种子
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
return env,agent
cfg = DDPGConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = test(plot_cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果

View File

@@ -1,136 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
LastEditTime: 2021-09-16 01:31:33
@Discription:
@Environment: python 3.7.7
'''
import sys,os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径sys.path
import datetime
import gym
import torch
from DDPG.env import NormalizedActions, OUNoise
from DDPG.agent import DDPG
from common.utils import save_results,make_dir
from common.plot import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class DDPGConfig:
def __init__(self):
self.algo = 'DDPG' # 算法名称
self.env_name = 'Pendulum-v0' # 环境名称
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
self.train_eps = 300 # 训练的回合数
self.eval_eps = 50 # 测试的回合数
self.gamma = 0.99 # 折扣因子
self.critic_lr = 1e-3 # 评论家网络的学习率
self.actor_lr = 1e-4 # 演员网络的学习率
self.memory_capacity = 8000 # 经验回放的容量
self.batch_size = 128 # mini-batch SGD中的批量大小
self.target_update = 2 # 目标网络的更新频率
self.hidden_dim = 256 # 网络隐藏层维度
self.soft_tau = 1e-2 # 软更新参数
class PlotConfig:
def __init__(self) -> None:
self.algo = "DQN" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.result_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/results/' # 保存结果的路径
self.model_path = curr_path+"/outputs/" + self.env_name + \
'/'+curr_time+'/models/' # 保存模型的路径
self.save = True # 是否保存图片
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
def env_agent_config(cfg,seed=1):
env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声
env.seed(seed) # 随机种子
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
agent = DDPG(n_states,n_actions,cfg)
return env,agent
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def eval(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.eval_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成测试!')
return rewards, ma_rewards
if __name__ == "__main__":
cfg = DDPGConfig()
plot_cfg = PlotConfig()
# 训练
env,agent = env_agent_config(cfg,seed=1)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")
# 测试
env,agent = env_agent_config(cfg,seed=10)
agent.load(path=plot_cfg.model_path)
rewards,ma_rewards = eval(plot_cfg,env,agent)
save_results(rewards,ma_rewards,tag = 'eval',path = cfg.result_path)
plot_rewards(rewards,ma_rewards,plot_cfg,tag = "eval")

64
codes/DDPG/train.py Normal file
View File

@@ -0,0 +1,64 @@
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
from DDPG.env import OUNoise
def train(cfg, env, agent):
print('开始训练!')
print(f'环境:{cfg.env_name},算法:{cfg.algo},设备:{cfg.device}')
ou_noise = OUNoise(env.action_space) # 动作噪声
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.train_eps):
state = env.reset()
ou_noise.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
action = ou_noise.get_action(action, i_step)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if (i_ep+1)%10 == 0:
print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
return rewards, ma_rewards
def test(cfg, env, agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
rewards = [] # 记录所有回合的奖励
ma_rewards = [] # 记录所有回合的滑动平均奖励
for i_ep in range(cfg.eval_eps):
state = env.reset()
done = False
ep_reward = 0
i_step = 0
while not done:
i_step += 1
action = agent.choose_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
state = next_state
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.eval_eps},奖励:{ep_reward:.1f}")
print('完成测试!')
return rewards, ma_rewards