update
@@ -34,7 +34,7 @@ class DDPGConfig:
|
|||||||
self.env_name = env_name # 环境名称
|
self.env_name = env_name # 环境名称
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 300 # 训练的回合数
|
self.train_eps = 300 # 训练的回合数
|
||||||
self.eval_eps = 50 # 测试的回合数
|
self.test_eps = 50 # 测试的回合数
|
||||||
self.gamma = 0.99 # 折扣因子
|
self.gamma = 0.99 # 折扣因子
|
||||||
self.critic_lr = 1e-3 # 评论家网络的学习率
|
self.critic_lr = 1e-3 # 评论家网络的学习率
|
||||||
self.actor_lr = 1e-4 # 演员网络的学习率
|
self.actor_lr = 1e-4 # 演员网络的学习率
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ def test(cfg, env, agent):
|
|||||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
||||||
rewards = [] # 记录所有回合的奖励
|
rewards = [] # 记录所有回合的奖励
|
||||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||||
for i_ep in range(cfg.eval_eps):
|
for i_ep in range(cfg.test_eps):
|
||||||
state = env.reset()
|
state = env.reset()
|
||||||
done = False
|
done = False
|
||||||
ep_reward = 0
|
ep_reward = 0
|
||||||
@@ -59,6 +59,6 @@ def test(cfg, env, agent):
|
|||||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||||
else:
|
else:
|
||||||
ma_rewards.append(ep_reward)
|
ma_rewards.append(ep_reward)
|
||||||
print(f"回合:{i_ep+1}/{cfg.eval_eps},奖励:{ep_reward:.1f}")
|
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||||
print('完成测试!')
|
print('完成测试!')
|
||||||
return rewards, ma_rewards
|
return rewards, ma_rewards
|
||||||
@@ -23,7 +23,7 @@ class DQNConfig:
|
|||||||
self.device = torch.device(
|
self.device = torch.device(
|
||||||
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 30 # 测试的回合数
|
self.test_eps = 30 # 测试的回合数
|
||||||
# 超参数
|
# 超参数
|
||||||
self.gamma = 0.95 # 强化学习中的折扣因子
|
self.gamma = 0.95 # 强化学习中的折扣因子
|
||||||
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ class DQNConfig:
|
|||||||
self.device = torch.device(
|
self.device = torch.device(
|
||||||
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 30 # 测试的回合数
|
self.test_eps = 30 # 测试的回合数
|
||||||
# 超参数
|
# 超参数
|
||||||
self.gamma = 0.95 # 强化学习中的折扣因子
|
self.gamma = 0.95 # 强化学习中的折扣因子
|
||||||
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
||||||
|
|||||||
@@ -180,7 +180,7 @@
|
|||||||
" self.algo = \"DQN\" # 算法名称\n",
|
" self.algo = \"DQN\" # 算法名称\n",
|
||||||
" self.env = 'CartPole-v0' # 环境名称\n",
|
" self.env = 'CartPole-v0' # 环境名称\n",
|
||||||
" self.train_eps = 200 # 训练的回合数\n",
|
" self.train_eps = 200 # 训练的回合数\n",
|
||||||
" self.eval_eps = 20 # 测试的回合数\n",
|
" self.test_eps = 20 # 测试的回合数\n",
|
||||||
" self.gamma = 0.95 # 强化学习中的折扣因子\n",
|
" self.gamma = 0.95 # 强化学习中的折扣因子\n",
|
||||||
" self.epsilon_start = 0.90 # e-greedy策略中初始epsilon\n",
|
" self.epsilon_start = 0.90 # e-greedy策略中初始epsilon\n",
|
||||||
" self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n",
|
" self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon\n",
|
||||||
@@ -365,7 +365,7 @@
|
|||||||
" cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n",
|
" cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon\n",
|
||||||
" rewards = [] # 记录所有回合的奖励\n",
|
" rewards = [] # 记录所有回合的奖励\n",
|
||||||
" ma_rewards = [] # 记录所有回合的滑动平均奖励\n",
|
" ma_rewards = [] # 记录所有回合的滑动平均奖励\n",
|
||||||
" for i_ep in range(cfg.eval_eps):\n",
|
" for i_ep in range(cfg.test_eps):\n",
|
||||||
" ep_reward = 0 # 记录一回合内的奖励\n",
|
" ep_reward = 0 # 记录一回合内的奖励\n",
|
||||||
" state = env.reset() # 重置环境,返回初始状态\n",
|
" state = env.reset() # 重置环境,返回初始状态\n",
|
||||||
" while True:\n",
|
" while True:\n",
|
||||||
@@ -381,7 +381,7 @@
|
|||||||
" else:\n",
|
" else:\n",
|
||||||
" ma_rewards.append(ep_reward)\n",
|
" ma_rewards.append(ep_reward)\n",
|
||||||
" if (i_ep+1)%3 == 0: \n",
|
" if (i_ep+1)%3 == 0: \n",
|
||||||
" print(f\"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}\")\n",
|
" print(f\"回合:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}\")\n",
|
||||||
" print('完成测试!')\n",
|
" print('完成测试!')\n",
|
||||||
" return rewards,ma_rewards\n",
|
" return rewards,ma_rewards\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
@Email: johnjim0816@gmail.com
|
@Email: johnjim0816@gmail.com
|
||||||
@Date: 2020-06-12 00:48:57
|
@Date: 2020-06-12 00:48:57
|
||||||
@LastEditor: John
|
@LastEditor: John
|
||||||
LastEditTime: 2021-09-15 15:34:13
|
LastEditTime: 2021-12-22 11:08:04
|
||||||
@Discription:
|
@Discription:
|
||||||
@Environment: python 3.7.7
|
@Environment: python 3.7.7
|
||||||
'''
|
'''
|
||||||
@@ -30,13 +30,13 @@ def train(cfg, env, agent):
|
|||||||
break
|
break
|
||||||
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新
|
||||||
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
agent.target_net.load_state_dict(agent.policy_net.state_dict())
|
||||||
if (i_ep+1)%10 == 0:
|
|
||||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
|
||||||
rewards.append(ep_reward)
|
rewards.append(ep_reward)
|
||||||
if ma_rewards:
|
if ma_rewards:
|
||||||
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||||
else:
|
else:
|
||||||
ma_rewards.append(ep_reward)
|
ma_rewards.append(ep_reward)
|
||||||
|
if (i_ep+1)%10 == 0:
|
||||||
|
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
|
||||||
print('完成训练!')
|
print('完成训练!')
|
||||||
return rewards, ma_rewards
|
return rewards, ma_rewards
|
||||||
|
|
||||||
@@ -48,7 +48,7 @@ def test(cfg,env,agent):
|
|||||||
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
|
||||||
rewards = [] # 记录所有回合的奖励
|
rewards = [] # 记录所有回合的奖励
|
||||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||||
for i_ep in range(cfg.eval_eps):
|
for i_ep in range(cfg.test_eps):
|
||||||
ep_reward = 0 # 记录一回合内的奖励
|
ep_reward = 0 # 记录一回合内的奖励
|
||||||
state = env.reset() # 重置环境,返回初始状态
|
state = env.reset() # 重置环境,返回初始状态
|
||||||
while True:
|
while True:
|
||||||
@@ -63,7 +63,7 @@ def test(cfg,env,agent):
|
|||||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||||
else:
|
else:
|
||||||
ma_rewards.append(ep_reward)
|
ma_rewards.append(ep_reward)
|
||||||
print(f"回合:{i_ep+1}/{cfg.eval_eps},奖励:{ep_reward:.1f}")
|
print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
|
||||||
print('完成测试!')
|
print('完成测试!')
|
||||||
return rewards,ma_rewards
|
return rewards,ma_rewards
|
||||||
|
|
||||||
@@ -89,7 +89,7 @@ if __name__ == "__main__":
|
|||||||
self.env_name = 'CartPole-v0' # 环境名称
|
self.env_name = 'CartPole-v0' # 环境名称
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 30 # 测试的回合数
|
self.test_eps = 30 # 测试的回合数
|
||||||
# 超参数
|
# 超参数
|
||||||
self.gamma = 0.95 # 强化学习中的折扣因子
|
self.gamma = 0.95 # 强化学习中的折扣因子
|
||||||
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
self.epsilon_start = 0.90 # e-greedy策略中初始epsilon
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class PPOConfig:
|
|||||||
self.continuous = False # 环境是否为连续动作
|
self.continuous = False # 环境是否为连续动作
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 20 # 测试的回合数
|
self.test_eps = 20 # 测试的回合数
|
||||||
self.batch_size = 5
|
self.batch_size = 5
|
||||||
self.gamma=0.99
|
self.gamma=0.99
|
||||||
self.n_epochs = 4
|
self.n_epochs = 4
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class PPOConfig:
|
|||||||
self.continuous = True # 环境是否为连续动作
|
self.continuous = True # 环境是否为连续动作
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 20 # 测试的回合数
|
self.test_eps = 20 # 测试的回合数
|
||||||
self.batch_size = 5
|
self.batch_size = 5
|
||||||
self.gamma=0.99
|
self.gamma=0.99
|
||||||
self.n_epochs = 4
|
self.n_epochs = 4
|
||||||
|
|||||||
@@ -68,7 +68,7 @@
|
|||||||
" self.result_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/results/' # path to save results\n",
|
" self.result_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/results/' # path to save results\n",
|
||||||
" self.model_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/models/' # path to save models\n",
|
" self.model_path = curr_path+\"/results/\" +self.env+'/'+curr_time+'/models/' # path to save models\n",
|
||||||
" self.train_eps = 200 # max training episodes\n",
|
" self.train_eps = 200 # max training episodes\n",
|
||||||
" self.eval_eps = 50\n",
|
" self.test_eps = 50\n",
|
||||||
" self.batch_size = 5\n",
|
" self.batch_size = 5\n",
|
||||||
" self.gamma=0.99\n",
|
" self.gamma=0.99\n",
|
||||||
" self.n_epochs = 4\n",
|
" self.n_epochs = 4\n",
|
||||||
@@ -144,7 +144,7 @@
|
|||||||
" print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n",
|
" print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')\n",
|
||||||
" rewards= []\n",
|
" rewards= []\n",
|
||||||
" ma_rewards = [] # moving average rewards\n",
|
" ma_rewards = [] # moving average rewards\n",
|
||||||
" for i_ep in range(cfg.eval_eps):\n",
|
" for i_ep in range(cfg.test_eps):\n",
|
||||||
" state = env.reset()\n",
|
" state = env.reset()\n",
|
||||||
" done = False\n",
|
" done = False\n",
|
||||||
" ep_reward = 0\n",
|
" ep_reward = 0\n",
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ def eval(cfg,env,agent):
|
|||||||
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
||||||
rewards = [] # 记录所有回合的奖励
|
rewards = [] # 记录所有回合的奖励
|
||||||
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
ma_rewards = [] # 记录所有回合的滑动平均奖励
|
||||||
for i_ep in range(cfg.eval_eps):
|
for i_ep in range(cfg.test_eps):
|
||||||
state = env.reset()
|
state = env.reset()
|
||||||
done = False
|
done = False
|
||||||
ep_reward = 0
|
ep_reward = 0
|
||||||
@@ -47,7 +47,7 @@ def eval(cfg,env,agent):
|
|||||||
0.9*ma_rewards[-1]+0.1*ep_reward)
|
0.9*ma_rewards[-1]+0.1*ep_reward)
|
||||||
else:
|
else:
|
||||||
ma_rewards.append(ep_reward)
|
ma_rewards.append(ep_reward)
|
||||||
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.eval_eps, ep_reward))
|
print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.test_eps, ep_reward))
|
||||||
print('完成训练!')
|
print('完成训练!')
|
||||||
return rewards,ma_rewards
|
return rewards,ma_rewards
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ if __name__ == '__main__':
|
|||||||
self.continuous = False # 环境是否为连续动作
|
self.continuous = False # 环境是否为连续动作
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
self.train_eps = 200 # 训练的回合数
|
self.train_eps = 200 # 训练的回合数
|
||||||
self.eval_eps = 20 # 测试的回合数
|
self.test_eps = 20 # 测试的回合数
|
||||||
self.batch_size = 5
|
self.batch_size = 5
|
||||||
self.gamma=0.99
|
self.gamma=0.99
|
||||||
self.n_epochs = 4
|
self.n_epochs = 4
|
||||||
|
|||||||
@@ -5,21 +5,22 @@ Author: John
|
|||||||
Email: johnjim0816@gmail.com
|
Email: johnjim0816@gmail.com
|
||||||
Date: 2021-03-23 16:35:58
|
Date: 2021-03-23 16:35:58
|
||||||
LastEditor: John
|
LastEditor: John
|
||||||
LastEditTime: 2021-03-23 16:36:20
|
LastEditTime: 2021-12-21 23:21:26
|
||||||
Discription:
|
Discription:
|
||||||
Environment:
|
Environment:
|
||||||
'''
|
'''
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
class MLP(nn.Module):
|
class MLP(nn.Module):
|
||||||
|
|
||||||
''' 多层感知机
|
''' 多层感知机
|
||||||
输入:state维度
|
输入:state维度
|
||||||
输出:概率
|
输出:概率
|
||||||
'''
|
'''
|
||||||
def __init__(self,state_dim,hidden_dim = 36):
|
def __init__(self,input_dim,hidden_dim = 36):
|
||||||
super(MLP, self).__init__()
|
super(MLP, self).__init__()
|
||||||
# 24和36为hidden layer的层数,可根据state_dim, action_dim的情况来改变
|
# 24和36为hidden layer的层数,可根据input_dim, action_dim的情况来改变
|
||||||
self.fc1 = nn.Linear(state_dim, hidden_dim)
|
self.fc1 = nn.Linear(input_dim, hidden_dim)
|
||||||
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
|
self.fc2 = nn.Linear(hidden_dim,hidden_dim)
|
||||||
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
|
self.fc3 = nn.Linear(hidden_dim, 1) # Prob of Left
|
||||||
|
|
||||||
|
|||||||
@@ -34,7 +34,7 @@ class PGConfig:
|
|||||||
self.model_path = curr_path+"/outputs/" + self.env + \
|
self.model_path = curr_path+"/outputs/" + self.env + \
|
||||||
'/'+curr_time+'/models/' # 保存模型的路径
|
'/'+curr_time+'/models/' # 保存模型的路径
|
||||||
self.train_eps = 300 # 训练的回合数
|
self.train_eps = 300 # 训练的回合数
|
||||||
self.eval_eps = 30 # 测试的回合数
|
self.test_eps = 30 # 测试的回合数
|
||||||
self.batch_size = 8
|
self.batch_size = 8
|
||||||
self.lr = 0.01 # 学习率
|
self.lr = 0.01 # 学习率
|
||||||
self.gamma = 0.99
|
self.gamma = 0.99
|
||||||
@@ -94,7 +94,7 @@ def eval(cfg,env,agent):
|
|||||||
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
|
print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
|
||||||
rewards = []
|
rewards = []
|
||||||
ma_rewards = []
|
ma_rewards = []
|
||||||
for i_ep in range(cfg.eval_eps):
|
for i_ep in range(cfg.test_eps):
|
||||||
state = env.reset()
|
state = env.reset()
|
||||||
ep_reward = 0
|
ep_reward = 0
|
||||||
for _ in count():
|
for _ in count():
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ Author: John
|
|||||||
Email: johnjim0816@gmail.com
|
Email: johnjim0816@gmail.com
|
||||||
Date: 2020-09-11 23:03:00
|
Date: 2020-09-11 23:03:00
|
||||||
LastEditor: John
|
LastEditor: John
|
||||||
LastEditTime: 2021-09-19 23:05:45
|
LastEditTime: 2021-12-22 10:54:57
|
||||||
Discription: use defaultdict to define Q table
|
Discription: use defaultdict to define Q table
|
||||||
Environment:
|
Environment:
|
||||||
'''
|
'''
|
||||||
@@ -15,17 +15,17 @@ import torch
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
class QLearning(object):
|
class QLearning(object):
|
||||||
def __init__(self,state_dim,
|
def __init__(self,n_states,
|
||||||
action_dim,cfg):
|
n_actions,cfg):
|
||||||
self.action_dim = action_dim # dimension of acgtion
|
self.n_actions = n_actions
|
||||||
self.lr = cfg.lr # learning rate
|
self.lr = cfg.lr # 学习率
|
||||||
self.gamma = cfg.gamma
|
self.gamma = cfg.gamma
|
||||||
self.epsilon = 0
|
self.epsilon = 0
|
||||||
self.sample_count = 0
|
self.sample_count = 0
|
||||||
self.epsilon_start = cfg.epsilon_start
|
self.epsilon_start = cfg.epsilon_start
|
||||||
self.epsilon_end = cfg.epsilon_end
|
self.epsilon_end = cfg.epsilon_end
|
||||||
self.epsilon_decay = cfg.epsilon_decay
|
self.epsilon_decay = cfg.epsilon_decay
|
||||||
self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value)
|
self.Q_table = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值(Q值)的映射,即Q表
|
||||||
def choose_action(self, state):
|
def choose_action(self, state):
|
||||||
self.sample_count += 1
|
self.sample_count += 1
|
||||||
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
|
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
|
||||||
@@ -34,7 +34,7 @@ class QLearning(object):
|
|||||||
if np.random.uniform(0, 1) > self.epsilon:
|
if np.random.uniform(0, 1) > self.epsilon:
|
||||||
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
|
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
|
||||||
else:
|
else:
|
||||||
action = np.random.choice(self.action_dim) # 随机选择动作
|
action = np.random.choice(self.n_actions) # 随机选择动作
|
||||||
return action
|
return action
|
||||||
def predict(self,state):
|
def predict(self,state):
|
||||||
action = np.argmax(self.Q_table[str(state)])
|
action = np.argmax(self.Q_table[str(state)])
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 49 KiB |
|
Before Width: | Height: | Size: 48 KiB |
|
Before Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 31 KiB |
|
Before Width: | Height: | Size: 42 KiB |
|
After Width: | Height: | Size: 24 KiB |
|
After Width: | Height: | Size: 38 KiB |
|
After Width: | Height: | Size: 13 KiB |
|
After Width: | Height: | Size: 18 KiB |
386
codes/QLearning/task0.ipynb
Normal file
93
codes/QLearning/task0.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
Author: John
|
||||||
|
Email: johnjim0816@gmail.com
|
||||||
|
Date: 2020-09-11 23:03:00
|
||||||
|
LastEditor: John
|
||||||
|
LastEditTime: 2021-12-22 11:13:23
|
||||||
|
Discription:
|
||||||
|
Environment:
|
||||||
|
'''
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
|
||||||
|
parent_path = os.path.dirname(curr_path) # 父路径
|
||||||
|
sys.path.append(parent_path) # 添加路径到系统路径
|
||||||
|
|
||||||
|
import gym
|
||||||
|
import torch
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
from envs.gridworld_env import CliffWalkingWapper
|
||||||
|
from QLearning.agent import QLearning
|
||||||
|
from QLearning.train import train,test
|
||||||
|
from common.utils import plot_rewards,plot_rewards_cn
|
||||||
|
from common.utils import save_results,make_dir
|
||||||
|
|
||||||
|
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
||||||
|
algo_name = 'Q-learning' # 算法名称
|
||||||
|
env_name = 'CliffWalking-v0' # 环境名称
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
||||||
|
class QlearningConfig:
    '''Training-related settings for the Q-learning run.'''

    def __init__(self):
        # Run identification, copied from the module-level settings.
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.device = device  # torch device chosen at module level (CUDA if available, else CPU)

        # Number of episodes for each phase.
        self.train_eps = 400  # training episodes
        self.test_eps = 30  # testing episodes

        # Learning hyperparameters.
        self.gamma = 0.9  # reward discount factor
        self.lr = 0.1  # learning rate

        # Epsilon-greedy exploration schedule.
        self.epsilon_start = 0.95  # initial epsilon
        self.epsilon_end = 0.01  # final epsilon
        self.epsilon_decay = 300  # decay constant of epsilon (presumably in sampling steps; confirm in the agent)
|
||||||
|
class PlotConfig:
    '''Settings used when saving and plotting results.'''

    def __init__(self) -> None:
        self.algo_name = algo_name  # algorithm name
        self.env_name = env_name  # environment name
        self.device = device  # torch device chosen at module level (CUDA if available, else CPU)

        # Both output folders live under <curr_path>/outputs/<env_name>/<curr_time>/.
        run_dir = curr_path + "/outputs/" + self.env_name + '/' + curr_time
        self.result_path = run_dir + '/results/'  # where results are saved
        self.model_path = run_dir + '/models/'  # where models are saved
        self.save = True  # whether to save the figures
|
||||||
|
|
||||||
|
def env_agent_config(cfg,seed=1):
    '''Create the environment and the agent.

    Args:
        cfg: config object; ``cfg.env_name`` is passed to ``gym.make`` and the
            whole object is handed to the ``QLearning`` constructor.
        seed (int, optional): value passed to ``env.seed``. Defaults to 1.

    Returns:
        tuple: ``(env, agent)`` - the wrapped environment and the agent built
        from its observation/action space sizes.
    '''
    env = CliffWalkingWapper(gym.make(cfg.env_name))
    env.seed(seed)  # set the random seed
    # Both spaces are read through ``.n``: the number of states and of actions.
    agent = QLearning(env.observation_space.n, env.action_space.n, cfg)
    return env,agent
|
||||||
|
|
||||||
|
cfg = QlearningConfig()
plot_cfg = PlotConfig()

# ---- Training ----
env, agent = env_agent_config(cfg, seed=1)
rewards, ma_rewards = train(cfg, env, agent)
# Create the result and model folders before anything is written to them.
make_dir(plot_cfg.result_path, plot_cfg.model_path)
agent.save(path=plot_cfg.model_path)  # save the model
save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)  # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot the results

# ---- Testing ----
# A fresh environment/agent pair with a different seed; the trained model is loaded into it.
env, agent = env_agent_config(cfg, seed=10)
agent.load(path=plot_cfg.model_path)  # load the model
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save the results
plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot the results
|
||||||
|
|
||||||
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# coding=utf-8
|
|
||||||
'''
|
|
||||||
Author: John
|
|
||||||
Email: johnjim0816@gmail.com
|
|
||||||
Date: 2020-09-11 23:03:00
|
|
||||||
LastEditor: John
|
|
||||||
LastEditTime: 2021-09-23 12:22:58
|
|
||||||
Discription:
|
|
||||||
Environment:
|
|
||||||
'''
|
|
||||||
import sys,os
|
|
||||||
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前路径
|
|
||||||
parent_path=os.path.dirname(curr_path) # 父路径,这里就是我们的项目路径
|
|
||||||
sys.path.append(parent_path) # 由于需要引用项目路径下的其他模块比如envs,所以需要添加路径到sys.path
|
|
||||||
|
|
||||||
import gym
|
|
||||||
import torch
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
from envs.gridworld_env import CliffWalkingWapper
|
|
||||||
from QLearning.agent import QLearning
|
|
||||||
from common.plot import plot_rewards,plot_rewards_cn
|
|
||||||
from common.utils import save_results,make_dir
|
|
||||||
|
|
||||||
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
|
|
||||||
class QlearningConfig:
|
|
||||||
'''训练相关参数'''
|
|
||||||
def __init__(self):
|
|
||||||
self.algo = 'Q-learning' # 算法名称
|
|
||||||
self.env = 'CliffWalking-v0' # 环境名称
|
|
||||||
self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # 保存结果的路径
|
|
||||||
self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # 保存模型的路径
|
|
||||||
self.train_eps = 400 # 训练的回合数
|
|
||||||
self.eval_eps = 30 # 测试的回合数
|
|
||||||
self.gamma = 0.9 # reward的衰减率
|
|
||||||
self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
|
|
||||||
self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
|
|
||||||
self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
|
|
||||||
self.lr = 0.1 # 学习率
|
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
|
|
||||||
|
|
||||||
|
|
||||||
def env_agent_config(cfg,seed=1):
|
|
||||||
env = gym.make(cfg.env)
|
|
||||||
env = CliffWalkingWapper(env)
|
|
||||||
env.seed(seed) # 设置随机种子
|
|
||||||
state_dim = env.observation_space.n # 状态维度
|
|
||||||
action_dim = env.action_space.n # 动作维度
|
|
||||||
agent = QLearning(state_dim,action_dim,cfg)
|
|
||||||
return env,agent
|
|
||||||
|
|
||||||
def train(cfg,env,agent):
|
|
||||||
print('开始训练!')
|
|
||||||
print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
|
||||||
rewards = [] # 记录奖励
|
|
||||||
ma_rewards = [] # 记录滑动平均奖励
|
|
||||||
for i_ep in range(cfg.train_eps):
|
|
||||||
ep_reward = 0 # 记录每个回合的奖励
|
|
||||||
state = env.reset() # 重置环境,即开始新的回合
|
|
||||||
while True:
|
|
||||||
action = agent.choose_action(state) # 根据算法选择一个动作
|
|
||||||
next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
|
|
||||||
print(reward)
|
|
||||||
agent.update(state, action, reward, next_state, done) # Q学习算法更新
|
|
||||||
state = next_state # 更新状态
|
|
||||||
ep_reward += reward
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
if ma_rewards:
|
|
||||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
|
||||||
else:
|
|
||||||
ma_rewards.append(ep_reward)
|
|
||||||
print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
|
|
||||||
print('完成训练!')
|
|
||||||
return rewards,ma_rewards
|
|
||||||
|
|
||||||
def eval(cfg,env,agent):
|
|
||||||
print('开始测试!')
|
|
||||||
print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
|
|
||||||
for item in agent.Q_table.items():
|
|
||||||
print(item)
|
|
||||||
rewards = [] # 记录所有回合的奖励
|
|
||||||
ma_rewards = [] # 滑动平均的奖励
|
|
||||||
for i_ep in range(cfg.eval_eps):
|
|
||||||
ep_reward = 0 # 记录每个episode的reward
|
|
||||||
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
|
|
||||||
while True:
|
|
||||||
action = agent.predict(state) # 根据算法选择一个动作
|
|
||||||
next_state, reward, done, _ = env.step(action) # 与环境进行一个交互
|
|
||||||
state = next_state # 更新状态
|
|
||||||
ep_reward += reward
|
|
||||||
if done:
|
|
||||||
break
|
|
||||||
rewards.append(ep_reward)
|
|
||||||
if ma_rewards:
|
|
||||||
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
|
||||||
else:
|
|
||||||
ma_rewards.append(ep_reward)
|
|
||||||
print(f"回合数:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}")
|
|
||||||
print('完成测试!')
|
|
||||||
return rewards,ma_rewards
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
cfg = QlearningConfig()
|
|
||||||
|
|
||||||
# 训练
|
|
||||||
env,agent = env_agent_config(cfg,seed=0)
|
|
||||||
rewards,ma_rewards = train(cfg,env,agent)
|
|
||||||
make_dir(cfg.result_path,cfg.model_path) # 创建文件夹
|
|
||||||
agent.save(path=cfg.model_path) # 保存模型
|
|
||||||
for item in agent.Q_table.items():
|
|
||||||
print(item)
|
|
||||||
save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) # 保存结果
|
|
||||||
plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
|
|
||||||
|
|
||||||
# # 测试
|
|
||||||
env,agent = env_agent_config(cfg,seed=10)
|
|
||||||
agent.load(path=cfg.model_path) # 加载模型
|
|
||||||
rewards,ma_rewards = eval(cfg,env,agent)
|
|
||||||
|
|
||||||
save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
|
|
||||||
plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
|
|
||||||
|
|
||||||
|
|
||||||
51
codes/QLearning/train.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
def train(cfg,env,agent):
    '''Run the training loop and record the reward of every episode.

    Args:
        cfg: config object; reads ``env_name``, ``algo_name`` and ``device``
            (for the banner only) and ``train_eps`` (number of episodes).
        env: environment with ``reset()`` returning the initial state and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        agent: agent with ``choose_action(state)`` and
            ``update(state, action, reward, next_state, done)``.

    Returns:
        tuple: ``(rewards, ma_rewards)``, two lists with one entry per episode:
        the episode reward and its moving average
        (``0.9 * previous + 0.1 * current``; the first entry is the raw reward).
    '''
    print('开始训练!')
    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
    rewards = []  # reward of each episode
    ma_rewards = []  # moving-average reward of each episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated in this episode
        state = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.choose_action(state)  # pick an action with the agent's policy
            next_state, reward, done, _ = env.step(action)  # one interaction with the environment
            agent.update(state, action, reward, next_state, done)  # Q-learning update
            state = next_state  # move on to the next state
            ep_reward += reward
            if done:
                break
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
        else:
            ma_rewards.append(ep_reward)
        # Report progress on every 10th episode only, to keep the log short.
        if (i_ep+1) % 10 == 0:
            print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
    print('完成训练!')
    return rewards,ma_rewards
|
||||||
|
|
||||||
|
def test(cfg,env,agent):
|
||||||
|
print('开始测试!')
|
||||||
|
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
|
||||||
|
for item in agent.Q_table.items():
|
||||||
|
print(item)
|
||||||
|
rewards = [] # 记录所有回合的奖励
|
||||||
|
ma_rewards = [] # 滑动平均的奖励
|
||||||
|
for i_ep in range(cfg.test_eps):
|
||||||
|
ep_reward = 0 # 记录每个episode的reward
|
||||||
|
state = env.reset() # 重置环境, 重新开一局(即开始新的一个回合)
|
||||||
|
while True:
|
||||||
|
action = agent.predict(state) # 根据算法选择一个动作
|
||||||
|
next_state, reward, done, _ = env.step(action) # 与环境进行一个交互
|
||||||
|
state = next_state # 更新状态
|
||||||
|
ep_reward += reward
|
||||||
|
if done:
|
||||||
|
break
|
||||||
|
rewards.append(ep_reward)
|
||||||
|
if ma_rewards:
|
||||||
|
ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
|
||||||
|
else:
|
||||||
|
ma_rewards.append(ep_reward)
|
||||||
|
print(f"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}")
|
||||||
|
print('完成测试!')
|
||||||
|
return rewards,ma_rewards
|
||||||
@@ -13,6 +13,7 @@
|
|||||||
|
|
||||||
其中```model.py```,```memory.py```,```plot.py``` 由于不同算法都会用到,所以放入```common```文件夹中。
|
其中```model.py```,```memory.py```,```plot.py``` 由于不同算法都会用到,所以放入```common```文件夹中。
|
||||||
|
|
||||||
|
**注意:新版本中将```model```,```memory```相关内容全部放到了```agent.py```里面,```plot```放到了```common.utils```中。**
|
||||||
## 运行环境
|
## 运行环境
|
||||||
|
|
||||||
python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
|
python 3.7、pytorch 1.6.0-1.8.1、gym 0.17.0-0.19.0
|
||||||
|
|||||||
@@ -45,7 +45,7 @@
|
|||||||
" self.model_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/models/' # path to save models\n",
|
" self.model_path = curr_path+\"/outputs/\" +self.env+'/'+curr_time+'/models/' # path to save models\n",
|
||||||
" self.train_eps = 300\n",
|
" self.train_eps = 300\n",
|
||||||
" self.train_steps = 500\n",
|
" self.train_steps = 500\n",
|
||||||
" self.eval_eps = 50\n",
|
" self.test_eps = 50\n",
|
||||||
" self.eval_steps = 500\n",
|
" self.eval_steps = 500\n",
|
||||||
" self.gamma = 0.99\n",
|
" self.gamma = 0.99\n",
|
||||||
" self.mean_lambda=1e-3\n",
|
" self.mean_lambda=1e-3\n",
|
||||||
@@ -121,7 +121,7 @@
|
|||||||
" print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')\n",
|
" print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}')\n",
|
||||||
" rewards = []\n",
|
" rewards = []\n",
|
||||||
" ma_rewards = [] # moveing average reward\n",
|
" ma_rewards = [] # moveing average reward\n",
|
||||||
" for i_ep in range(cfg.eval_eps):\n",
|
" for i_ep in range(cfg.test_eps):\n",
|
||||||
" state = env.reset()\n",
|
" state = env.reset()\n",
|
||||||
" ep_reward = 0\n",
|
" ep_reward = 0\n",
|
||||||
" for i_step in range(cfg.eval_steps):\n",
|
" for i_step in range(cfg.eval_steps):\n",
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ class SACConfig:
|
|||||||
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
|
self.model_path = curr_path+"/outputs/" +self.env_name+'/'+curr_time+'/models/' # path to save models
|
||||||
self.train_eps = 300
|
self.train_eps = 300
|
||||||
self.train_steps = 500
|
self.train_steps = 500
|
||||||
self.eval_eps = 50
|
self.test_eps = 50
|
||||||
self.eval_steps = 500
|
self.eval_steps = 500
|
||||||
self.gamma = 0.99
|
self.gamma = 0.99
|
||||||
self.mean_lambda=1e-3
|
self.mean_lambda=1e-3
|
||||||
@@ -96,7 +96,7 @@ def eval(cfg,env,agent):
|
|||||||
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
|
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo}, Device: {cfg.device}')
|
||||||
rewards = []
|
rewards = []
|
||||||
ma_rewards = [] # moveing average reward
|
ma_rewards = [] # moveing average reward
|
||||||
for i_ep in range(cfg.eval_eps):
|
for i_ep in range(cfg.test_eps):
|
||||||
state = env.reset()
|
state = env.reset()
|
||||||
ep_reward = 0
|
ep_reward = 0
|
||||||
for i_step in range(cfg.eval_steps):
|
for i_step in range(cfg.eval_steps):
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class SarsaConfig:
|
|||||||
self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # path to save results
|
self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # path to save results
|
||||||
self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # path to save models
|
self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # path to save models
|
||||||
self.train_eps = 200
|
self.train_eps = 200
|
||||||
self.eval_eps = 50
|
self.test_eps = 50
|
||||||
self.epsilon = 0.15 # epsilon: The probability to select a random action .
|
self.epsilon = 0.15 # epsilon: The probability to select a random action .
|
||||||
self.gamma = 0.9 # gamma: Gamma discount factor.
|
self.gamma = 0.9 # gamma: Gamma discount factor.
|
||||||
self.lr = 0.2 # learning rate: step size parameter
|
self.lr = 0.2 # learning rate: step size parameter
|
||||||
@@ -74,7 +74,7 @@ def train(cfg,env,agent):
|
|||||||
def eval(cfg,env,agent):
|
def eval(cfg,env,agent):
|
||||||
rewards = []
|
rewards = []
|
||||||
ma_rewards = []
|
ma_rewards = []
|
||||||
for i_episode in range(cfg.eval_eps):
|
for i_episode in range(cfg.test_eps):
|
||||||
# Print out which episode we're on, useful for debugging.
|
# Print out which episode we're on, useful for debugging.
|
||||||
# Generate an episode.
|
# Generate an episode.
|
||||||
# An episode is an array of (state, action, reward) tuples
|
# An episode is an array of (state, action, reward) tuples
|
||||||
@@ -94,7 +94,7 @@ def eval(cfg,env,agent):
|
|||||||
ma_rewards.append(ep_reward)
|
ma_rewards.append(ep_reward)
|
||||||
rewards.append(ep_reward)
|
rewards.append(ep_reward)
|
||||||
if (i_episode+1)%10==0:
|
if (i_episode+1)%10==0:
|
||||||
print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.eval_eps,ep_reward))
|
print("Episode:{}/{}: Reward:{}".format(i_episode+1, cfg.test_eps,ep_reward))
|
||||||
print('Complete evaling!')
|
print('Complete evaling!')
|
||||||
return rewards,ma_rewards
|
return rewards,ma_rewards
|
||||||
|
|
||||||
|
|||||||
1
codes/TD3/README.md
Normal file
@@ -0,0 +1 @@
|
|||||||
|
这是对[Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)](https://arxiv.org/abs/1802.09477)的复现
|
||||||
@@ -1,3 +1,13 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding=utf-8
|
||||||
|
'''
|
||||||
|
Author: JiangJi
|
||||||
|
Email: johnjim0816@gmail.com
|
||||||
|
Date: 2021-12-22 10:40:05
|
||||||
|
LastEditor: JiangJi
|
||||||
|
LastEditTime: 2021-12-22 10:43:55
|
||||||
|
Description:
|
||||||
|
'''
|
||||||
import copy
|
import copy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@@ -5,40 +15,41 @@ import torch.nn as nn
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from TD3.memory import ReplayBuffer
|
from TD3.memory import ReplayBuffer
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3)
|
|
||||||
# Paper: https://arxiv.org/abs/1802.09477
|
|
||||||
|
|
||||||
|
|
||||||
class Actor(nn.Module):
|
class Actor(nn.Module):
|
||||||
def __init__(self, state_dim, action_dim, max_action):
|
|
||||||
|
def __init__(self, input_dim, output_dim, max_action):
|
||||||
|
'''Actor (policy) network: maps a state to an action computed as max_action * tanh(...).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_dim (int): 输入维度,这里等于n_states
|
||||||
|
output_dim (int): 输出维度,这里等于n_actions
|
||||||
|
max_action (int): action的最大值
|
||||||
|
'''
|
||||||
super(Actor, self).__init__()
|
super(Actor, self).__init__()
|
||||||
|
|
||||||
self.l1 = nn.Linear(state_dim, 256)
|
self.l1 = nn.Linear(input_dim, 256)
|
||||||
self.l2 = nn.Linear(256, 256)
|
self.l2 = nn.Linear(256, 256)
|
||||||
self.l3 = nn.Linear(256, action_dim)
|
self.l3 = nn.Linear(256, output_dim)
|
||||||
|
|
||||||
self.max_action = max_action
|
self.max_action = max_action
|
||||||
|
|
||||||
|
|
||||||
def forward(self, state):
|
def forward(self, state):
|
||||||
|
|
||||||
a = F.relu(self.l1(state))
|
a = F.relu(self.l1(state))
|
||||||
a = F.relu(self.l2(a))
|
a = F.relu(self.l2(a))
|
||||||
return self.max_action * torch.tanh(self.l3(a))
|
return self.max_action * torch.tanh(self.l3(a))
|
||||||
|
|
||||||
|
|
||||||
class Critic(nn.Module):
|
class Critic(nn.Module):
|
||||||
def __init__(self, state_dim, action_dim):
|
def __init__(self, input_dim, output_dim):
|
||||||
super(Critic, self).__init__()
|
super(Critic, self).__init__()
|
||||||
|
|
||||||
# Q1 architecture
|
# Q1 architecture
|
||||||
self.l1 = nn.Linear(state_dim + action_dim, 256)
|
self.l1 = nn.Linear(input_dim + output_dim, 256)
|
||||||
self.l2 = nn.Linear(256, 256)
|
self.l2 = nn.Linear(256, 256)
|
||||||
self.l3 = nn.Linear(256, 1)
|
self.l3 = nn.Linear(256, 1)
|
||||||
|
|
||||||
# Q2 architecture
|
# Q2 architecture
|
||||||
self.l4 = nn.Linear(state_dim + action_dim, 256)
|
self.l4 = nn.Linear(input_dim + output_dim, 256)
|
||||||
self.l5 = nn.Linear(256, 256)
|
self.l5 = nn.Linear(256, 256)
|
||||||
self.l6 = nn.Linear(256, 1)
|
self.l6 = nn.Linear(256, 1)
|
||||||
|
|
||||||
@@ -68,8 +79,8 @@ class Critic(nn.Module):
|
|||||||
class TD3(object):
|
class TD3(object):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
state_dim,
|
input_dim,
|
||||||
action_dim,
|
output_dim,
|
||||||
max_action,
|
max_action,
|
||||||
cfg,
|
cfg,
|
||||||
):
|
):
|
||||||
@@ -83,14 +94,14 @@ class TD3(object):
|
|||||||
self.device = cfg.device
|
self.device = cfg.device
|
||||||
self.total_it = 0
|
self.total_it = 0
|
||||||
|
|
||||||
self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
|
self.actor = Actor(input_dim, output_dim, max_action).to(self.device)
|
||||||
self.actor_target = copy.deepcopy(self.actor)
|
self.actor_target = copy.deepcopy(self.actor)
|
||||||
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)
|
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)
|
||||||
|
|
||||||
self.critic = Critic(state_dim, action_dim).to(self.device)
|
self.critic = Critic(input_dim, output_dim).to(self.device)
|
||||||
self.critic_target = copy.deepcopy(self.critic)
|
self.critic_target = copy.deepcopy(self.critic)
|
||||||
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
|
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)
|
||||||
self.memory = ReplayBuffer(state_dim, action_dim)
|
self.memory = ReplayBuffer(input_dim, output_dim)
|
||||||
|
|
||||||
def choose_action(self, state):
|
def choose_action(self, state):
|
||||||
state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
|
state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
|
||||||
|
|||||||