Merge branch 'master' of github.com:datawhalechina/easy-rl

qiwang067 committed on 2022-07-22 17:16:56 +08:00
34 changed files with 753 additions and 496 deletions

View File

@@ -5,10 +5,11 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-05-03 22:16:08
 LastEditor: JiangJi
-LastEditTime: 2021-05-03 22:23:48
+LastEditTime: 2022-07-20 23:54:40
 Discription:
 Environment:
 '''
+import torch
 import torch.optim as optim
 import torch.nn as nn
 import torch.nn.functional as F
@@ -42,7 +43,7 @@ class A2C:
     '''
     def __init__(self,n_states,n_actions,cfg) -> None:
        self.gamma = cfg.gamma
-       self.device = cfg.device
+       self.device = torch.device(cfg.device)
        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters())
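In this commit the device is passed around as a plain string from argparse and converted with torch.device inside each agent. The sketch below shows that conversion plus an optional CUDA-availability fallback; the helper name and the fallback are my own additions, not part of the repo.

import torch

def resolve_device(device_str: str) -> torch.device:
    """Turn a CLI string such as 'cpu' or 'cuda' into a torch.device.
    The CPU fallback when CUDA is unavailable is an extra safeguard,
    not something the diff above adds."""
    if device_str == "cuda" and not torch.cuda.is_available():
        return torch.device("cpu")
    return torch.device(device_str)

# inside an agent's __init__ (sketch):
#   self.device = resolve_device(cfg.device)
#   self.model = self.model.to(self.device)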

View File

@@ -0,0 +1,14 @@
{
"algo_name": "A2C",
"env_name": "CartPole-v0",
"n_envs": 8,
"max_steps": 20000,
"n_steps": 5,
"gamma": 0.99,
"lr": 0.001,
"hidden_dim": 256,
"deivce": "cpu",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-221850/results/",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-221850/models/",
"save_fig": true
}

View File

@@ -1,14 +0,0 @@
------------------ start ------------------
algo_name : A2C
env_name : CartPole-v0
n_envs : 8
max_steps : 30000
n_steps : 5
gamma : 0.99
lr : 0.001
hidden_dim : 256
result_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/results/
model_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/models/
save_fig : True
device : cuda
------------------- end -------------------

File diff suppressed because one or more lines are too long

View File

@@ -29,14 +29,13 @@ def get_args():
     parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
     parser.add_argument('--lr',default=1e-3,type=float,help="learning rate")
     parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
     parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/results/' )
     parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/models/' ) # path to save models
     parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
     args = parser.parse_args()
-    args.device = torch.device(
-        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
     return args
 def make_envs(env_name):
@@ -124,14 +123,15 @@ def train(cfg,envs):
         loss.backward()
         optimizer.step()
     print('Finish training')
-    return test_rewards, test_ma_rewards
+    return {'rewards':test_rewards,'ma_rewards':test_ma_rewards}
 if __name__ == "__main__":
     cfg = get_args()
     envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)]
     envs = SubprocVecEnv(envs)
     # training
-    rewards,ma_rewards = train(cfg,envs)
+    res_dic = train(cfg,envs)
     make_dir(cfg.result_path,cfg.model_path)
     save_args(cfg)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果

View File

@@ -73,11 +73,11 @@ class Critic(nn.Module):
         return x
 class DDPG:
     def __init__(self, n_states, n_actions, cfg):
-        self.device = cfg.device
-        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
-        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device)
+        self.device = torch.device(cfg.device)
+        self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(self.device)
+        self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(self.device)
+        self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(self.device)
+        self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(self.device)
         # 复制参数到目标网络
         for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):

View File

@@ -0,0 +1,18 @@
{
"algo_name": "DDPG",
"env_name": "Pendulum-v1",
"train_eps": 300,
"test_eps": 20,
"gamma": 0.99,
"critic_lr": 0.001,
"actor_lr": 0.0001,
"memory_capacity": 8000,
"batch_size": 128,
"target_update": 2,
"soft_tau": 0.01,
"hidden_dim": 256,
"deivce": "cpu",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/DDPG/outputs/Pendulum-v1/20220713-225402/results//",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/DDPG/outputs/Pendulum-v1/20220713-225402/models/",
"save_fig": true
}

View File

@@ -1,18 +0,0 @@
------------------ start ------------------
algo_name : DDPG
env_name : Pendulum-v1
train_eps : 300
test_eps : 20
gamma : 0.99
critic_lr : 0.001
actor_lr : 0.0001
memory_capacity : 8000
batch_size : 128
target_update : 2
soft_tau : 0.01
hidden_dim : 256
result_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/results/
model_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/models/
save_fig : True
device : cuda
------------------- end -------------------

View File

@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2022-07-13 22:53:11
+LastEditTime: 2022-07-21 21:51:34
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -41,14 +41,13 @@ def get_args():
     parser.add_argument('--target_update',default=2,type=int)
     parser.add_argument('--soft_tau',default=1e-2,type=float)
     parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
     parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/results/' )
     parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/models/' ) # path to save models
     parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
     args = parser.parse_args()
-    args.device = torch.device(
-        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
     return args
 def env_agent_config(cfg,seed=1):
@@ -87,7 +86,7 @@ def train(cfg, env, agent):
         else:
             ma_rewards.append(ep_reward)
     print('Finish training!')
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}
 def test(cfg, env, agent):
     print('Start testing')
@@ -112,21 +111,23 @@ def test(cfg, env, agent):
             ma_rewards.append(ep_reward)
         print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
     print('Finish testing!')
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}
 if __name__ == "__main__":
     cfg = get_args()
     # training
     env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)
     save_args(cfg)
     agent.save(path=cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
    # testing
     env,agent = env_agent_config(cfg,seed=10)
     agent.load(path=cfg.model_path)
-    rewards,ma_rewards = test(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
+    res_dic = test(cfg,env,agent)
+    save_results(res_dic, tag='test',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")
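Across this refactor, train() and test() return one dict keyed 'rewards' and 'ma_rewards' instead of two parallel lists. A minimal sketch of the moving-average bookkeeping behind the second key, using the same 0.9/0.1 smoothing that appears in the training loops; the helper name is hypothetical, not a function in the repo.

def append_ma_reward(ma_rewards, ep_reward, alpha=0.1):
    # exponential moving average: new = 0.9 * previous + 0.1 * current episode reward
    if ma_rewards:
        ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
    else:
        ma_rewards.append(ep_reward)
    return ma_rewards

# train()/test() then bundle both series into the dict consumed by save_results/plot_rewards:
#   return {'rewards': rewards, 'ma_rewards': ma_rewards}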

View File

@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2022-07-13 00:08:18
+LastEditTime: 2022-07-20 23:57:16
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -64,8 +64,8 @@ class ReplayBuffer:
 class DQN:
     def __init__(self, n_states,n_actions,cfg):
-        self.n_actions = n_actions # 总的动作个数
-        self.device = cfg.device # 设备cpu或gpu等
+        self.n_actions = n_actions
+        self.device = torch.device(cfg.device) # cpu or cuda
         self.gamma = cfg.gamma # 奖励的折扣因子
         # e-greedy策略相关参数
         self.frame_idx = 0 # 用于epsilon的衰减计数

View File

@@ -0,0 +1,19 @@
{
"algo_name": "DQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.95,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 4,
"hidden_dim": 256,
"deivce": "cpu",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/results/",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials/outputs/CartPole-v0/20220713-211653/models/",
"save_fig": true
}

View File

@@ -1,19 +0,0 @@
------------------ start ------------------
algo_name : DQN
env_name : CartPole-v0
train_eps : 200
test_eps : 20
gamma : 0.95
epsilon_start : 0.95
epsilon_end : 0.01
epsilon_decay : 500
lr : 0.0001
memory_capacity : 100000
batch_size : 64
target_update : 4
hidden_dim : 256
result_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/results/
model_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/models/
save_fig : True
device : cuda
------------------- end -------------------

View File

@@ -1,19 +1,16 @@
-from lib2to3.pytree import type_repr
-import sys
-import os
-from parso import parse
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path
 import torch.nn as nn
 import torch.nn.functional as F
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径
 import gym
 import torch
 import datetime
 import numpy as np
 import argparse
-from common.utils import save_results_1, make_dir
+from common.utils import save_results, make_dir
 from common.utils import plot_rewards,save_args
 from dqn import DQN
@@ -35,14 +32,13 @@ def get_args():
     parser.add_argument('--batch_size',default=64,type=int)
     parser.add_argument('--target_update',default=4,type=int)
     parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
     parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/results/' )
     parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
             '/' + curr_time + '/models/' ) # path to save models
     parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
     args = parser.parse_args()
-    args.device = torch.device(
-        "cuda" if torch.cuda.is_available() else "cpu") # check GPU
     return args
 def env_agent_config(cfg,seed=1):
@@ -99,8 +95,8 @@ def train(cfg, env, agent):
 def test(cfg, env, agent):
-    print('开始测试!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start testing!')
+    print(f'Env:{cfg.env_name}, A{cfg.algo_name}, 设备:{cfg.device}')
     ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
     cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
     cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
@@ -127,7 +123,7 @@ def test(cfg, env, agent):
         else:
             ma_rewards.append(ep_reward)
         print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
-    print('完成测试!')
+    print('Finish testing')
     env.close()
     return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}
@@ -137,16 +133,16 @@ if __name__ == "__main__":
     # 训练
     env, agent = env_agent_config(cfg)
     res_dic = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
-    save_args(cfg)
-    agent.save(path=cfg.model_path) # 保存模型
-    save_results_1(res_dic, tag='train',
-                   path=cfg.result_path) # 保存结果
-    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
+    make_dir(cfg.result_path, cfg.model_path)
+    save_args(cfg) # save parameters
+    agent.save(path=cfg.model_path) # save model
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
     # 测试
     env, agent = env_agent_config(cfg)
     agent.load(path=cfg.model_path) # 导入模型
     res_dic = test(cfg, env, agent)
-    save_results_1(res_dic, tag='test',
+    save_results(res_dic, tag='test',
                  path=cfg.result_path) # 保存结果
     plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果

View File

@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-12 00:50:49
 @LastEditor: John
-LastEditTime: 2021-11-19 18:07:09
+LastEditTime: 2022-07-21 00:08:26
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -65,7 +65,7 @@ class MLP(nn.Module):
 class DoubleDQN:
     def __init__(self, n_states, n_actions, cfg):
         self.n_actions = n_actions # 总的动作个数
-        self.device = cfg.device # 设备cpu或gpu等
+        self.device = torch.device(cfg.device) # 设备cpu或gpu等
         self.gamma = cfg.gamma
         # e-greedy策略相关参数
         self.actions_count = 0
@@ -88,8 +88,7 @@ class DoubleDQN:
         '''选择动作
         '''
         self.actions_count += 1
-        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
-            math.exp(-1. * self.actions_count / self.epsilon_decay)
+        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * math.exp(-1. * self.actions_count / self.epsilon_decay)
         if random.random() > self.epsilon:
             with torch.no_grad():
                 # 先转为张量便于丢给神经网络,state元素数据原本为float64
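The rejoined line above is the usual exponential epsilon-greedy schedule: epsilon decays from epsilon_start toward epsilon_end as the action count grows. A standalone sketch of the same formula, with example values taken from this commit's DoubleDQN config (0.95, 0.01, 500); the helper name is mine, not the repo's.

import math

def epsilon_by_count(count, eps_start=0.95, eps_end=0.01, eps_decay=500):
    # same schedule as DoubleDQN.choose_action above
    return eps_end + (eps_start - eps_end) * math.exp(-1.0 * count / eps_decay)

# epsilon_by_count(0)    -> 0.95
# epsilon_by_count(500)  -> ~0.36
# epsilon_by_count(2000) -> ~0.03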

Binary file not shown (image removed, 37 KiB)

Binary file not shown (image removed, 39 KiB)

View File

@@ -0,0 +1,19 @@
{
"algo_name": "DoubleDQN",
"env_name": "CartPole-v0",
"train_eps": 200,
"test_eps": 20,
"gamma": 0.99,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 500,
"lr": 0.0001,
"memory_capacity": 100000,
"batch_size": 64,
"target_update": 2,
"hidden_dim": 256,
"device": "cuda",
"result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/results/",
"model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/models/",
"save_fig": true
}

Binary file not shown (image added, 44 KiB)

Binary file not shown (image added, 44 KiB)

View File

@@ -5,55 +5,49 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-11-07 18:10:37
 LastEditor: JiangJi
-LastEditTime: 2021-12-29 15:02:30
+LastEditTime: 2022-07-21 21:52:31
 Discription:
 '''
 import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path
 import gym
 import torch
 import datetime
-from common.utils import save_results, make_dir
-from common.utils import plot_rewards
+import argparse
+from common.utils import save_results,make_dir
+from common.utils import plot_rewards,save_args
 from DoubleDQN.double_dqn import DoubleDQN
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-class Config:
-    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = 'DoubleDQN' # 算法名称
-        self.env_name = 'CartPole-v0' # 环境名称
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
-        self.train_eps = 200 # 训练的回合数
-        self.test_eps = 30 # 测试的回合数
-        ################################################################################
-        ################################## 算法超参数 ###################################
-        self.gamma = 0.95 # 强化学习中的折扣因子
-        self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
-        self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
-        self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率
-        self.lr = 0.0001 # 学习率
-        self.memory_capacity = 100000 # 经验回放的容量
-        self.batch_size = 64 # mini-batch SGD中的批量大小
-        self.target_update = 2 # 目标网络的更新频率
-        self.hidden_dim = 256 # 网络隐藏层
-        ################################################################################
-        ################################# 保存结果相关参数 ##############################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/' # 保存结果的路径
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
-        ################################################################################
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='DoubleDQN',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
+    parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
+    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
+    parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
+    parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
+    parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon")
+    parser.add_argument('--lr',default=0.0001,type=float,help="learning rate")
+    parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity")
+    parser.add_argument('--batch_size',default=64,type=int)
+    parser.add_argument('--target_update',default=2,type=int)
+    parser.add_argument('--hidden_dim',default=256,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    return args
 def env_agent_config(cfg,seed=1):
@@ -65,8 +59,8 @@ def env_agent_config(cfg,seed=1):
     return env,agent
 def train(cfg,env,agent):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 记录所有回合的滑动平均奖励
     for i_ep in range(cfg.train_eps):
@@ -84,20 +78,19 @@ def train(cfg,env,agent):
         if i_ep % cfg.target_update == 0:
             agent.target_net.load_state_dict(agent.policy_net.state_dict())
         if (i_ep+1)%10 == 0:
-            print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward}')
+            print(f'Env:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
         rewards.append(ep_reward)
         if ma_rewards:
             ma_rewards.append(
                 0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
             ma_rewards.append(ep_reward)
-    print('完成训练!')
-    env.close()
-    return rewards,ma_rewards
+    print('Finish training!')
+    return {'rewards':rewards,'ma_rewards':ma_rewards}
 def test(cfg,env,agent):
-    print('开始测试!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start testing')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
     ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
     cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
     cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
@@ -120,25 +113,26 @@ def test(cfg,env,agent):
             ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
-    print('完成测试!')
-    env.close()
-    return rewards,ma_rewards
+        print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
+    print('Finish testing!')
+    return {'rewards':rewards,'ma_rewards':ma_rewards}
 if __name__ == "__main__":
-    cfg = Config()
-    # 训练
-    env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
-    agent.save(path=cfg.model_path) # 保存模型
-    save_results(rewards, ma_rewards, tag='train',
-                 path=cfg.result_path) # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
-    # 测试
-    env, agent = env_agent_config(cfg)
-    agent.load(path=cfg.model_path) # 导入模型
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
-                 path=cfg.result_path) # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果
+    cfg = get_args()
+    print(cfg.device)
+    # training
+    env,agent = env_agent_config(cfg,seed=1)
+    res_dic = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)
+    save_args(cfg)
+    agent.save(path=cfg.model_path)
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
+    # testing
+    env,agent = env_agent_config(cfg,seed=10)
+    agent.load(path=cfg.model_path)
+    res_dic = test(cfg,env,agent)
+    save_results(res_dic, tag='test',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")

View File

@@ -16,7 +16,7 @@ curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
 class Config:
     def __init__(self) -> None:
         ################################## 环境超参数 ###################################
-        self.algo_name = "DQN" # 算法名称
+        self.algo_name = "PPO" # 算法名称
         self.env_name = 'CartPole-v0' # 环境名称
         self.continuous = False # 环境是否为连续动作
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU

View File

@@ -5,56 +5,47 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:21:53
 LastEditor: John
-LastEditTime: 2022-02-10 06:13:21
+LastEditTime: 2022-07-21 21:44:00
 Discription:
 Environment:
 '''
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path
 import gym
 import torch
 import datetime
+import argparse
 from itertools import count
 from pg import PolicyGradient
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-class Config:
-    '''超参数
-    '''
-    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = "PolicyGradient" # 算法名称
-        self.env_name = 'CartPole-v0' # 环境名称
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
-        self.seed = 10 # 随机种子,置0则不设置随机种子
-        self.train_eps = 300 # 训练的回合数
-        self.test_eps = 30 # 测试的回合数
-        ################################################################################
-        ################################## 算法超参数 ###################################
-        self.batch_size = 8 # mini-batch SGD中的批量大小
-        self.lr = 0.01 # 学习率
-        self.gamma = 0.99 # 强化学习中的折扣因子
-        self.hidden_dim = 36 # 网络隐藏层
-        ################################################################################
-        ################################# 保存结果相关参数 ################################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/' # 保存结果的路径
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
-        ################################################################################
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
+    parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
+    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
+    parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
+    parser.add_argument('--batch_size',default=8,type=int)
+    parser.add_argument('--hidden_dim',default=36,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    return args
 def env_agent_config(cfg,seed=1):
@@ -65,9 +56,9 @@ def env_agent_config(cfg,seed=1):
     return env,agent
 def train(cfg,env,agent):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    state_pool = [] # 存放每batch_size个episode的state序列
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
+    state_pool = [] # temp states pool per several episodes
     action_pool = []
     reward_pool = []
     rewards = []
@@ -86,11 +77,11 @@ def train(cfg,env,agent):
             reward_pool.append(reward)
             state = next_state
             if done:
-                print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
+                print(f'Episode{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
                 break
         if i_ep > 0 and i_ep % cfg.batch_size == 0:
             agent.update(reward_pool,state_pool,action_pool)
-            state_pool = [] # 每个episode的state
+            state_pool = []
             action_pool = []
             reward_pool = []
         rewards.append(ep_reward)
@@ -99,8 +90,8 @@ def train(cfg,env,agent):
                 0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
             ma_rewards.append(ep_reward)
-    print('完成训练!')
-    env.close()
+    print('Finish training!')
+    env.close() # close environment
     return rewards, ma_rewards

View File

@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 16:02:24
 LastEditor: John
-LastEditTime: 2022-07-13 22:15:46
+LastEditTime: 2022-07-21 21:45:33
 Discription:
 Environment:
 '''
@@ -14,6 +14,7 @@ import numpy as np
 from pathlib import Path
 import matplotlib.pyplot as plt
 import seaborn as sns
+import json
 from matplotlib.font_manager import FontProperties # 导入字体模块
@@ -68,19 +69,19 @@ def plot_losses(losses, algo="DQN", save=True, path='./'):
     plt.savefig(path+"losses_curve")
     plt.show()
-def save_results_1(dic, tag='train', path='./results'):
+def save_results(dic, tag='train', path='./results'):
    ''' 保存奖励
    '''
    for key,value in dic.items():
        np.save(path+'{}_{}.npy'.format(tag,key),value)
    print('Results saved')
-def save_results(rewards, ma_rewards, tag='train', path='./results'):
-    ''' 保存奖励
-    '''
-    np.save(path+'{}_rewards.npy'.format(tag), rewards)
-    np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
-    print('Result saved!')
+# def save_results(rewards, ma_rewards, tag='train', path='./results'):
+#     ''' 保存奖励
+#     '''
+#     np.save(path+'{}_rewards.npy'.format(tag), rewards)
+#     np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
+#     print('Result saved!')
 def make_dir(*paths):
@@ -101,11 +102,8 @@ def del_empty_dir(*paths):
 def save_args(args):
     # save parameters
-    argsDict = args.__dict__
-    with open(args.result_path+'params.txt', 'w') as f:
-        f.writelines('------------------ start ------------------' + '\n')
-        for eachArg, value in argsDict.items():
-            f.writelines(eachArg + ' : ' + str(value) + '\n')
-        f.writelines('------------------- end -------------------')
+    args_dict = vars(args)
+    with open(args.result_path+'params.json', 'w') as fp:
+        json.dump(args_dict, fp)
     print("Parameters saved!")

notebooks/A2C.ipynb (new file, 370 lines)

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,153 @@
# This code comes from OpenAI Baselines and is used for running environments in parallel processes
# https://github.com/openai/baselines/tree/master/baselines/common/vec_env
import numpy as np
from multiprocessing import Process, Pipe
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x()
while True:
cmd, data = remote.recv()
if cmd == 'step':
ob, reward, done, info = env.step(data)
if done:
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
ob = env.reset()
remote.send(ob)
elif cmd == 'reset_task':
ob = env.reset_task()
remote.send(ob)
elif cmd == 'close':
remote.close()
break
elif cmd == 'get_spaces':
remote.send((env.observation_space, env.action_space))
else:
raise NotImplementedError
class VecEnv(object):
"""
An abstract asynchronous, vectorized environment.
"""
def __init__(self, num_envs, observation_space, action_space):
self.num_envs = num_envs
self.observation_space = observation_space
self.action_space = action_space
def reset(self):
"""
Reset all the environments and return an array of
observations, or a tuple of observation arrays.
If step_async is still doing work, that work will
be cancelled and step_wait() should not be called
until step_async() is invoked again.
"""
pass
def step_async(self, actions):
"""
Tell all the environments to start taking a step
with the given actions.
Call step_wait() to get the results of the step.
You should not call this if a step_async run is
already pending.
"""
pass
def step_wait(self):
"""
Wait for the step taken with step_async().
Returns (obs, rews, dones, infos):
- obs: an array of observations, or a tuple of
arrays of observations.
- rews: an array of rewards
- dones: an array of "episode done" booleans
- infos: a sequence of info objects
"""
pass
def close(self):
"""
Clean up the environments' resources.
"""
pass
def step(self, actions):
self.step_async(actions)
return self.step_wait()
class CloudpickleWrapper(object):
"""
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
"""
def __init__(self, x):
self.x = x
def __getstate__(self):
import cloudpickle
return cloudpickle.dumps(self.x)
def __setstate__(self, ob):
import pickle
self.x = pickle.loads(ob)
class SubprocVecEnv(VecEnv):
def __init__(self, env_fns, spaces=None):
"""
envs: list of gym environments to run in subprocesses
"""
self.waiting = False
self.closed = False
nenvs = len(env_fns)
self.nenvs = nenvs
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
p.daemon = True # if the main process crashes, we should not cause things to hang
p.start()
for remote in self.work_remotes:
remote.close()
self.remotes[0].send(('get_spaces', None))
observation_space, action_space = self.remotes[0].recv()
VecEnv.__init__(self, len(env_fns), observation_space, action_space)
def step_async(self, actions):
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
self.waiting = True
def step_wait(self):
results = [remote.recv() for remote in self.remotes]
self.waiting = False
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
for remote in self.remotes:
remote.send(('reset', None))
return np.stack([remote.recv() for remote in self.remotes])
def reset_task(self):
for remote in self.remotes:
remote.send(('reset_task', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self):
if self.closed:
return
if self.waiting:
for remote in self.remotes:
remote.recv()
for remote in self.remotes:
remote.send(('close', None))
for p in self.ps:
p.join()
self.closed = True
def __len__(self):
return self.nenvs
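A short usage sketch for the wrapper above: SubprocVecEnv takes a list of argument-free callables, builds one environment per subprocess, and stacks observations, rewards and done flags across workers. The import path is an assumption (use whatever filename this module is saved under), and the old gym 4-tuple step API is assumed, as in worker().

import gym
from multiprocessing_env import SubprocVecEnv  # assumed module/file name for the code above

def make_env(env_name="CartPole-v0"):
    def _thunk():
        return gym.make(env_name)  # worker() expects the old 4-tuple step API
    return _thunk

if __name__ == "__main__":
    n_envs = 4
    envs = SubprocVecEnv([make_env() for _ in range(n_envs)])
    obs = envs.reset()  # stacked observations, shape (n_envs, obs_dim)
    actions = [envs.action_space.sample() for _ in range(n_envs)]
    obs, rewards, dones, infos = envs.step(actions)  # each stacked across the n_envs workers
    envs.close()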