Update algorithm template

This commit is contained in:
johnjim0816
2022-11-06 12:15:36 +08:00
parent 466a17707f
commit dc78698262
256 changed files with 17282 additions and 10229 deletions


@@ -0,0 +1,7 @@
## Script descriptions
* `task0.py`: discrete-action task
* `task1.py`: discrete-action task; the only difference from `task0.py` is that the Actor's activation function is tanh instead of relu, which performs better on `CartPole-v1` (see the sketch below)
* `task2.py`: continuous-action task, #TODO: still to be debugged
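`ActorSoftmax` and `ActorSoftmaxTanh` are defined in `common/models.py`, which is not part of this commit, so the following is only a minimal sketch of what the relu-vs-tanh difference between `task0.py` and `task1.py` amounts to (layer count and sizes are assumed):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorSoftmax(nn.Module):
    """Discrete-action policy head as used by task0.py (relu hidden activation; layout assumed)."""
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        return F.softmax(self.fc2(F.relu(self.fc1(x))), dim=-1)

class ActorSoftmaxTanh(ActorSoftmax):
    """task1.py variant: identical except that the hidden activation is tanh."""
    def forward(self, x):
        return F.softmax(self.fc2(torch.tanh(self.fc1(x))), dim=-1)
```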


@@ -0,0 +1,24 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
eval_eps: 10
load_checkpoint: true
load_path: Train_CartPole-v1_A2C_20221030-211435
max_steps: 200
mode: test
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
actor_hidden_dim: 256
actor_lr: 0.0003
batch_size: 64
buffer_size: 100000
critic_hidden_dim: 256
critic_lr: 0.001
gamma: 0.99
hidden_dim: 256
target_update: 4
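The updated `a2c.py` below reads these values through attribute access (`cfg.gamma`, `cfg.actor_lr`) instead of the old `cfg['gamma']` style. The project's actual loader lives in `common/launcher.py` and is not shown in this diff; a minimal sketch of turning a saved config like the one above into such an object (filename hypothetical):

```python
import yaml
from types import SimpleNamespace

# Hypothetical loader sketch: expose the YAML sections with attribute access,
# matching the cfg.gamma / cfg.actor_lr style used by the updated A2C class.
with open("config.yaml") as f:
    raw = yaml.safe_load(f)
general_cfg = SimpleNamespace(**raw["general_cfg"])
algo_cfg = SimpleNamespace(**raw["algo_cfg"])
print(general_cfg.env_name, algo_cfg.gamma)  # CartPole-v1 0.99
```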


@@ -0,0 +1,23 @@
2022-10-30 21:25:53 - r - INFO: - n_states: 4, n_actions: 2
2022-10-30 21:25:55 - r - INFO: - Start testing!
2022-10-30 21:25:55 - r - INFO: - Env: CartPole-v1, Algorithm: A2C, Device: cuda
2022-10-30 21:25:56 - r - INFO: - Episode: 1/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 2/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 3/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 4/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 5/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 6/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 7/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 8/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 9/20, Reward: 200.0, Step: 200
2022-10-30 21:25:56 - r - INFO: - Episode: 10/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 11/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 12/20, Reward: 190.0, Step: 190
2022-10-30 21:25:57 - r - INFO: - Episode: 13/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 14/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 15/20, Reward: 96.0, Step: 96
2022-10-30 21:25:57 - r - INFO: - Episode: 16/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 17/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 18/20, Reward: 200.0, Step: 200
2022-10-30 21:25:57 - r - INFO: - Episode: 19/20, Reward: 112.0, Step: 112
2022-10-30 21:25:57 - r - INFO: - Episode: 20/20, Reward: 200.0, Step: 200

Binary file not shown (image added, 34 KiB)

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,200.0,200
3,200.0,200
4,200.0,200
5,200.0,200
6,200.0,200
7,200.0,200
8,200.0,200
9,200.0,200
10,200.0,200
11,190.0,190
12,200.0,200
13,200.0,200
14,96.0,96
15,200.0,200
16,200.0,200
17,200.0,200
18,112.0,112
19,200.0,200
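To summarize a results file like the one above, a short, assumed post-processing snippet (the CSV path below is hypothetical; the pipeline writes result files with these columns under each task's outputs directory):

```python
import pandas as pd

# Hypothetical path to one of the saved result files with columns episodes,rewards,steps.
df = pd.read_csv("results.csv")
print(df["rewards"].describe())       # mean/std/min/max of episode rewards
print((df["rewards"] >= 200).mean())  # fraction of episodes hitting the 200-step cap
```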


@@ -0,0 +1,25 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
eval_eps: 10
eval_per_episode: 5
load_checkpoint: true
load_path: Train_CartPole-v1_A2C_20221031-232138
max_steps: 200
mode: test
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
actor_hidden_dim: 256
actor_lr: 0.0003
batch_size: 64
buffer_size: 100000
critic_hidden_dim: 256
critic_lr: 0.001
gamma: 0.99
hidden_dim: 256
target_update: 4


@@ -0,0 +1,28 @@
2022-10-31 23:33:16 - r - INFO: - n_states: 4, n_actions: 2
2022-10-31 23:33:16 - r - INFO: - Actor model name: ActorSoftmaxTanh
2022-10-31 23:33:16 - r - INFO: - Critic model name: Critic
2022-10-31 23:33:16 - r - INFO: - ACMemory memory name: PGReplay
2022-10-31 23:33:16 - r - INFO: - agent name: A2C
2022-10-31 23:33:17 - r - INFO: - Start testing!
2022-10-31 23:33:17 - r - INFO: - Env: CartPole-v1, Algorithm: A2C, Device: cuda
2022-10-31 23:33:18 - r - INFO: - Episode: 1/20, Reward: 200.0, Step: 200
2022-10-31 23:33:18 - r - INFO: - Episode: 2/20, Reward: 200.0, Step: 200
2022-10-31 23:33:18 - r - INFO: - Episode: 3/20, Reward: 186.0, Step: 186
2022-10-31 23:33:18 - r - INFO: - Episode: 4/20, Reward: 200.0, Step: 200
2022-10-31 23:33:18 - r - INFO: - Episode: 5/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 6/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 7/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 8/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 9/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 10/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 11/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 12/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 13/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 14/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 15/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 16/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 17/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 18/20, Reward: 200.0, Step: 200
2022-10-31 23:33:19 - r - INFO: - Episode: 19/20, Reward: 200.0, Step: 200
2022-10-31 23:33:20 - r - INFO: - Episode: 20/20, Reward: 200.0, Step: 200
2022-10-31 23:33:20 - r - INFO: - Finish testing!

Binary file not shown (image added, 31 KiB)

@@ -1,21 +1,21 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,93.0,93
3,155.0,155
4,116.0,116
2,186.0,186
3,200.0,200
4,200.0,200
5,200.0,200
6,190.0,190
7,176.0,176
6,200.0,200
7,200.0,200
8,200.0,200
9,200.0,200
10,200.0,200
11,179.0,179
11,200.0,200
12,200.0,200
13,185.0,185
14,191.0,191
13,200.0,200
14,200.0,200
15,200.0,200
16,200.0,200
17,124.0,124
17,200.0,200
18,200.0,200
19,172.0,172
19,200.0,200


@@ -0,0 +1,23 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
eval_eps: 10
load_checkpoint: false
load_path: tasks
max_steps: 200
mode: train
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
actor_hidden_dim: 256
actor_lr: 0.0003
batch_size: 64
buffer_size: 100000
critic_hidden_dim: 256
critic_lr: 0.001
gamma: 0.99
hidden_dim: 256

File diff suppressed because it is too large

Binary file not shown (image added, 68 KiB)

File diff suppressed because it is too large

@@ -0,0 +1,24 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
eval_eps: 10
eval_per_episode: 5
load_checkpoint: false
load_path: tasks
max_steps: 200
mode: train
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
actor_hidden_dim: 256
actor_lr: 0.0003
batch_size: 64
buffer_size: 100000
critic_hidden_dim: 256
critic_lr: 0.001
gamma: 0.99
hidden_dim: 256

File diff suppressed because it is too large

Binary file not shown (image added, 58 KiB)

File diff suppressed because it is too large

@@ -1,34 +1,79 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-08-16 23:05:25
LastEditor: JiangJi
LastEditTime: 2022-11-01 00:33:49
Discription:
'''
import torch
import numpy as np
from torch.distributions import Categorical,Normal
class A2C:
def __init__(self,models,memories,cfg):
self.n_actions = cfg['n_actions']
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.n_actions = cfg.n_actions
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
self.continuous = cfg.continuous
if hasattr(cfg,'action_bound'):
self.action_bound = cfg.action_bound
self.memory = memories['ACMemory']
self.actor = models['Actor'].to(self.device)
self.critic = models['Critic'].to(self.device)
self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])
self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
def sample_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
# state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
# dist = self.actor(state)
# self.entropy = - np.sum(np.mean(dist.detach().cpu().numpy()) * np.log(dist.detach().cpu().numpy()))
# value = self.critic(state) # note that 'dist' need require_grad=True
# self.value = value.detach().cpu().numpy().squeeze(0)[0]
# action = np.random.choice(self.n_actions, p=dist.detach().cpu().numpy().squeeze(0)) # shape(p=(n_actions,1)
# self.log_prob = torch.log(dist.squeeze(0)[action])
if self.continuous:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
mu, sigma = self.actor(state)
dist = Normal(self.action_bound * mu.view(1,), sigma.view(1,))
action = dist.sample()
value = self.critic(state)
# self.entropy = - np.sum(np.mean(dist.detach().cpu().numpy()) * np.log(dist.detach().cpu().numpy()))
self.value = value.detach().cpu().numpy().squeeze(0)[0] # detach() to avoid gradient
self.log_prob = dist.log_prob(action).squeeze(dim=0) # Tensor([0.])
self.entropy = dist.entropy().cpu().detach().numpy().squeeze(0) # detach() to avoid gradient
return action.cpu().detach().numpy()
else:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
probs = self.actor(state)
dist = Categorical(probs)
action = dist.sample() # Tensor([0])
value = self.critic(state)
self.value = value.detach().cpu().numpy().squeeze(0)[0] # detach() to avoid gradient
self.log_prob = dist.log_prob(action).squeeze(dim=0) # Tensor([0.])
self.entropy = dist.entropy().cpu().detach().numpy().squeeze(0) # detach() to avoid gradient
return action.cpu().numpy().item()
@torch.no_grad()
def predict_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
value = self.critic(state) # note that 'dist' need require_grad=True
value = value.detach().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action,value,dist
if self.continuous:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
mu, sigma = self.actor(state)
dist = Normal(self.action_bound * mu.view(1,), sigma.view(1,))
action = dist.sample()
return action.cpu().detach().numpy()
else:
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
dist = self.actor(state)
# value = self.critic(state) # note that 'dist' need require_grad=True
# value = value.detach().cpu().numpy().squeeze(0)[0]
action = np.random.choice(self.n_actions, p=dist.detach().cpu().numpy().squeeze(0)) # shape(p=(n_actions,1)
return action
def update(self,next_state,entropy):
value_pool,log_prob_pool,reward_pool = self.memory.sample()
value_pool = torch.tensor(value_pool, device=self.device)
log_prob_pool = torch.stack(log_prob_pool)
next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
next_value = self.critic(next_state)
returns = np.zeros_like(reward_pool)
@@ -36,9 +81,7 @@ class A2C:
next_value = reward_pool[t] + self.gamma * next_value # G(s_{t},a{t}) = r_{t+1} + gamma * V(s_{t+1})
returns[t] = next_value
returns = torch.tensor(returns, device=self.device)
value_pool = torch.tensor(value_pool, device=self.device)
advantages = returns - value_pool
log_prob_pool = torch.stack(log_prob_pool)
actor_loss = (-log_prob_pool * advantages).mean()
critic_loss = 0.5 * advantages.pow(2).mean()
tot_loss = actor_loss + critic_loss + 0.001 * entropy
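To recap the updated algorithm: `sample_action` now draws from `torch.distributions.Categorical` (discrete) or `Normal` (continuous) and caches `value`, `log_prob` and `entropy` on the agent, while `update()` builds bootstrapped returns G_t = r_t + gamma * G_{t+1}, seeded with the critic's estimate V(s_T), and uses the advantage G - V for both losses. A standalone sketch of just that return/advantage step (names assumed, optimizer calls omitted):

```python
import torch

def compute_returns(rewards, next_value, gamma=0.99):
    """Bootstrapped discounted returns: G_t = r_t + gamma * G_{t+1}, seeded with V(s_T)."""
    g = float(next_value)
    returns = []
    for r in reversed(rewards):
        g = r + gamma * g
        returns.insert(0, g)
    return torch.tensor(returns, dtype=torch.float32)

# How the losses above are then formed (mirrors update(), not a drop-in replacement):
# advantages  = compute_returns(reward_pool, critic(next_state)) - value_pool
# actor_loss  = -(log_prob_pool * advantages).mean()
# critic_loss = 0.5 * advantages.pow(2).mean()
# tot_loss    = actor_loss + critic_loss + 0.001 * entropy
```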


@@ -1,14 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-09-19 14:48:16
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:21:50
Discription: #TODO template to be updated
'''
import torch
import numpy as np
class A2C_2:
def __init__(self,models,memories,cfg):
self.n_actions = cfg['n_actions']
self.gamma = cfg['gamma']
self.device = torch.device(cfg['device'])
self.n_actions = cfg.n_actions
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
self.memory = memories['ACMemory']
self.ac_net = models['ActorCritic'].to(self.device)
self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr = cfg.lr)
def sample_action(self,state):
state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
value, dist = self.ac_net(state) # note that 'dist' need require_grad=True


@@ -0,0 +1,21 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
mode: test
load_checkpoint: true
load_path: Train_CartPole-v1_A2C_20221031-232138
max_steps: 200
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
continuous: false
batch_size: 64
buffer_size: 100000
gamma: 0.99
actor_lr: 0.0003
critic_lr: 0.001
target_update: 4


@@ -0,0 +1,19 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: CartPole-v1
mode: train
load_checkpoint: false
load_path: Train_CartPole-v1_DQN_20221026-054757
max_steps: 200
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 600
algo_cfg:
continuous: false
batch_size: 64
buffer_size: 100000
gamma: 0.99
lr: 0.001


@@ -0,0 +1,21 @@
general_cfg:
algo_name: A2C
device: cuda
env_name: Pendulum-v1
mode: train
eval_per_episode: 200
load_checkpoint: false
load_path: Train_CartPole-v1_DQN_20221026-054757
max_steps: 200
save_fig: true
seed: 1
show_fig: false
test_eps: 20
train_eps: 1000
algo_cfg:
continuous: true
batch_size: 64
buffer_size: 100000
gamma: 0.99
actor_lr: 0.0003
critic_lr: 0.001


@@ -0,0 +1,38 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 00:53:03
LastEditor: JiangJi
LastEditTime: 2022-11-01 00:17:55
Discription: default parameters of A2C
'''
from common.config import GeneralConfig,AlgoConfig
class GeneralConfigA2C(GeneralConfig):
def __init__(self) -> None:
self.env_name = "CartPole-v1" # name of environment
self.algo_name = "A2C" # name of algorithm
self.mode = "train" # train or test
self.seed = 1 # random seed
self.device = "cuda" # device to use
self.train_eps = 1000 # number of episodes for training
self.test_eps = 20 # number of episodes for testing
self.max_steps = 200 # max steps for each episode
self.load_checkpoint = False
self.load_path = "tasks" # path to load model
self.show_fig = False # show figure or not
self.save_fig = True # save figure or not
class AlgoConfigA2C(AlgoConfig):
def __init__(self) -> None:
self.continuous = False # continuous or discrete action space
self.hidden_dim = 256 # hidden_dim for MLP
self.gamma = 0.99 # discount factor
self.actor_lr = 3e-4 # learning rate of actor
self.critic_lr = 1e-3 # learning rate of critic
self.actor_hidden_dim = 256 # hidden_dim for actor MLP
self.critic_hidden_dim = 256 # hidden_dim for critic MLP
self.buffer_size = 100000 # size of replay buffer
self.batch_size = 64 # batch size
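These defaults are merged into the launcher's base config objects via `merge_class_attrs` in the task scripts below. That helper lives in `common/utils.py` and is not part of this diff, so the following is only an assumed sketch of the attribute-copy step it presumably performs:

```python
# Assumed behaviour of common.utils.merge_class_attrs (not shown in this commit):
# copy every attribute of `source` onto `target`, so the algorithm-specific defaults
# (GeneralConfigA2C / AlgoConfigA2C) extend the generic GeneralConfig / AlgoConfig objects.
def merge_class_attrs(target, source):
    for name, value in vars(source).items():
        setattr(target, name, value)
    return target
```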


@@ -1,121 +0,0 @@
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
import gym
import torch
import numpy as np
from common.utils import all_seed
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorSoftmax,Critic
from envs.register import register_env
from a2c import A2C
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=1600,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--actor_lr',default=3e-4,type=float,help="learning rate of actor")
parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
parser.add_argument('--actor_hidden_dim',default=256,type=int,help="hidden of actor net")
parser.add_argument('--critic_hidden_dim',default=256,type=int,help="hidden of critic net")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
models = {'Actor':ActorSoftmax(cfg['n_states'],cfg['n_actions'], hidden_dim = cfg['actor_hidden_dim']),'Critic':Critic(cfg['n_states'],1,hidden_dim=cfg['critic_hidden_dim'])}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action, value, dist = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
agent.memory.push((value,log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
agent.update(next_state,ep_entropy) # update agent
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action,_,_ = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()


@@ -1,3 +1,13 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-09-19 14:48:16
LastEditor: JiangJi
LastEditTime: 2022-10-30 01:21:15
Discription: #TODO template to be updated
'''
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path


@@ -1,19 +0,0 @@
{
"algo_name": "A2C",
"env_name": "CartPole-v0",
"train_eps": 2000,
"test_eps": 20,
"ep_max_steps": 100000,
"gamma": 0.99,
"lr": 0.0003,
"actor_hidden_dim": 256,
"critic_hidden_dim": 256,
"device": "cpu",
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/models/",
"n_states": 4,
"n_actions": 2
}

Binary file not shown (image deleted, 44 KiB)

Binary file not shown (image deleted, 63 KiB)

@@ -1 +0,0 @@
{"algo_name": "A2C", "env_name": "CartPole-v0", "train_eps": 1600, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "actor_lr": 0.0003, "critic_lr": 0.001, "actor_hidden_dim": 256, "critic_hidden_dim": 256, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/models/", "n_states": 4, "n_actions": 2}

Binary file not shown (image deleted, 41 KiB)

@@ -1,21 +0,0 @@
episodes,rewards,steps
0,177.0,177
1,180.0,180
2,200.0,200
3,200.0,200
4,167.0,167
5,124.0,124
6,128.0,128
7,200.0,200
8,200.0,200
9,200.0,200
10,186.0,186
11,187.0,187
12,200.0,200
13,176.0,176
14,200.0,200
15,200.0,200
16,200.0,200
17,200.0,200
18,185.0,185
19,180.0,180

Binary file not shown (image deleted, 66 KiB)

projects/codes/A2C/task0.py (new file, 142 lines)

@@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 01:19:43
LastEditor: JiangJi
LastEditTime: 2022-11-01 01:21:06
Discription:
'''
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import gym
from common.utils import all_seed,merge_class_attrs
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorSoftmax,Critic
from envs.register import register_env
from a2c import A2C
from config.config import GeneralConfigA2C,AlgoConfigA2C
class Main(Launcher):
def __init__(self) -> None:
super().__init__()
self.cfgs['general_cfg'] = merge_class_attrs(self.cfgs['general_cfg'],GeneralConfigA2C())
self.cfgs['algo_cfg'] = merge_class_attrs(self.cfgs['algo_cfg'],AlgoConfigA2C())
def env_agent_config(self,cfg,logger):
''' create env and agent
'''
register_env(cfg.env_name)
env = gym.make(cfg.env_name,new_step_api=True) # create env
if cfg.seed !=0: # set random seed
all_seed(env,seed = cfg.seed)
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
logger.info(f"n_states: {n_states}, n_actions: {n_actions}") # print info
# update to cfg paramters
setattr(cfg, 'n_states', n_states)
setattr(cfg, 'n_actions', n_actions)
models = {'Actor':ActorSoftmax(n_states,n_actions, hidden_dim = cfg.actor_hidden_dim),'Critic':Critic(n_states,1,hidden_dim=cfg.critic_hidden_dim)}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
for k,v in models.items():
logger.info(f"{k} model name: {type(v).__name__}")
for k,v in memories.items():
logger.info(f"{k} memory name: {type(v).__name__}")
logger.info(f"agent name: {type(agent).__name__}")
return env,agent
def train_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0 # entropy per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.sample_action(state) # sample action
next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += agent.entropy
ep_step += 1
if terminated:
break
agent.update(next_state,ep_entropy) # update agent
return agent,ep_reward,ep_step
def test_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.predict_action(state) # predict action
next_state, reward, terminated, truncated , info = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if terminated:
break
return agent,ep_reward,ep_step
# def train(self,cfg,env,agent,logger):
# logger.info("Start training!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.train_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0 # step per episode
# ep_entropy = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.sample_action(state) # sample action
# next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
# agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
# state = next_state # update state
# ep_reward += reward
# ep_entropy += agent.entropy
# ep_step += 1
# if terminated:
# break
# agent.update(next_state,ep_entropy) # update agent
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish training!")
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
# def test(self,cfg,env,agent,logger):
# logger.info("Start testing!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.test_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.predict_action(state) # predict action
# next_state, reward, terminated, truncated , info = env.step(action)
# state = next_state
# ep_reward += reward
# ep_step += 1
# if terminated:
# break
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish testing!")
# env.close()
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()
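The new template contract: each task script subclasses `Launcher`, merges its algorithm defaults in `__init__`, and implements only `env_agent_config`, `train_one_episode` and `test_one_episode`; the outer loop lives in `Launcher.run()` (defined in `common/launcher.py`, not shown in this diff). A rough, assumed sketch of what that outer loop amounts to:

```python
import logging
from types import SimpleNamespace

def run_template(launcher):
    """Assumed outline of Launcher.run(); the real version also handles YAML overrides,
    checkpoint saving/loading, result CSVs and plotting, which are omitted here."""
    logger = logging.getLogger(__name__)
    g, a = launcher.cfgs['general_cfg'], launcher.cfgs['algo_cfg']
    cfg = SimpleNamespace(**{**vars(g), **vars(a)})  # assume the agent sees both config groups
    env, agent = launcher.env_agent_config(cfg, logger)
    rewards, steps = [], []
    n_eps = cfg.train_eps if cfg.mode == 'train' else cfg.test_eps
    for i_ep in range(n_eps):
        if cfg.mode == 'train':
            agent, ep_reward, ep_step = launcher.train_one_episode(env, agent, cfg)
        else:
            agent, ep_reward, ep_step = launcher.test_one_episode(env, agent, cfg)
        rewards.append(ep_reward)
        steps.append(ep_step)
        logger.info(f"Episode: {i_ep+1}/{n_eps}, Reward: {ep_reward:.2f}, Step: {ep_step}")
    return {'episodes': range(len(rewards)), 'rewards': rewards, 'steps': steps}
```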

projects/codes/A2C/task1.py (new file, 142 lines)

@@ -0,0 +1,142 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 01:19:43
LastEditor: JiangJi
LastEditTime: 2022-11-01 01:21:12
Discription: discrete action task; the only difference from task0.py is that the actor uses ActorSoftmaxTanh instead of ActorSoftmax with ReLU
'''
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import gym
from common.utils import all_seed,merge_class_attrs
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorSoftmaxTanh,Critic
from envs.register import register_env
from a2c import A2C
from config.config import GeneralConfigA2C,AlgoConfigA2C
class Main(Launcher):
def __init__(self) -> None:
super().__init__()
self.cfgs['general_cfg'] = merge_class_attrs(self.cfgs['general_cfg'],GeneralConfigA2C())
self.cfgs['algo_cfg'] = merge_class_attrs(self.cfgs['algo_cfg'],AlgoConfigA2C())
def env_agent_config(self,cfg,logger):
''' create env and agent
'''
register_env(cfg.env_name)
env = gym.make(cfg.env_name,new_step_api=True) # create env
if cfg.seed !=0: # set random seed
all_seed(env,seed = cfg.seed)
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
logger.info(f"n_states: {n_states}, n_actions: {n_actions}") # print info
# update to cfg paramters
setattr(cfg, 'n_states', n_states)
setattr(cfg, 'n_actions', n_actions)
models = {'Actor':ActorSoftmaxTanh(n_states,n_actions, hidden_dim = cfg.actor_hidden_dim),'Critic':Critic(n_states,1,hidden_dim=cfg.critic_hidden_dim)}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
for k,v in models.items():
logger.info(f"{k} model name: {type(v).__name__}")
for k,v in memories.items():
logger.info(f"{k} memory name: {type(v).__name__}")
logger.info(f"agent name: {type(agent).__name__}")
return env,agent
def train_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0 # entropy per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.sample_action(state) # sample action
next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += agent.entropy
ep_step += 1
if terminated:
break
agent.update(next_state,ep_entropy) # update agent
return agent,ep_reward,ep_step
def test_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.predict_action(state) # predict action
next_state, reward, terminated, truncated , info = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if terminated:
break
return agent,ep_reward,ep_step
# def train(self,cfg,env,agent,logger):
# logger.info("Start training!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.train_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0 # step per episode
# ep_entropy = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.sample_action(state) # sample action
# next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
# agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
# state = next_state # update state
# ep_reward += reward
# ep_entropy += agent.entropy
# ep_step += 1
# if terminated:
# break
# agent.update(next_state,ep_entropy) # update agent
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish training!")
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
# def test(self,cfg,env,agent,logger):
# logger.info("Start testing!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.test_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.predict_action(state) # predict action
# next_state, reward, terminated, truncated , info = env.step(action)
# state = next_state
# ep_reward += reward
# ep_step += 1
# if terminated:
# break
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish testing!")
# env.close()
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()

projects/codes/A2C/task2.py (new file, 149 lines)

@@ -0,0 +1,149 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2022-10-30 01:19:43
LastEditor: JiangJi
LastEditTime: 2022-11-01 00:08:22
Discription: continuous action space task (ActorNormal with a Normal policy), #TODO still to be debugged
'''
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import gym
import torch
import numpy as np
from common.utils import all_seed,merge_class_attrs
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorNormal,Critic
from envs.register import register_env
from a2c import A2C
from config.config import GeneralConfigA2C,AlgoConfigA2C
class Main(Launcher):
def __init__(self) -> None:
super().__init__()
self.cfgs['general_cfg'] = merge_class_attrs(self.cfgs['general_cfg'],GeneralConfigA2C())
self.cfgs['algo_cfg'] = merge_class_attrs(self.cfgs['algo_cfg'],AlgoConfigA2C())
def env_agent_config(self,cfg,logger):
''' create env and agent
'''
register_env(cfg.env_name)
env = gym.make(cfg.env_name,new_step_api=True) # create env
if cfg.seed !=0: # set random seed
all_seed(env,seed = cfg.seed)
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
try:
n_actions = env.action_space.n # action dimension
except AttributeError:
n_actions = env.action_space.shape[0]
logger.info(f"action bound: {abs(env.action_space.low.item())}")
setattr(cfg, 'action_bound', abs(env.action_space.low.item()))
logger.info(f"n_states: {n_states}, n_actions: {n_actions}") # print info
# update to cfg paramters
setattr(cfg, 'n_states', n_states)
setattr(cfg, 'n_actions', n_actions)
models = {'Actor':ActorNormal(n_states,n_actions, hidden_dim = cfg.actor_hidden_dim),'Critic':Critic(n_states,1,hidden_dim=cfg.critic_hidden_dim)}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
for k,v in models.items():
logger.info(f"{k} model name: {type(v).__name__}")
for k,v in memories.items():
logger.info(f"{k} memory name: {type(v).__name__}")
logger.info(f"agent name: {type(agent).__name__}")
return env,agent
def train_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0 # entropy per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.sample_action(state) # sample action
next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += agent.entropy
ep_step += 1
if terminated:
break
agent.update(next_state,ep_entropy) # update agent
return agent,ep_reward,ep_step
def test_one_episode(self, env, agent, cfg):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state
for _ in range(cfg.max_steps):
action = agent.predict_action(state) # predict action
next_state, reward, terminated, truncated , info = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if terminated:
break
return agent,ep_reward,ep_step
# def train(self,cfg,env,agent,logger):
# logger.info("Start training!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.train_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0 # step per episode
# ep_entropy = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.sample_action(state) # sample action
# next_state, reward, terminated, truncated , info = env.step(action) # update env and return transitions
# agent.memory.push((agent.value,agent.log_prob,reward)) # save transitions
# state = next_state # update state
# ep_reward += reward
# ep_entropy += agent.entropy
# ep_step += 1
# if terminated:
# break
# agent.update(next_state,ep_entropy) # update agent
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish training!")
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
# def test(self,cfg,env,agent,logger):
# logger.info("Start testing!")
# logger.info(f"Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}")
# rewards = [] # record rewards for all episodes
# steps = [] # record steps for all episodes
# for i_ep in range(cfg.test_eps):
# ep_reward = 0 # reward per episode
# ep_step = 0
# state = env.reset() # reset and obtain initial state
# for _ in range(cfg.max_steps):
# action = agent.predict_action(state) # predict action
# next_state, reward, terminated, truncated , info = env.step(action)
# state = next_state
# ep_reward += reward
# ep_step += 1
# if terminated:
# break
# rewards.append(ep_reward)
# steps.append(ep_step)
# logger.info(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.2f}, Steps:{ep_step}")
# logger.info("Finish testing!")
# env.close()
# return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()
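For this continuous task, `a2c.py` expects the actor to return a `(mu, sigma)` pair and scales `mu` by `action_bound` (2.0 for `Pendulum-v1`). `ActorNormal` is defined in `common/models.py` and is not shown in this commit; a minimal sketch under those assumptions:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ActorNormal(nn.Module):
    """Assumed shape of the continuous-action actor: outputs (mu, sigma) for a Normal policy.
    The real model in common/models.py may differ."""
    def __init__(self, n_states, n_actions, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)
        self.mu_head = nn.Linear(hidden_dim, n_actions)
        self.sigma_head = nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        mu = torch.tanh(self.mu_head(x))               # in [-1, 1]; scaled by action_bound in A2C
        sigma = F.softplus(self.sigma_head(x)) + 1e-3  # keep the standard deviation positive
        return mu, sigma
```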