update rainbowdqn
@@ -21,8 +21,8 @@ class Actor(nn.Module):
         '''[summary]

         Args:
-            input_dim (int): input dimension; here it equals state_dim
-            output_dim (int): output dimension; here it equals action_dim
+            input_dim (int): input dimension; here it equals n_states
+            output_dim (int): output dimension; here it equals n_actions
             max_action (int): maximum value of the action
         '''
         super(Actor, self).__init__()
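The body of Actor is not part of this hunk; below is a minimal sketch of the network the docstring describes, assuming the common TD3 layout (two hidden layers and a tanh output scaled by max_action). Hidden sizes and layer names are illustrative, not taken from the commit.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Actor(nn.Module):
    def __init__(self, input_dim, output_dim, max_action):
        # input_dim: state dimension (n_states), output_dim: action dimension (n_actions)
        super(Actor, self).__init__()
        self.l1 = nn.Linear(input_dim, 256)   # hidden sizes are assumptions
        self.l2 = nn.Linear(256, 256)
        self.l3 = nn.Linear(256, output_dim)
        self.max_action = max_action

    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        # tanh bounds the output to [-1, 1]; scaling by max_action maps it to the env's action range
        return self.max_action * torch.tanh(self.l3(a))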
@@ -14,13 +14,13 @@ import torch


 class ReplayBuffer(object):
-    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
+    def __init__(self, n_states, n_actions, max_size=int(1e6)):
         self.max_size = max_size
         self.ptr = 0
         self.size = 0
-        self.state = np.zeros((max_size, state_dim))
-        self.action = np.zeros((max_size, action_dim))
-        self.next_state = np.zeros((max_size, state_dim))
+        self.state = np.zeros((max_size, n_states))
+        self.action = np.zeros((max_size, n_actions))
+        self.next_state = np.zeros((max_size, n_states))
         self.reward = np.zeros((max_size, 1))
         self.not_done = np.zeros((max_size, 1))
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
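Only __init__ appears in this hunk; here is a minimal sketch of the buffer with add/sample methods under the same naming, assuming the standard ring-buffer layout used in TD3 reference code (the method names and uniform sampling are assumptions, not shown in the commit).

import numpy as np
import torch

class ReplayBuffer(object):
    # sketch only: __init__ mirrors the hunk above, add/sample are assumed
    def __init__(self, n_states, n_actions, max_size=int(1e6)):
        self.max_size, self.ptr, self.size = max_size, 0, 0
        self.state = np.zeros((max_size, n_states))
        self.action = np.zeros((max_size, n_actions))
        self.next_state = np.zeros((max_size, n_states))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, next_state, reward, done):
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1. - done
        self.ptr = (self.ptr + 1) % self.max_size        # wrap around and overwrite oldest data
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)  # uniform random minibatch
        to_tensor = lambda x: torch.FloatTensor(x).to(self.device)
        return (to_tensor(self.state[idx]), to_tensor(self.action[idx]),
                to_tensor(self.next_state[idx]), to_tensor(self.reward[idx]),
                to_tensor(self.not_done[idx]))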
@@ -74,10 +74,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    td3= TD3(state_dim,action_dim,max_action,cfg)
+    td3= TD3(n_states,n_actions,max_action,cfg)
     cfg.model_path = './TD3/results/HalfCheetah-v2/20210416-130341/models/'
     td3.load(cfg.model_path)
     td3_rewards,td3_ma_rewards = eval(cfg.env,td3,cfg.seed)
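The eval function called on the last line of this hunk is defined elsewhere in the repo; below is a minimal sketch of what such a helper could look like, assuming the old Gym API used here (env.seed, four-tuple step) and a deterministic policy during evaluation. The episode count and the moving-average rule are assumptions.

import gym
import numpy as np

def eval(env_name, agent, seed, eval_episodes=10):
    # sketch of an evaluation helper matching the call above; details are assumed
    env = gym.make(env_name)
    env.seed(seed + 100)                 # offset the seed so eval episodes differ from training
    rewards, ma_rewards = [], []
    for _ in range(eval_episodes):
        state, done, ep_reward = env.reset(), False, 0.0
        while not done:
            action = agent.choose_action(np.array(state))   # deterministic policy, no exploration noise
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        rewards.append(ep_reward)
        # exponential moving average of episode rewards
        ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward if ma_rewards else ep_reward)
    return rewards, ma_rewards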
@@ -72,7 +72,7 @@ def train(cfg,env,agent):
         else:
             action = (
                 agent.choose_action(np.array(state))
-                + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+                + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
             ).clip(-max_action, max_action)
         # Perform action
         next_state, reward, done, _ = env.step(action)
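The only change in this hunk is the size argument of the Gaussian exploration noise, which draws one noise sample per action dimension. A quick self-contained illustration of what that expression produces (all values below are made up, not taken from the commit):

import numpy as np

n_actions, max_action, expl_noise = 3, 2.0, 0.1    # illustrative values only

raw_action = np.array([1.9, -0.5, 0.0])            # stand-in for agent.choose_action(state)
noise = np.random.normal(0, max_action * expl_noise, size=n_actions)  # one sample per action dim
action = (raw_action + noise).clip(-max_action, max_action)  # keep the action inside the env bounds
print(action.shape)  # (3,)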
@@ -121,11 +121,11 @@ def train(cfg,env,agent):
         # else:
         #     action = (
         #             agent.choose_action(np.array(state))
-        #             + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+        #             + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
         #         ).clip(-max_action, max_action)
         # # action = (
         # #         agent.choose_action(np.array(state))
-        # #         + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+        # #         + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
         # #     ).clip(-max_action, max_action)
         # # Perform action
         # next_state, reward, done, _ = env.step(action)
@@ -157,10 +157,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    agent = TD3(state_dim,action_dim,max_action,cfg)
+    agent = TD3(n_states,n_actions,max_action,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     make_dir(cfg.result_path,cfg.model_path)
     agent.save(path=cfg.model_path)
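make_dir is a small utility defined elsewhere in the repo; a plausible sketch, assuming it simply creates each result/model directory if it does not already exist:

import os

def make_dir(*paths):
    # assumed behavior: create every directory passed in, ignoring ones that already exist
    for path in paths:
        os.makedirs(path, exist_ok=True)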
@@ -70,10 +70,10 @@ if __name__ == "__main__":
     env.seed(cfg.seed) # Set seeds
     torch.manual_seed(cfg.seed)
     np.random.seed(cfg.seed)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    td3= TD3(state_dim,action_dim,max_action,cfg)
+    td3= TD3(n_states,n_actions,max_action,cfg)
     cfg.model_path = './TD3/results/Pendulum-v0/20210428-092059/models/'
     cfg.result_path = './TD3/results/Pendulum-v0/20210428-092059/results/'
     td3.load(cfg.model_path)
@@ -79,7 +79,7 @@ def train(cfg,env,agent):
         else:
             action = (
                 agent.choose_action(np.array(state))
-                + np.random.normal(0, max_action * cfg.expl_noise, size=action_dim)
+                + np.random.normal(0, max_action * cfg.expl_noise, size=n_actions)
             ).clip(-max_action, max_action)
         # Perform action
         next_state, reward, done, _ = env.step(action)
@@ -109,10 +109,10 @@ if __name__ == "__main__":
     env.seed(1) # random seed
     torch.manual_seed(1)
     np.random.seed(1)
-    state_dim = env.observation_space.shape[0]
-    action_dim = env.action_space.shape[0]
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
     max_action = float(env.action_space.high[0])
-    agent = TD3(state_dim,action_dim,max_action,cfg)
+    agent = TD3(n_states,n_actions,max_action,cfg)
     rewards,ma_rewards = train(cfg,env,agent)
     make_dir(plot_cfg.result_path,plot_cfg.model_path)
     agent.save(path=plot_cfg.model_path)