update codes
@@ -12,10 +12,10 @@ Environment:
 import torch.optim as optim
 from A2C.model import ActorCritic
 class A2C:
-    def __init__(self,state_dim,action_dim,cfg) -> None:
+    def __init__(self,n_states,n_actions,cfg) -> None:
         self.gamma = cfg.gamma
         self.device = cfg.device
-        self.model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(self.device)
+        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
         self.optimizer = optim.Adam(self.model.parameters())

     def compute_returns(self,next_value, rewards, masks):
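The body of compute_returns lies outside this hunk. A minimal sketch of the usual masked, discounted-return recursion, assuming rewards and masks are lists of per-step tensors and self.gamma comes from the config, could be:

def compute_returns(self, next_value, rewards, masks):
    # Bootstrap from the value estimate of the state after the last step,
    # then accumulate discounted returns backwards through the rollout.
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        # masks[step] is 0 where the episode ended, so bootstrapping stops there.
        R = rewards[step] + self.gamma * R * masks[step]
        returns.insert(0, R)
    return returns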
@@ -13,19 +13,19 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.distributions import Categorical
 class ActorCritic(nn.Module):
-    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
+    def __init__(self, n_states, n_actions, hidden_dim):
         super(ActorCritic, self).__init__()

         self.critic = nn.Sequential(
-            nn.Linear(num_inputs, hidden_size),
+            nn.Linear(n_states, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_size, 1)
+            nn.Linear(hidden_dim, 1)
         )

         self.actor = nn.Sequential(
-            nn.Linear(num_inputs, hidden_size),
+            nn.Linear(n_states, hidden_dim),
             nn.ReLU(),
-            nn.Linear(hidden_size, num_outputs),
+            nn.Linear(hidden_dim, n_actions),
             nn.Softmax(dim=1),
         )

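The forward pass is not part of this hunk. Given the Categorical import and the Softmax output of the actor, a plausible sketch (an assumption, not taken from the commit) is:

def forward(self, x):
    # Critic head: scalar state-value estimate.
    value = self.critic(x)
    # Actor head: action probabilities wrapped in a Categorical
    # distribution for sampling and log-probability computation.
    probs = self.actor(x)
    dist = Categorical(probs)
    return dist, value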
codes/A2C/task0_train.ipynb (new file, 265 lines)
File diff suppressed because one or more lines are too long
@@ -1,8 +1,7 @@
 import sys,os
-curr_path = os.path.dirname(__file__)
-parent_path = os.path.dirname(curr_path)
-sys.path.append(parent_path) # add current terminal path to sys.path
-
+curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add the parent path to sys.path

 import gym
 import numpy as np
@@ -17,17 +16,28 @@ from common.plot import plot_rewards
 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
 class A2CConfig:
     def __init__(self) -> None:
-        self.algo='A2C'
-        self.env= 'CartPole-v0'
-        self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/' # path to save results
-        self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/' # path to save models
-        self.n_envs = 8
-        self.gamma = 0.99
-        self.hidden_size = 256
+        self.algo='A2C' # algorithm name
+        self.env_name= 'CartPole-v0' # environment name
+        self.n_envs = 8 # number of asynchronous environments
+        self.gamma = 0.99 # discount factor in RL
+        self.hidden_dim = 256
         self.lr = 1e-3 # learning rate
         self.max_frames = 30000
         self.n_steps = 5
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class PlotConfig:
+    def __init__(self) -> None:
+        self.algo = "DQN" # algorithm name
+        self.env_name = 'CartPole-v0' # environment name
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # check GPU
+
+        self.result_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/results/' # path to save results
+        self.model_path = curr_path+"/outputs/" + self.env_name + \
+            '/'+curr_time+'/models/' # path to save models
+        self.save = True # whether to save figures
+
+
 def make_envs(env_name):
     def _thunk():
         env = gym.make(env_name)
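make_envs is cut off at the hunk boundary. The usual thunk pattern, where each call returns a zero-argument factory that SubprocVecEnv can invoke inside its own worker process, would be roughly:

def make_envs(env_name):
    def _thunk():
        # Called in a worker process to build that worker's own environment.
        env = gym.make(env_name)
        return env
    return _thunk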
@@ -57,11 +67,11 @@ def compute_returns(next_value, rewards, masks, gamma=0.99):


 def train(cfg,envs):
-    env = gym.make(cfg.env) # a single env
+    env = gym.make(cfg.env_name) # a single env
     env.seed(10)
     state_dim = envs.observation_space.shape[0]
     action_dim = envs.action_space.n
-    model = ActorCritic(state_dim, action_dim, cfg.hidden_size).to(cfg.device)
+    model = ActorCritic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device)
     optimizer = optim.Adam(model.parameters())
     frame_idx = 0
     test_rewards = []
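The rollout-and-update part of train() falls between these hunks. A hedged sketch of a typical A2C loop with cfg.n_steps rollout steps, assuming the script's imports (torch, numpy as np) and the names defined above (model, optimizer, envs, frame_idx, compute_returns) are in scope; the 0.5 and 0.001 loss weights are illustrative assumptions, not taken from the commit:

state = envs.reset()
while frame_idx < cfg.max_frames:
    log_probs, values, rewards, masks = [], [], [], []
    entropy = 0
    for _ in range(cfg.n_steps):
        state = torch.FloatTensor(state).to(cfg.device)
        dist, value = model(state)
        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        log_probs.append(dist.log_prob(action).unsqueeze(1))
        values.append(value)
        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device))
        masks.append(torch.FloatTensor(1.0 - done.astype(np.float32)).unsqueeze(1).to(cfg.device))
        entropy += dist.entropy().mean()
        state = next_state
        frame_idx += 1
    # Bootstrap from the value of the last state and compute n-step returns.
    next_state = torch.FloatTensor(next_state).to(cfg.device)
    _, next_value = model(next_state)
    returns = torch.cat(compute_returns(next_value, rewards, masks)).detach()
    log_probs = torch.cat(log_probs)
    values = torch.cat(values)
    advantage = returns - values
    actor_loss = -(log_probs * advantage.detach()).mean()    # policy-gradient term
    critic_loss = advantage.pow(2).mean()                     # value-regression term
    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy   # entropy bonus for exploration
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()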
@@ -112,9 +122,11 @@ def train(cfg,envs):
     return test_rewards, test_ma_rewards
 if __name__ == "__main__":
     cfg = A2CConfig()
-    envs = [make_envs(cfg.env) for i in range(cfg.n_envs)]
-    envs = SubprocVecEnv(envs) # 8 env
+    plot_cfg = PlotConfig()
+    envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)]
+    envs = SubprocVecEnv(envs)
+    # train
     rewards,ma_rewards = train(cfg,envs)
-    make_dir(cfg.result_path,cfg.model_path)
-    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
-    plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
+    make_dir(plot_cfg.result_path,plot_cfg.model_path)
+    save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # save results
+    plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # plot results
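ma_rewards is not defined in these hunks; it usually denotes a smoothed reward curve used only for plotting. A hedged sketch of the usual exponential moving average (the helper name and smoothing factor are assumptions, not taken from the commit):

def smooth_rewards(rewards, alpha=0.9):
    # Exponentially weighted moving average, used to draw a smoother curve.
    ma_rewards = []
    for r in rewards:
        if ma_rewards:
            ma_rewards.append(alpha * ma_rewards[-1] + (1 - alpha) * r)
        else:
            ma_rewards.append(r)
    return ma_rewards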