更新算法模版

2022-11-06 12:15:36 +08:00
parent 466a17707f
commit dc78698262
256 changed files with 17282 additions and 10229 deletions
--- a/projects/codes/PPO/ppo2.py
+++ b/projects/codes/PPO/ppo2.py
@@ -1,99 +1,53 @@
 #!/usr/bin/env python
 # coding=utf-8
 '''
-Author: John
+Author: JiangJi
 Email: johnjim0816@gmail.com
-Date: 2021-03-23 15:17:42
-LastEditor: John
-LastEditTime: 2021-12-31 19:38:33
-Discription: 
-Environment: 
+Date: 2022-09-26 16:11:36
+LastEditor: JiangJi
+LastEditTime: 2022-10-31 00:36:37
+Description: PPO-clip
 '''
+
 import os
 import numpy as np
 import torch 
 import torch.optim as optim
-import torch.nn as nn
 from torch.distributions.categorical import Categorical
-class PPOMemory:
-    def __init__(self, batch_size):
-        self.states = []
-        self.probs = []
-        self.vals = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.batch_size = batch_size
-    def sample(self):
-        batch_step = np.arange(0, len(self.states), self.batch_size)
-        indices = np.arange(len(self.states), dtype=np.int64)
-        np.random.shuffle(indices)
-        batches = [indices[i:i+self.batch_size] for i in batch_step]
-        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
-                np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
-                
-    def push(self, state, action, probs, vals, reward, done):
-        self.states.append(state)
-        self.actions.append(action)
-        self.probs.append(probs)
-        self.vals.append(vals)
-        self.rewards.append(reward)
-        self.dones.append(done)

-    def clear(self):
-        self.states = []
-        self.probs = []
-        self.actions = []
-        self.rewards = []
-        self.dones = []
-        self.vals = []
-class Actor(nn.Module):
-    def __init__(self,n_states, n_actions,
-            hidden_dim):
-        super(Actor, self).__init__()

-        self.actor = nn.Sequential(
-                nn.Linear(n_states, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, n_actions),
-                nn.Softmax(dim=-1)
-        )
-    def forward(self, state):
-        dist = self.actor(state)
-        dist = Categorical(dist)
-        return dist
-
-class Critic(nn.Module):
-    def __init__(self, n_states,hidden_dim):
-        super(Critic, self).__init__()
-        self.critic = nn.Sequential(
-                nn.Linear(n_states, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, hidden_dim),
-                nn.ReLU(),
-                nn.Linear(hidden_dim, 1)
-        )
-    def forward(self, state):
-        value = self.critic(state)
-        return value
 class PPO:
-    def __init__(self, n_states, n_actions,cfg):
+    def __init__(self, models,memory,cfg):
+        """Set up the agent from pre-built networks, a memory object and a config.
+
+        Args:
+            models: mapping indexed with the keys 'Actor' and 'Critic'; both
+                entries are moved to ``self.device``.
+            memory: object stored as ``self.memory`` (the update code visible
+                in this file calls ``clear()`` on it).
+            cfg: config object; the attributes read here are gamma,
+                continuous, policy_clip, n_epochs, batch_size, gae_lambda,
+                device, actor_lr and critic_lr.
+        """
        self.gamma = cfg.gamma
-        self.continuous = cfg.continuous 
+        self.continuous = cfg.continuous
        self.policy_clip = cfg.policy_clip
        self.n_epochs = cfg.n_epochs
+        self.batch_size = cfg.batch_size
        self.gae_lambda = cfg.gae_lambda
-        self.device = cfg.device
-        self.actor = Actor(n_states, n_actions,cfg.hidden_dim).to(self.device)
-        self.critic = Critic(n_states,cfg.hidden_dim).to(self.device)
+        self.device = torch.device(cfg.device) 
+        self.actor = models['Actor'].to(self.device)
+        self.critic = models['Critic'].to(self.device)
+        # actor and critic each get their own Adam optimizer and learning rate
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
-        self.memory = PPOMemory(cfg.batch_size)
+        self.memory = memory
        self.loss = 0

-    def choose_action(self, state):
+    def sample_action(self, state):
+        """Sample an action for a single state and evaluate the critic on it.
+
+        Args:
+            state: one observation; it is wrapped in a length-1 batch and
+                converted to a float tensor on ``self.device``.
+
+        Returns:
+            A tuple ``(action, probs, value)``: the sampled action, the
+            log-probability of that action as a Python scalar, and the
+            critic output for the state as a Python scalar.
+        """
+        state = np.array([state]) # build a NumPy array first; converting that to a tensor is more efficient
+        state = torch.tensor(state, dtype=torch.float).to(self.device)
+        probs = self.actor(state)
+        dist = Categorical(probs)
+        value = self.critic(state)
+        action = dist.sample()
+        # from here on `probs` holds the log-probability of the sampled action, not the actor output
+        probs = torch.squeeze(dist.log_prob(action)).item()
+        if self.continuous:
+            # NOTE(review): the action is still a Categorical sample here and is returned as a tensor,
+            # unlike the scalar returned by the other branch — confirm this is intended
+            action = torch.tanh(action)
+        else:
+            action = torch.squeeze(action).item()
+        value = torch.squeeze(value).item()
+        return action, probs, value
+    @torch.no_grad()
+    def predict_action(self, state):
        state = np.array([state]) # 先转成数组再转tensor更高效
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        dist = self.actor(state)
@@ -148,12 +102,15 @@ class PPO:
                self.actor_optimizer.step()
                self.critic_optimizer.step()
        self.memory.clear()  
-    def save(self,path):
+    def save_model(self,path):
+        """Save the actor and critic state dicts as files inside ``path``.
+
+        Args:
+            path: target directory; it is created together with any missing
+                parent directories, and an already existing directory is
+                accepted. The files written are 'ppo_actor.pt' and
+                'ppo_critic.pt'.
+        """
+        from pathlib import Path
+        # create path
+        Path(path).mkdir(parents=True, exist_ok=True)
        actor_checkpoint = os.path.join(path, 'ppo_actor.pt')
        critic_checkpoint= os.path.join(path, 'ppo_critic.pt')
        torch.save(self.actor.state_dict(), actor_checkpoint)
        torch.save(self.critic.state_dict(), critic_checkpoint)
-    def load(self,path):
+    def load_model(self,path):
        actor_checkpoint = os.path.join(path, 'ppo_actor.pt')
        critic_checkpoint= os.path.join(path, 'ppo_critic.pt')
        self.actor.load_state_dict(torch.load(actor_checkpoint))