add DQN_cnn
111
codes/DQN_cnn/main.py
Normal file
@@ -0,0 +1,111 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
@LastEditTime: 2021-03-23 20:43:28
@Description:
@Environment: python 3.7.7
'''
import sys, os
sys.path.append(os.getcwd())  # add current terminal path to sys.path
import datetime

import gym
import torch

from DQN_cnn.env import get_screen
from DQN_cnn.agent import DQNcnn
from common.plot import plot_rewards
from common.utils import save_results

SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0] + "/saved_model/" + SEQUENCE + '/'  # path to save model
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0] + "/results/" + SEQUENCE + '/'  # path to save rewards
os.makedirs(SAVED_MODEL_PATH, exist_ok=True)  # also creates the saved_model/ parent dir
os.makedirs(RESULT_PATH, exist_ok=True)  # also creates the results/ parent dir

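# Note: each run gets its own timestamped subfolder, so repeated runs never
# overwrite earlier results. SAVED_MODEL_PATH is created here but not written
# to anywhere in this file; presumably the agent (DQN_cnn/agent.py, not shown
# in this diff) is meant to save checkpoints there.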

class DQNcnnConfig:
    def __init__(self) -> None:
        self.algo = "DQN_cnn"  # name of algo
        self.gamma = 0.99
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.05
        self.epsilon_decay = 200
        self.lr = 0.01  # learning rate
        self.memory_capacity = 10000  # capacity of the replay memory
        self.batch_size = 64
        self.train_eps = 250  # number of training episodes
        self.train_steps = 200  # max number of steps per training episode
        self.target_update = 4  # update frequency of the target net
        self.eval_eps = 20  # number of evaluation episodes
        self.eval_steps = 200  # max number of steps per evaluation episode
        self.hidden_dim = 128  # dimension of the hidden layers of the network
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # if gpu is to be used
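
# How epsilon_start/epsilon_end/epsilon_decay are combined lives in
# DQN_cnn/agent.py, which is not part of this diff. A plausible sketch,
# following the PyTorch DQN tutorial (steps_done would be a counter of
# total environment steps kept by the agent):
#   epsilon = epsilon_end + (epsilon_start - epsilon_end) * \
#             math.exp(-1. * steps_done / epsilon_decay)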

def train(cfg, env, agent):
    rewards = []
    ma_rewards = []  # moving-average rewards, for a smoother curve
    for i_episode in range(cfg.train_eps):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, cfg.device)
        current_screen = get_screen(env, cfg.device)
        state = current_screen - last_screen
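        # get_screen is defined in DQN_cnn/env.py (not in this diff); the state
        # is the pixel-wise difference of two consecutive rendered frames, so a
        # single input encodes motion that one static frame cannot.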
        ep_reward = 0
        for i_step in range(cfg.train_steps + 1):
            # Select and perform an action
            action = agent.choose_action(state)
            _, reward, done, _ = env.step(action.item())
            ep_reward += reward
            reward = torch.tensor([reward], device=cfg.device)
            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, cfg.device)
            if done:
                # note that breaking here means the terminal transition is
                # never pushed to the replay memory
                break
            state_ = current_screen - last_screen
            # Store the transition in memory
            agent.memory.push(state, action, state_, reward)
            # Move to the next state
            state = state_
            # Perform one step of the optimization (on the policy network)
            agent.update()
        # Update the target network, copying all weights and biases in DQN
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
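        # Freezing the bootstrap target between these periodic hard copies is
        # the standard DQN trick: the Q-target no longer chases the network it
        # is trained against, which stabilizes learning.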
        print('Episode:{}/{}, Reward:{}, Steps:{}, Explore:{:.2f}, Done:{}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, i_step + 1, agent.epsilon, done))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNcnnConfig()
    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen(env,device)
    # Since the state here is an image instead of the default state vector,
    # unwrap the env so the raw screen can be rendered and processed.
    env = gym.make('CartPole-v0').unwrapped
    env.reset()
    init_screen = get_screen(env, cfg.device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get number of actions from gym action space
    action_dim = env.action_space.n
    agent = DQNcnn(screen_height, screen_width, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
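
# Usage note (an assumption from the imports above, not stated in this diff):
# run from the codes/ directory so that DQN_cnn/ and common/ are importable,
# e.g.:
#   cd codes && python DQN_cnn/main.py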