add DQN_cnn
111
codes/DQN_cnn/main.py
Normal file
@@ -0,0 +1,111 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
@LastEditTime: 2021-03-23 20:43:28
@Description:
@Environment: python 3.7.7
'''
import sys, os
sys.path.append(os.getcwd())  # add current terminal path to sys.path
import datetime

import gym
import torch

from DQN_cnn.env import get_screen
from DQN_cnn.agent import DQNcnn
from common.plot import plot_rewards
from common.utils import save_results

SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # obtain current time
SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0] + "/saved_model/" + SEQUENCE + '/'  # path to save model
RESULT_PATH = os.path.split(os.path.abspath(__file__))[0] + "/results/" + SEQUENCE + '/'  # path to save rewards
os.makedirs(SAVED_MODEL_PATH, exist_ok=True)  # also creates the saved_model/ parent dir
os.makedirs(RESULT_PATH, exist_ok=True)  # also creates the results/ parent dir

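# Note: each run gets its own timestamped subfolder, so repeated runs never
# overwrite earlier results. SAVED_MODEL_PATH is created here but not written
# to anywhere in this file; presumably the agent (DQN_cnn/agent.py, not shown
# in this diff) is meant to save checkpoints there.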

class DQNcnnConfig:
    def __init__(self) -> None:
        self.algo = "DQN_cnn"  # name of algo
        self.gamma = 0.99
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.05
        self.epsilon_decay = 200
        self.lr = 0.01  # learning rate
        self.memory_capacity = 10000  # capacity of the replay memory
        self.batch_size = 64
        self.train_eps = 250  # number of training episodes
        self.train_steps = 200  # max number of steps per training episode
        self.target_update = 4  # update frequency of the target net
        self.eval_eps = 20  # number of evaluation episodes
        self.eval_steps = 200  # max number of steps per evaluation episode
        self.hidden_dim = 128  # dimension of the hidden layers of the network
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # if gpu is to be used
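
# How epsilon_start/epsilon_end/epsilon_decay are combined lives in
# DQN_cnn/agent.py, which is not part of this diff. A plausible sketch,
# following the PyTorch DQN tutorial (steps_done would be a counter of
# total environment steps kept by the agent):
#   epsilon = epsilon_end + (epsilon_start - epsilon_end) * \
#             math.exp(-1. * steps_done / epsilon_decay)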

def train(cfg, env, agent):
    rewards = []
    ma_rewards = []  # moving-average rewards, for a smoother curve
    for i_episode in range(cfg.train_eps):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, cfg.device)
        current_screen = get_screen(env, cfg.device)
        state = current_screen - last_screen
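        # get_screen is defined in DQN_cnn/env.py (not in this diff); the state
        # is the pixel-wise difference of two consecutive rendered frames, so a
        # single input encodes motion that one static frame cannot.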
        ep_reward = 0
        for i_step in range(cfg.train_steps + 1):
            # Select and perform an action
            action = agent.choose_action(state)
            _, reward, done, _ = env.step(action.item())
            ep_reward += reward
            reward = torch.tensor([reward], device=cfg.device)
            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, cfg.device)
            if done:
                # note that breaking here means the terminal transition is
                # never pushed to the replay memory
                break
            state_ = current_screen - last_screen
            # Store the transition in memory
            agent.memory.push(state, action, state_, reward)
            # Move to the next state
            state = state_
            # Perform one step of the optimization (on the policy network)
            agent.update()
        # Update the target network, copying all weights and biases in DQN
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
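        # Freezing the bootstrap target between these periodic hard copies is
        # the standard DQN trick: the Q-target no longer chases the network it
        # is trained against, which stabilizes learning.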
        print('Episode:{}/{}, Reward:{}, Steps:{}, Explore:{:.2f}, Done:{}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, i_step + 1, agent.epsilon, done))
        rewards.append(ep_reward)
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards


if __name__ == "__main__":
    cfg = DQNcnnConfig()
    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen(env,device)
    # Since the state here is an image instead of the default state vector,
    # unwrap the env so the raw screen can be rendered and processed.
    env = gym.make('CartPole-v0').unwrapped
    env.reset()
    init_screen = get_screen(env, cfg.device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get number of actions from gym action space
    action_dim = env.action_space.n
    agent = DQNcnn(screen_height, screen_width, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
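
# Usage note (an assumption from the imports above, not stated in this diff):
# run from the codes/ directory so that DQN_cnn/ and common/ are importable,
# e.g.:
#   cd codes && python DQN_cnn/main.py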