#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
@LastEditTime: 2021-04-05 11:06:23
@Description: train a CNN-based DQN agent on CartPole-v0
@Environment: python 3.7.7
'''
import sys, os
curr_path = os.path.dirname(__file__)
parent_path = os.path.dirname(curr_path)
sys.path.append(parent_path)  # add the parent directory to sys.path so that DQN_cnn and common can be imported
import gym
import torch
import datetime
from DQN_cnn.env import get_screen
from DQN_cnn.agent import DQNcnn
from common.plot import plot_rewards
from common.utils import save_results
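# `get_screen` (imported from DQN_cnn.env above) renders the current CartPole
# frame and returns it as a batched float tensor on the given device. Its real
# implementation is not shown in this file; the hypothetical sketch below,
# based on the PyTorch DQN tutorial that this code follows, illustrates the
# kind of preprocessing involved (render -> CHW float tensor -> down-scale ->
# batch). It is illustrative only (the real helper also crops around the cart)
# and is never called below.
def _get_screen_sketch(env, device):
    import numpy as np
    import torchvision.transforms as T
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))  # HWC -> CHW
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255.0  # scale to [0, 1]
    screen = torch.from_numpy(screen)
    # down-scale so the CNN input stays small (smaller edge -> 40 px)
    resize = T.Compose([T.ToPILImage(), T.Resize(40), T.ToTensor()])
    return resize(screen).unsqueeze(0).to(device)  # add a batch dimension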
SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # timestamp used to tag this run
SAVED_MODEL_PATH = curr_path + "/saved_model/" + SEQUENCE + '/'  # path to save the model
os.makedirs(SAVED_MODEL_PATH, exist_ok=True)
RESULT_PATH = curr_path + "/results/" + SEQUENCE + '/'  # path to save the rewards
os.makedirs(RESULT_PATH, exist_ok=True)
class DQNcnnConfig:
    def __init__(self) -> None:
        self.algo = "DQN_cnn"  # name of the algorithm
        self.gamma = 0.99  # discount factor
        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.05  # final epsilon of the e-greedy policy
        self.epsilon_decay = 200  # decay rate of epsilon
        self.lr = 0.01  # learning rate
        self.memory_capacity = 10000  # capacity of the replay memory
        self.batch_size = 64
        self.train_eps = 250  # number of training episodes
        self.train_steps = 200  # max steps per training episode
        self.target_update = 4  # update frequency of the target net (in episodes)
        self.eval_eps = 20  # number of evaluation episodes
        self.eval_steps = 200  # max steps per evaluation episode
        self.hidden_dim = 128  # dimension of the hidden layer
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")  # if gpu is to be used
def train(cfg, env, agent):
    rewards = []  # rewards of each episode
    ma_rewards = []  # moving-average rewards
    for i_episode in range(cfg.train_eps):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, cfg.device)
        current_screen = get_screen(env, cfg.device)
        state = current_screen - last_screen  # the state is a difference of frames
        ep_reward = 0
        for i_step in range(cfg.train_steps + 1):
            # Select and perform an action
            action = agent.choose_action(state)
            _, reward, done, _ = env.step(action.item())
            ep_reward += reward
            reward = torch.tensor([reward], device=cfg.device)
            # Observe the new state
            last_screen = current_screen
            current_screen = get_screen(env, cfg.device)
            if done:  # note: the terminal transition is not stored
                break
            state_ = current_screen - last_screen
            # Store the transition in memory
            agent.memory.push(state, action, state_, reward)
            # Move to the next state
            state = state_
            # Perform one step of the optimization (on the policy network)
            agent.update()
        # Update the target network, copying all weights and biases from the policy network
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:{}/{}, Reward:{}, Steps:{}, Explore:{:.2f}, Done:{}'.format(
            i_episode + 1, cfg.train_eps, ep_reward, i_step + 1, agent.epsilon, done))
        rewards.append(ep_reward)
        # exponentially weighted moving average of the episode rewards
        if ma_rewards:
            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
        else:
            ma_rewards.append(ep_reward)
    return rewards, ma_rewards
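# agent.update() above performs one DQN optimization step; its actual
# implementation lives in DQN_cnn.agent and is not shown here. The hypothetical
# sketch below illustrates such a step, assuming the replay memory exposes a
# `sample` method returning batched tensors and the agent exposes an
# `optimizer` attribute (both are assumptions, not the real API). Never called.
def _dqn_update_sketch(agent, cfg):
    import torch.nn.functional as F
    if len(agent.memory) < cfg.batch_size:
        return  # wait until enough transitions have been collected
    # assumed shapes: state/next_state (B,C,H,W), action (B,1), reward (B,1)
    state, action, next_state, reward = agent.memory.sample(cfg.batch_size)
    q = agent.policy_net(state).gather(1, action)  # Q(s_t, a_t)
    # max_a Q_target(s_{t+1}, a), detached so no gradients flow into the target net
    next_q = agent.target_net(next_state).max(1)[0].detach().unsqueeze(1)
    expected_q = reward + cfg.gamma * next_q  # one-step TD target
    loss = F.smooth_l1_loss(q, expected_q)  # Huber loss
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()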
if __name__ == "__main__":
    cfg = DQNcnnConfig()
    # Get the screen size so that we can initialize layers correctly based on
    # the shape returned from Gym. Typical dimensions at this point are close
    # to 3x40x90, the result of a clamped and down-scaled render buffer in
    # get_screen(env, device).
    # The state here is an image instead of the default 4-dim vector, so the
    # unwrapped env is used to access the raw render buffer.
    env = gym.make('CartPole-v0').unwrapped
    env.reset()
    init_screen = get_screen(env, cfg.device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get the number of actions from the gym action space
    action_dim = env.action_space.n
    agent = DQNcnn(screen_height, screen_width, action_dim, cfg)
    rewards, ma_rewards = train(cfg, env, agent)
    save_results(rewards, ma_rewards, tag='train', path=RESULT_PATH)
    plot_rewards(rewards, ma_rewards, tag="train", algo=cfg.algo, path=RESULT_PATH)
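    # cfg.eval_eps and cfg.eval_steps are defined but not used in this script.
    # A hypothetical evaluation pass would mirror train() with a greedy policy
    # and no memory pushes or updates, for example (sketch only):
    #   for i_episode in range(cfg.eval_eps):
    #       env.reset()
    #       last_screen = get_screen(env, cfg.device)
    #       current_screen = get_screen(env, cfg.device)
    #       state = current_screen - last_screen
    #       for i_step in range(cfg.eval_steps):
    #           action = agent.choose_action(state)  # ideally with epsilon = 0
    #           _, _, done, _ = env.step(action.item())
    #           last_screen = current_screen
    #           current_screen = get_screen(env, cfg.device)
    #           if done:
    #               break
    #           state = current_screen - last_screen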