Files
easy-rl/codes/DQN/test copy.py
2022-05-31 01:20:58 +08:00

184 lines
6.0 KiB
Python

import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Activation, Flatten, Conv1D, MaxPooling1D,Reshape
import matplotlib.pyplot as plt
class DQN:
    """Deep Q-Network agent with experience replay and a target network.

    Two identically shaped dense networks are kept: ``evaluation_model`` is
    trained and used to pick actions, while ``target_model`` supplies the
    bootstrapped value of the next state and is only refreshed when
    ``target_train`` is called.
    """

    def __init__(self, env):
        """Store hyperparameters and build both Q-networks.

        Args:
            env: Gym-style environment. It must expose
                ``observation_space.shape`` and a discrete ``action_space``
                providing ``n`` and ``sample()``.
        """
        self.env = env
        self.memory = deque(maxlen=400000)
        self.memory_counter = 0
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        # Linear annealing from the initial epsilon down to epsilon_min over
        # 500000 decayed steps. The previous step of epsilon_min / 500000
        # (2e-8) would have needed ~50 million steps to reach the floor.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 500000
        self.batch_size = 32
        self.train_start = 1000
        # Every observation entry is expanded into four values by transform().
        self.state_size = self.env.observation_space.shape[0] * 4
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.00025
        self.evaluation_model = self.create_model()
        self.target_model = self.create_model()

    def create_model(self):
        """Build and compile a Q-network with three hidden layers of 256 units.

        Returns:
            A compiled Keras ``Sequential`` model mapping a ``state_size``
            vector to one linear output per action.
        """
        model = Sequential()
        model.add(Dense(128 * 2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(128 * 2, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # ``rho`` is RMSprop's discount for the squared-gradient average.
        # The former ``decay=0.99`` keyword is Keras' learning-rate time decay,
        # which shrinks the step size towards zero after a few hundred updates
        # and is rejected by newer Keras optimizers.
        optimizer = optimizers.RMSprop(
            learning_rate=self.learning_rate, rho=0.99, epsilon=1e-6
        )
        model.compile(loss='mean_squared_error', optimizer=optimizer)
        return model

    def choose_action(self, state, steps):
        """Pick an action epsilon-greedily and anneal epsilon.

        Args:
            state: Network input of shape ``(1, state_size)``.
            steps: Number of environment steps taken so far; epsilon only
                starts to decay once this exceeds 50000.

        Returns:
            A random action with probability ``epsilon``, otherwise the index
            of the largest Q-value predicted by ``evaluation_model``.
        """
        if steps > 50000 and self.epsilon > self.epsilon_min:
            # Clamp so floating-point error cannot push epsilon below the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        q_values = self.evaluation_model.predict(state, verbose=0)[0]
        return np.argmax(q_values)

    def remember(self, cur_state, action, reward, new_state, done):
        """Append one transition to the replay buffer and count it."""
        self.memory.append((cur_state, action, reward, new_state, done))
        self.memory_counter += 1

    def replay(self):
        """Fit ``evaluation_model`` on one random mini-batch from memory.

        Does nothing until the buffer holds at least ``train_start``
        transitions. For each sampled transition only the Q-value of the
        action taken is replaced: by ``reward`` for terminal transitions,
        otherwise by ``reward + gamma * max(target_model(new_state))``.
        """
        if len(self.memory) < self.train_start:
            return
        mini_batch = random.sample(self.memory, self.batch_size)
        row_shape = (self.state_size,)
        states = np.array(
            [np.reshape(t[0], row_shape) for t in mini_batch], dtype=float
        )
        next_states = np.array(
            [np.reshape(t[3], row_shape) for t in mini_batch], dtype=float
        )
        actions = [t[1] for t in mini_batch]
        rewards = np.array([t[2] for t in mini_batch], dtype=float)
        dones = np.array([t[4] for t in mini_batch], dtype=bool)

        # Two batched forward passes instead of two predict() calls per sample.
        targets = np.array(
            self.evaluation_model.predict(states, verbose=0), dtype=float
        )
        next_best = self.target_model.predict(next_states, verbose=0).max(axis=1)
        targets[np.arange(self.batch_size), actions] = np.where(
            dones, rewards, rewards + self.gamma * next_best
        )
        self.evaluation_model.fit(
            states, targets, batch_size=self.batch_size, epochs=1, verbose=0
        )

    def target_train(self):
        """Copy the weights of ``evaluation_model`` into ``target_model``."""
        self.target_model.set_weights(self.evaluation_model.get_weights())

    def visualize(self, reward, episode):
        """Plot ``reward`` against ``episode`` and show the figure."""
        plt.plot(episode, reward, 'ob-')
        plt.title('Average reward each 100 episode')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()

    def transform(self, state):
        """Split every value of ``state[0]`` into four 2-bit fields.

        Args:
            state: 2-D integer array; only its first row is encoded. An array
                whose second dimension is already 512 is returned unchanged.

        Returns:
            A 1-D array four times as long as ``state[0]``, holding the
            2-bit groups (values 0-3) of each entry's 8-bit representation,
            most significant pair first.
        """
        if state.shape[1] == 512:
            return state
        fields = []
        for value in state[0]:
            bits = np.binary_repr(value, width=8)
            fields.extend(
                int(pair, 2) for pair in (bits[:2], bits[2:4], bits[4:6], bits[6:])
            )
        return np.array(fields)
# Runs at import time: sets CUDA_VISIBLE_DEVICES to "0" for this process so
# CUDA-backed libraries only see the GPU with index 0.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def _encode_observation(agent, observation):
    """Turn a raw 128-entry observation into the scaled (1, 512) network input."""
    raw = observation.reshape(1, 128)
    return agent.transform(raw).reshape(1, 128 * 4) / 4


def main():
    """Train a DQN agent on ``Breakout-ram-v0`` and plot the averaged rewards.

    Runs 5000 episodes of at most 10000 steps each. Per-episode timing and
    reward are printed, the mean reward of every 100 episodes is recorded,
    and the recorded means are plotted once training finishes.
    """
    env = gym.make('Breakout-ram-v0')
    env = env.unwrapped
    print(env.action_space)
    print(env.observation_space.shape[0])
    print(env.observation_space.high)
    print(env.observation_space.low)
    episodes = 5000
    trial_len = 10000
    tmp_reward = 0      # unclipped reward of the current episode
    sum_rewards = 0     # unclipped reward of the current 100-episode window
    total_steps = 0
    graph_reward = []
    graph_episodes = []
    time_record = []
    dqn_agent = DQN(env=env)
    for i_episode in range(episodes):
        start_time = time.time()
        cur_state = _encode_observation(dqn_agent, env.reset())
        i_step = 0
        for step in range(trial_len):
            i_step += 1
            action = dqn_agent.choose_action(cur_state, total_steps)
            new_state, reward, done, _ = env.step(action)
            new_state = _encode_observation(dqn_agent, new_state)
            sum_rewards += reward
            tmp_reward += reward
            # Statistics above use the raw reward; the agent stores it
            # capped at 1.
            if reward > 0:
                reward = 1
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            # After 10000 warm-up steps: train every 4th step and sync the
            # target network every 5000th step.
            if total_steps > 10000:
                if total_steps % 4 == 0:
                    dqn_agent.replay()
                if total_steps % 5000 == 0:
                    dqn_agent.target_train()
            cur_state = new_state
            total_steps += 1
            if done:
                # No reset here: the next episode starts with its own reset.
                break
        if (i_episode + 1) % 100 == 0:
            graph_reward.append(sum_rewards / 100)
            graph_episodes.append(i_episode + 1)
            sum_rewards = 0
            print("Episode ", i_episode + 1, " Reward: ")
            print(graph_reward[-1])
        end_time = time.time()
        time_record.append(end_time - start_time)
        print("NOW in episode: " + str(i_episode))
        print("Time cost: " + str(end_time - start_time))
        print("Reward: ", tmp_reward)
        print("Step:", i_step)
        tmp_reward = 0
    print("Reward: ")
    print(graph_reward)
    print("Episode: ")
    print(graph_episodes)
    print("Average_time: ")
    # Average over the episodes actually timed rather than a fixed 5000.
    print(sum(time_record) / max(len(time_record), 1))
    dqn_agent.visualize(graph_reward, graph_episodes)
# Start training only when this file is executed as a script.
if __name__ == "__main__":
    main()