update rainbowdqn
codes/DQN/test copy.py  184 lines  Normal file
@@ -0,0 +1,184 @@
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import gym
import time
from collections import deque
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Activation, Flatten, Conv1D, MaxPooling1D, Reshape
import matplotlib.pyplot as plt
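
# DQN agent for Breakout-ram-v0. The 128-byte Atari RAM observation is expanded
# to 512 two-bit features by transform(), and two identical MLPs are kept: an
# online "evaluation" network that is trained during play and a target network
# that is periodically synced to it for computing bootstrap targets.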
class DQN:
    def __init__(self, env):
        self.env = env
        self.memory = deque(maxlen=400000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = self.epsilon_min / 500000

        self.batch_size = 32
        self.train_start = 1000
        self.state_size = self.env.observation_space.shape[0] * 4
        self.action_size = self.env.action_space.n
        self.learning_rate = 0.00025

        self.evaluation_model = self.create_model()
        self.target_model = self.create_model()
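    # Q-network: a fully connected MLP mapping the 512-feature state to one
    # Q-value per action. The optimizer keywords (lr, decay, epsilon) match the
    # older standalone-Keras/TF1-era RMSprop signature; on recent tf.keras these
    # are learning_rate and (for the squared-gradient moving average) rho, so
    # they may need adjusting on a newer stack. That reading of the arguments is
    # an assumption; the original call is kept unchanged below.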
    def create_model(self):
        model = Sequential()
        model.add(Dense(128*2, input_dim=self.state_size, activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(128*2, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.RMSprop(lr=self.learning_rate, decay=0.99, epsilon=1e-6))
        return model
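    # Epsilon-greedy action selection. Epsilon stays at 1.0 for the first
    # 50000 environment steps, then is annealed by epsilon_min/500000 per step
    # towards epsilon_min; with these constants the anneal is very slow, which
    # appears intended for long runs.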
    def choose_action(self, state, steps):
        if steps > 50000:
            if self.epsilon > self.epsilon_min:
                self.epsilon -= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.evaluation_model.predict(state)[0])
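    # Store one transition in the replay buffer; memory_counter just tracks the
    # total number of transitions seen.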
    def remember(self, cur_state, action, reward, new_state, done):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        transition = (cur_state, action, reward, new_state, done)
        self.memory.append(transition)

        self.memory_counter += 1
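    # One DQN training step: sample a random minibatch, compute the Bellman
    # target r + gamma * max_a' Q_target(s', a') (or just r on terminal
    # transitions), and fit the online network towards those targets with a
    # single gradient update.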
    def replay(self):
        if len(self.memory) < self.train_start:
            return

        mini_batch = random.sample(self.memory, self.batch_size)

        update_input = np.zeros((self.batch_size, self.state_size))
        update_target = np.zeros((self.batch_size, self.action_size))

        for i in range(self.batch_size):
            state, action, reward, new_state, done = mini_batch[i]
            target = self.evaluation_model.predict(state)[0]

            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * np.amax(self.target_model.predict(new_state)[0])

            update_input[i] = state
            update_target[i] = target

        self.evaluation_model.fit(update_input, update_target, batch_size=self.batch_size, epochs=1, verbose=0)
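    # Hard update: copy the online network's weights into the target network.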
    def target_train(self):
        self.target_model.set_weights(self.evaluation_model.get_weights())
        return
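    # Plot the average reward recorded every 100 episodes.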
    def visualize(self, reward, episode):
        plt.plot(episode, reward, 'ob-')
        plt.title('Average reward per 100 episodes')
        plt.ylabel('Reward')
        plt.xlabel('Episodes')
        plt.grid()
        plt.show()
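    # Expand the 128-byte RAM observation into 512 two-bit features: each byte
    # is written as an 8-bit binary string and split into four 2-bit chunks
    # (values 0-3). States that are already 512 wide are passed through.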
    def transform(self, state):
        if state.shape[1] == 512:
            return state
        a = [np.binary_repr(x, width=8) for x in state[0]]
        res = []
        for x in a:
            res.extend([x[:2], x[2:4], x[4:6], x[6:]])
        res = [int(x, 2) for x in res]
        return np.array(res)
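
# Expose only GPU 0 to TensorFlow.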
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def main():
    # env = gym.make('Breakout-ram-v0')
    env = gym.make('Breakout-ram-v0')
    env = env.unwrapped

    print(env.action_space)
    print(env.observation_space.shape[0])
    print(env.observation_space.high)
    print(env.observation_space.low)

    #print(env.observation_space.shape)
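    # Training schedule and bookkeeping: 5000 episodes of at most 10000 steps
    # each, plus running reward/step counters and lists for the reward curve.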
    episodes = 5000
    trial_len = 10000

    tmp_reward = 0
    sum_rewards = 0
    n_success = 0
    total_steps = 0

    graph_reward = []
    graph_episodes = []
    time_record = []
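    # Main training loop: every episode the RAM state is reshaped, expanded by
    # transform() and scaled by 1/4; replay() runs every 4 steps and the target
    # network is re-synced every 5000 steps, both only after a warm-up of
    # 10000 environment steps.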
    dqn_agent = DQN(env=env)
    for i_episode in range(episodes):
        start_time = time.time()
        total_reward = 0
        cur_state = env.reset().reshape(1, 128)
        cur_state = dqn_agent.transform(cur_state).reshape(1, 128*4) / 4
        i_step = 0
        for step in range(trial_len):
            #env.render()
            i_step += 1
            action = dqn_agent.choose_action(cur_state, total_steps)
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, 128)
            new_state = dqn_agent.transform(new_state).reshape(1, 128*4) / 4
            total_reward += reward
            sum_rewards += reward
            tmp_reward += reward
            if reward > 0:  # clip positive rewards to 1 before storing
                reward = 1
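            # After the 10000-step warm-up, train the online net every 4 steps
            # and hard-sync the target net every 5000 steps.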
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            if total_steps > 10000:
                if total_steps % 4 == 0:
                    dqn_agent.replay()
                if total_steps % 5000 == 0:
                    dqn_agent.target_train()

            cur_state = new_state
            total_steps += 1
            if done:
                env.reset()
                break
        if (i_episode+1) % 100 == 0:
            graph_reward.append(sum_rewards/100)
            graph_episodes.append(i_episode+1)
            sum_rewards = 0
            print("Episode ", i_episode+1, " Reward: ")
            print(graph_reward[-1])
        end_time = time.time()
        time_record.append(end_time-start_time)
        print("NOW in episode: " + str(i_episode))
        print("Time cost: " + str(end_time-start_time))
        print("Reward: ", tmp_reward)
        print("Step:", i_step)
        tmp_reward = 0
    print("Reward: ")
    print(graph_reward)
    print("Episode: ")
    print(graph_episodes)
    print("Average_time: ")
    print(sum(time_record)/len(time_record))
    dqn_agent.visualize(graph_reward, graph_episodes)


if __name__ == '__main__':
    main()