diff --git a/codes/A2C/.vscode/settings.json b/codes/A2C/.vscode/settings.json deleted file mode 100644 index be0f1ab..0000000 --- a/codes/A2C/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python" -} \ No newline at end of file diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py index a5a2fee..af1201b 100644 --- a/codes/A2C/agent.py +++ b/codes/A2C/agent.py @@ -5,19 +5,18 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-03 20:47:09 LastEditor: John -LastEditTime: 2020-11-08 22:16:29 +LastEditTime: 2021-03-20 17:41:21 Discription: Environment: ''' -from model import ActorCritic +from A2C.model import ActorCritic import torch.optim as optim class A2C: - def __init__(self,n_states, n_actions, hidden_dim=256,device="cpu",lr = 3e-4): - self.device = device + def __init__(self,n_states, n_actions, cfg): self.gamma = 0.99 - self.model = ActorCritic(n_states, n_actions, hidden_dim=hidden_dim).to(device) - self.optimizer = optim.Adam(self.model.parameters(),lr=lr) + self.model = ActorCritic(n_states, n_actions, hidden_dim=cfg.hidden_dim).to(cfg.device) + self.optimizer = optim.Adam(self.model.parameters(),lr=cfg.lr) def choose_action(self, state): dist, value = self.model(state) action = dist.sample() diff --git a/codes/A2C/common/__init__.py b/codes/A2C/common/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/codes/A2C/env.py b/codes/A2C/env.py index fda92d8..34d6d70 100644 --- a/codes/A2C/env.py +++ b/codes/A2C/env.py @@ -5,13 +5,13 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-10-30 15:39:37 LastEditor: John -LastEditTime: 2020-11-03 20:52:07 +LastEditTime: 2021-03-17 20:19:14 Discription: Environment: ''' import gym -from common.multiprocessing_env import SubprocVecEnv +from A2C.multiprocessing_env import SubprocVecEnv # num_envs = 16 # env_name = "Pendulum-v0" diff --git a/codes/A2C/main.py b/codes/A2C/main.py index 65d8a32..08a1e1d 100644 --- a/codes/A2C/main.py +++ b/codes/A2C/main.py @@ -5,94 +5,73 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-11 20:58:21 @LastEditor: John -LastEditTime: 2020-11-08 22:19:56 +LastEditTime: 2021-03-20 16:58:04 @Discription: @Environment: python 3.7.9 ''' +import sys,os +sys.path.append(os.getcwd()) # add current terminal path import torch import gym -import os -import numpy as np -import argparse -from torch.utils.tensorboard import SummaryWriter - -from agent import A2C -from env import make_envs -from utils import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH -from utils import save_model,save_results - -def get_args(): - '''模型建立好之后只需要在这里调参 - ''' - parser = argparse.ArgumentParser() - parser.add_argument("--train", default=1, type=int) # 1 表示训练,0表示只进行eval - parser.add_argument("--gamma", default=0.99, - type=float) # reward 折扣因子 - parser.add_argument("--lr", default=3e-4, type=float) # critic学习率 - parser.add_argument("--actor_lr", default=1e-4, type=float) - parser.add_argument("--memory_capacity", default=10000, - type=int, help="capacity of Replay Memory") - parser.add_argument("--batch_size", default=128, type=int, - help="batch size of memory sampling") - parser.add_argument("--train_eps", default=4000, type=int) - parser.add_argument("--train_steps", default=5, type=int) - parser.add_argument("--eval_eps", default=200, type=int) # 训练的最大episode数目 - parser.add_argument("--eval_steps", default=200, - type=int) # 训练每个episode的长度 - parser.add_argument("--target_update", default=4, type=int, - help="when(every default 10 eisodes) to update target net ") 
- config = parser.parse_args() - return config - -def test_env(agent,device='cpu'): - env = gym.make("CartPole-v0") - state = env.reset() - ep_reward=0 - for _ in range(200): - state = torch.FloatTensor(state).unsqueeze(0).to(device) - dist, value = agent.model(state) - action = dist.sample() - next_state, reward, done, _ = env.step(action.cpu().numpy()[0]) - state = next_state - ep_reward += reward - if done: - break - return ep_reward +import datetime +from A2C.agent import A2C -def train(cfg): - print('Start to train ! \n') - envs = make_envs(num_envs=16,env_name="CartPole-v0") - n_states = envs.observation_space.shape[0] - n_actions = envs.action_space.n - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - agent = A2C(n_states, n_actions, hidden_dim=256) - # moving_average_rewards = [] - # ep_steps = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE - writer = SummaryWriter(log_dir) - state = envs.reset() - for i_episode in range(1, cfg.train_eps+1): + +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): + os.mkdir(RESULT_PATH) + +class A2CConfig: + def __init__(self): + self.gamma = 0.99 + self.lr = 3e-4 # learnning rate + self.actor_lr = 1e-4 # learnning rate of actor network + self.memory_capacity = 10000 # capacity of replay memory + self.batch_size = 128 + self.train_eps = 200 + self.train_steps = 200 + self.eval_eps = 200 + self.eval_steps = 200 + self.target_update = 4 + self.hidden_dim=256 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +def train(cfg,env,agent): + print('Start to train ! 
') + for i_episode in range(cfg.train_eps): + state = env.reset() log_probs = [] values = [] rewards = [] masks = [] entropy = 0 - for i_step in range(1, cfg.train_steps+1): - state = torch.FloatTensor(state).to(device) + ep_reward = 0 + for i_step in range(cfg.train_steps): + state = torch.FloatTensor(state).to(cfg.device) dist, value = agent.model(state) action = dist.sample() - next_state, reward, done, _ = envs.step(action.cpu().numpy()) + next_state, reward, done, _ = env.step(action.cpu().numpy()) + ep_reward+=reward state = next_state log_prob = dist.log_prob(action) entropy += dist.entropy().mean() log_probs.append(log_prob) values.append(value) - rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device)) - masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device)) - if i_episode%20 == 0: - print("reward",test_env(agent,device='cpu')) - next_state = torch.FloatTensor(next_state).to(device) + rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)) + masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device)) + if done: + break + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done)) + next_state = torch.FloatTensor(next_state).to(cfg.device) _, next_value =agent.model(next_state) returns = agent.compute_returns(next_value, rewards, masks) @@ -107,80 +86,17 @@ def train(cfg): agent.optimizer.zero_grad() loss.backward() agent.optimizer.step() - for _ in range(100): - print("test_reward",test_env(agent,device='cpu')) - - # print('Episode:', i_episode, ' Reward: %i' % - # int(ep_reward[0]), 'n_steps:', i_step) - # ep_steps.append(i_step) - # rewards.append(ep_reward) - # if i_episode == 1: - # moving_average_rewards.append(ep_reward[0]) - # else: - # moving_average_rewards.append( - # 0.9*moving_average_rewards[-1]+0.1*ep_reward[0]) - # writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) - # writer.add_scalar('steps_of_each_episode', - # ep_steps[-1], i_episode) - writer.close() + print('Complete training!') - ''' 保存模型 ''' - # save_model(agent,model_path=SAVED_MODEL_PATH) - # '''存储reward等相关结果''' - # save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH) -# def eval(cfg, saved_model_path = SAVED_MODEL_PATH): -# print('start to eval ! 
\n') -# env = NormalizedActions(gym.make("Pendulum-v0")) -# n_states = env.observation_space.shape[0] -# n_actions = env.action_space.shape[0] -# agent = DDPG(n_states, n_actions, critic_lr=1e-3, -# actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128) -# agent.load_model(saved_model_path+'checkpoint.pth') -# rewards = [] -# moving_average_rewards = [] -# ep_steps = [] -# log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE -# writer = SummaryWriter(log_dir) -# for i_episode in range(1, cfg.eval_eps+1): -# state = env.reset() # reset环境状态 -# ep_reward = 0 -# for i_step in range(1, cfg.eval_steps+1): -# action = agent.choose_action(state) # 根据当前环境state选择action -# next_state, reward, done, _ = env.step(action) # 更新环境参数 -# ep_reward += reward -# state = next_state # 跳转到下一个状态 -# if done: -# break -# print('Episode:', i_episode, ' Reward: %i' % -# int(ep_reward), 'n_steps:', i_step, 'done: ', done) -# ep_steps.append(i_step) -# rewards.append(ep_reward) -# # 计算滑动窗口的reward -# if i_episode == 1: -# moving_average_rewards.append(ep_reward) -# else: -# moving_average_rewards.append( -# 0.9*moving_average_rewards[-1]+0.1*ep_reward) -# writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) -# writer.add_scalar('steps_of_each_episode', -# ep_steps[-1], i_episode) -# writer.close() -# '''存储reward等相关结果''' -# if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 -# os.mkdir(RESULT_PATH) -# np.save(RESULT_PATH+'rewards_eval.npy', rewards) -# np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards) -# np.save(RESULT_PATH+'steps_eval.npy', ep_steps) + if __name__ == "__main__": - cfg = get_args() - train(cfg) - - # cfg = get_args() - # if cfg.train: - # train(cfg) - # eval(cfg) - # else: - # model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/" - # eval(cfg,saved_model_path=model_path) + cfg = A2CConfig() + env = gym.make('CartPole-v0') + env.seed(1) # set random seed for env + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + agent = A2C(n_states, n_actions, cfg) + train(cfg,env,agent) + diff --git a/codes/A2C/model.py b/codes/A2C/model.py index 0d68901..0ceba5e 100644 --- a/codes/A2C/model.py +++ b/codes/A2C/model.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-03 20:45:25 LastEditor: John -LastEditTime: 2020-11-07 18:49:09 +LastEditTime: 2021-03-20 17:41:33 Discription: Environment: ''' @@ -13,7 +13,7 @@ import torch.nn as nn from torch.distributions import Categorical class ActorCritic(nn.Module): - def __init__(self, n_states, n_actions, hidden_dim=256, std=0.0): + def __init__(self, n_states, n_actions, hidden_dim=256): super(ActorCritic, self).__init__() self.critic = nn.Sequential( nn.Linear(n_states, hidden_dim), @@ -30,6 +30,7 @@ class ActorCritic(nn.Module): def forward(self, x): value = self.critic(x) + print(x) probs = self.actor(x) dist = Categorical(probs) return dist, value \ No newline at end of file diff --git a/codes/A2C/common/multiprocessing_env.py b/codes/A2C/multiprocessing_env.py similarity index 100% rename from codes/A2C/common/multiprocessing_env.py rename to codes/A2C/multiprocessing_env.py diff --git a/codes/A2C/test.py b/codes/A2C/test.py new file mode 100644 index 0000000..fd124ff --- /dev/null +++ b/codes/A2C/test.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +Author: John +Email: johnjim0816@gmail.com +Date: 2021-03-20 17:43:17 +LastEditor: John +LastEditTime: 
2021-03-20 19:36:24 +Discription: +Environment: +''' +import sys +import torch +import gym +import numpy as np +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +from torch.autograd import Variable +import matplotlib.pyplot as plt +import pandas as pd + + +learning_rate = 3e-4 + +# Constants +GAMMA = 0.99 + +class A2CConfig: + ''' hyperparameters + ''' + def __init__(self): + self.gamma = 0.99 + self.lr = 3e-4 # learnning rate + self.actor_lr = 1e-4 # learnning rate of actor network + self.memory_capacity = 10000 # capacity of replay memory + self.batch_size = 128 + self.train_eps = 3000 + self.train_steps = 200 + self.eval_eps = 200 + self.eval_steps = 200 + self.target_update = 4 + self.hidden_dim=256 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +class ActorCritic(nn.Module): + def __init__(self, n_states, n_actions, hidden_dim, learning_rate=3e-4): + super(ActorCritic, self).__init__() + + self.n_actions = n_actions + self.critic_linear1 = nn.Linear(n_states, hidden_dim) + self.critic_linear2 = nn.Linear(hidden_dim, 1) + + self.actor_linear1 = nn.Linear(n_states, hidden_dim) + self.actor_linear2 = nn.Linear(hidden_dim, n_actions) + + def forward(self, state): + state = Variable(torch.from_numpy(state).float().unsqueeze(0)) + value = F.relu(self.critic_linear1(state)) + value = self.critic_linear2(value) + policy_dist = F.relu(self.actor_linear1(state)) + policy_dist = F.softmax(self.actor_linear2(policy_dist), dim=1) + + return value, policy_dist + +class A2C: + def __init__(self,n_states,n_actions,cfg): + self.model = ActorCritic(n_states, n_actions, cfg.hidden_dim) + self.optimizer = optim.Adam(self.model.parameters(), lr=cfg.lr) + def choose_action(self,state): + pass + def update(self): + pass + +def train(cfg,env,agent): + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + actor_critic = ActorCritic(n_states, n_actions, hidden_dim) + ac_optimizer = optim.Adam(actor_critic.parameters(), lr=learning_rate) + + all_lengths = [] + average_lengths = [] + all_rewards = [] + entropy_term = 0 + + for episode in range(cfg.train_eps): + log_probs = [] + values = [] + rewards = [] + state = env.reset() + for steps in range(cfg.train_steps): + value, policy_dist = actor_critic.forward(state) + value = value.detach().numpy()[0,0] + dist = policy_dist.detach().numpy() + + action = np.random.choice(n_actions, p=np.squeeze(dist)) + log_prob = torch.log(policy_dist.squeeze(0)[action]) + entropy = -np.sum(np.mean(dist) * np.log(dist)) + new_state, reward, done, _ = env.step(action) + + rewards.append(reward) + values.append(value) + log_probs.append(log_prob) + entropy_term += entropy + state = new_state + + if done or steps == cfg.train_steps-1: + Qval, _ = actor_critic.forward(new_state) + Qval = Qval.detach().numpy()[0,0] + all_rewards.append(np.sum(rewards)) + all_lengths.append(steps) + average_lengths.append(np.mean(all_lengths[-10:])) + if episode % 10 == 0: + sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1])) + break + + # compute Q values + Qvals = np.zeros_like(values) + for t in reversed(range(len(rewards))): + Qval = rewards[t] + GAMMA * Qval + Qvals[t] = Qval + + #update actor critic + values = torch.FloatTensor(values) + Qvals = torch.FloatTensor(Qvals) + log_probs = torch.stack(log_probs) + + advantage = Qvals - values + actor_loss = (-log_probs * advantage).mean() + critic_loss = 0.5 * 
advantage.pow(2).mean() + ac_loss = actor_loss + critic_loss + 0.001 * entropy_term + + ac_optimizer.zero_grad() + ac_loss.backward() + ac_optimizer.step() + + + + # Plot results + smoothed_rewards = pd.Series.rolling(pd.Series(all_rewards), 10).mean() + smoothed_rewards = [elem for elem in smoothed_rewards] + plt.plot(all_rewards) + plt.plot(smoothed_rewards) + plt.plot() + plt.xlabel('Episode') + plt.ylabel('Reward') + plt.show() + + plt.plot(all_lengths) + plt.plot(average_lengths) + plt.xlabel('Episode') + plt.ylabel('Episode length') + plt.show() + +if __name__ == "__main__": + cfg = A2CConfig + env = gym.make("CartPole-v0") + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + agent = A2C(n_states,n_actions,cfg) + train(cfg,env,agent) \ No newline at end of file diff --git a/codes/A2C/utils.py b/codes/A2C/utils.py index ce89c7c..b6d66c6 100644 --- a/codes/A2C/utils.py +++ b/codes/A2C/utils.py @@ -15,7 +15,7 @@ import datetime SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/' +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' def save_results(rewards,moving_average_rewards,ep_steps,path=RESULT_PATH): diff --git a/codes/DQN/main.py b/codes/DQN/main.py index 437ddcc..dae9c86 100644 --- a/codes/DQN/main.py +++ b/codes/DQN/main.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-03-13 14:56:50 +LastEditTime: 2021-03-17 20:35:37 @Discription: @Environment: python 3.7.7 ''' @@ -68,7 +68,7 @@ def train(cfg,env,agent): # 更新target network,复制DQN中的所有weights and biases if i_episode % cfg.target_update == 0: agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step,done)) + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done)) ep_steps.append(i_step) rewards.append(ep_reward) # 计算滑动窗口的reward diff --git a/codes/DoubleDQN/README.md b/codes/DoubleDQN/README.md deleted file mode 100644 index dcfea92..0000000 --- a/codes/DoubleDQN/README.md +++ /dev/null @@ -1,33 +0,0 @@ -## 思路 - -见[博客](https://blog.csdn.net/JohnJim0/article/details/111552545) - -## 环境 - -python 3.7.9 - -pytorch 1.6.0 - -tensorboard 2.3.0 - -torchvision 0.7.0 - -## 使用 - - -train: - -```python -python main.py -``` - -eval: - -```python -python main.py --train 0 -``` -可视化 - -```python -tensorboard --logdir logs -``` \ No newline at end of file diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/agent.py index b77e9c1..1f9c7c1 100644 --- a/codes/DoubleDQN/agent.py +++ b/codes/DoubleDQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2020-12-22 16:20:35 +LastEditTime: 2021-03-13 15:01:27 @Discription: @Environment: python 3.7.7 ''' @@ -20,65 +20,51 @@ import torch.nn.functional as F import random import math import numpy as np -from memory import ReplayBuffer -from model import FCN -class DQN: - def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01, batch_size=128, device="cpu"): - self.actions_count = 0 +from common.memory import ReplayBuffer +from common.model import MLP2 +class DoubleDQN: + def __init__(self, 
n_states, n_actions, cfg): + self.n_actions = n_actions # 总的动作个数 - self.device = device # 设备,cpu或gpu等 - self.gamma = gamma + self.device = cfg.device # 设备,cpu或gpu等 + self.gamma = cfg.gamma # e-greedy策略相关参数 - self.epsilon = 0 - self.epsilon_start = epsilon_start - self.epsilon_end = epsilon_end - self.epsilon_decay = epsilon_decay - self.batch_size = batch_size - self.policy_net = FCN(n_states, n_actions).to(self.device) - self.target_net = FCN(n_states, n_actions).to(self.device) + self.actions_count = 0 + self.epsilon_start = cfg.epsilon_start + self.epsilon_end = cfg.epsilon_end + self.epsilon_decay = cfg.epsilon_decay + self.batch_size = cfg.batch_size + self.policy_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) + self.target_net = MLP2(n_states, n_actions,hidden_dim=cfg.hidden_dim).to(self.device) # target_net的初始模型参数完全复制policy_net self.target_net.load_state_dict(self.policy_net.state_dict()) self.target_net.eval() # 不启用 BatchNormalization 和 Dropout # 可查parameters()与state_dict()的区别,前者require_grad=True - self.optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr) + self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) self.loss = 0 - self.memory = ReplayBuffer(memory_capacity) + self.memory = ReplayBuffer(cfg.memory_capacity) - def choose_action(self, state, train=True): + def choose_action(self, state): '''选择动作 ''' - if train: - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. * self.actions_count / self.epsilon_decay) - self.actions_count += 1 - if random.random() > self.epsilon: - with torch.no_grad(): - # 先转为张量便于丢给神经网络,state元素数据原本为float64 - # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 - state = torch.tensor( - [state], device=self.device, dtype=torch.float32) - # 如tensor([[-0.0798, -0.0079]], grad_fn=) - q_value = self.policy_net(state) - # tensor.max(1)返回每行的最大值以及对应的下标, - # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) - # 所以tensor.max(1)[1]返回最大值对应的下标,即action - action = q_value.max(1)[1].item() - else: - action = random.randrange(self.n_actions) - return action - else: + self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ + math.exp(-1. 
* self.actions_count / self.epsilon_decay) + self.actions_count += 1 + if random.random() > self.epsilon: with torch.no_grad(): - # 先转为张量便于丢给神经网络,state元素数据原本为float64 - # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 - state = torch.tensor( - [state], device='cpu', dtype=torch.float32) - # 如tensor([[-0.0798, -0.0079]], grad_fn=) - q_value = self.target_net(state) - # tensor.max(1)返回每行的最大值以及对应的下标, - # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) - # 所以tensor.max(1)[1]返回最大值对应的下标,即action - action = q_value.max(1)[1].item() - return action + # 先转为张量便于丢给神经网络,state元素数据原本为float64 + # 注意state=torch.tensor(state).unsqueeze(0)跟state=torch.tensor([state])等价 + state = torch.tensor( + [state], device=self.device, dtype=torch.float32) + # 如tensor([[-0.0798, -0.0079]], grad_fn=) + q_value = self.policy_net(state) + # tensor.max(1)返回每行的最大值以及对应的下标, + # 如torch.return_types.max(values=tensor([10.3587]),indices=tensor([0])) + # 所以tensor.max(1)[1]返回最大值对应的下标,即action + action = q_value.max(1)[1].item() + else: + action = random.randrange(self.n_actions) + return action def update(self): if len(self.memory) < self.batch_size: @@ -86,8 +72,7 @@ class DQN: # 从memory中随机采样transition state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) - # 转为张量 - # 例如tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]]) + ### 转为张量 ### state_batch = torch.tensor( state_batch, device=self.device, dtype=torch.float) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( @@ -96,6 +81,7 @@ class DQN: reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) next_state_batch = torch.tensor( next_state_batch, device=self.device, dtype=torch.float) + done_batch = torch.tensor(np.float32( done_batch), device=self.device).unsqueeze(1) # 将bool转为float然后转为张量 @@ -112,7 +98,7 @@ class DQN: # 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward q_target = reward_batch + self.gamma * next_q_state_value * (1-done_batch[0]) ''' - '''以下是Double DQNq_target计算方式,与NatureDQN稍有不同''' + '''以下是Double DQN q_target计算方式,与NatureDQN稍有不同''' next_target_values = self.target_net( next_state_batch) # 选出Q(s_t‘, a)对应的action,代入到next_target_values获得target net对应的next_q_value,即Q’(s_t|a=argmax Q(s_t‘, a)) @@ -127,8 +113,8 @@ class DQN: param.grad.data.clamp_(-1, 1) self.optimizer.step() # 更新模型 - def save_model(self,path): - torch.save(self.target_net.state_dict(), path) + def save(self,path): + torch.save(self.target_net.state_dict(), path+'DoubleDQN_checkpoint.pth') - def load_model(self,path): - self.target_net.load_state_dict(torch.load(path)) + def load(self,path): + self.target_net.load_state_dict(torch.load(path+'DoubleDQN_checkpoint.pth')) diff --git a/codes/DoubleDQN/logs/eval/20201222-144524/events.out.tfevents.1608619584.MacBook-Pro.local.35381.3 b/codes/DoubleDQN/logs/eval/20201222-144524/events.out.tfevents.1608619584.MacBook-Pro.local.35381.3 deleted file mode 100644 index 1d2ea32..0000000 Binary files a/codes/DoubleDQN/logs/eval/20201222-144524/events.out.tfevents.1608619584.MacBook-Pro.local.35381.3 and /dev/null differ diff --git a/codes/DoubleDQN/logs/eval/20201222-144524/rewards_moving_average/events.out.tfevents.1608619584.MacBook-Pro.local.35381.5 b/codes/DoubleDQN/logs/eval/20201222-144524/rewards_moving_average/events.out.tfevents.1608619584.MacBook-Pro.local.35381.5 deleted file mode 100644 index 3e37edb..0000000 Binary files 
a/codes/DoubleDQN/logs/eval/20201222-144524/rewards_moving_average/events.out.tfevents.1608619584.MacBook-Pro.local.35381.5 and /dev/null differ diff --git a/codes/DoubleDQN/logs/eval/20201222-144524/rewards_raw/events.out.tfevents.1608619584.MacBook-Pro.local.35381.4 b/codes/DoubleDQN/logs/eval/20201222-144524/rewards_raw/events.out.tfevents.1608619584.MacBook-Pro.local.35381.4 deleted file mode 100644 index a2c43cb..0000000 Binary files a/codes/DoubleDQN/logs/eval/20201222-144524/rewards_raw/events.out.tfevents.1608619584.MacBook-Pro.local.35381.4 and /dev/null differ diff --git a/codes/DoubleDQN/logs/eval/DQN20201015-215937/events.out.tfevents.1602770409.MacBook-Pro.local.21607.3 b/codes/DoubleDQN/logs/eval/DQN20201015-215937/events.out.tfevents.1602770409.MacBook-Pro.local.21607.3 deleted file mode 100644 index 8ceddf5..0000000 Binary files a/codes/DoubleDQN/logs/eval/DQN20201015-215937/events.out.tfevents.1602770409.MacBook-Pro.local.21607.3 and /dev/null differ diff --git a/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770409.MacBook-Pro.local.21607.5 b/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770409.MacBook-Pro.local.21607.5 deleted file mode 100644 index aada812..0000000 Binary files a/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770409.MacBook-Pro.local.21607.5 and /dev/null differ diff --git a/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770409.MacBook-Pro.local.21607.4 b/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770409.MacBook-Pro.local.21607.4 deleted file mode 100644 index ae17517..0000000 Binary files a/codes/DoubleDQN/logs/eval/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770409.MacBook-Pro.local.21607.4 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/20201222-144524/events.out.tfevents.1608619536.MacBook-Pro.local.35381.0 b/codes/DoubleDQN/logs/train/20201222-144524/events.out.tfevents.1608619536.MacBook-Pro.local.35381.0 deleted file mode 100644 index 01f2af1..0000000 Binary files a/codes/DoubleDQN/logs/train/20201222-144524/events.out.tfevents.1608619536.MacBook-Pro.local.35381.0 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/20201222-144524/rewards_moving_average/events.out.tfevents.1608619536.MacBook-Pro.local.35381.2 b/codes/DoubleDQN/logs/train/20201222-144524/rewards_moving_average/events.out.tfevents.1608619536.MacBook-Pro.local.35381.2 deleted file mode 100644 index ddb603f..0000000 Binary files a/codes/DoubleDQN/logs/train/20201222-144524/rewards_moving_average/events.out.tfevents.1608619536.MacBook-Pro.local.35381.2 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/20201222-144524/rewards_raw/events.out.tfevents.1608619536.MacBook-Pro.local.35381.1 b/codes/DoubleDQN/logs/train/20201222-144524/rewards_raw/events.out.tfevents.1608619536.MacBook-Pro.local.35381.1 deleted file mode 100644 index c40643f..0000000 Binary files a/codes/DoubleDQN/logs/train/20201222-144524/rewards_raw/events.out.tfevents.1608619536.MacBook-Pro.local.35381.1 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/DQN20201015-215937/events.out.tfevents.1602770377.MacBook-Pro.local.21607.0 b/codes/DoubleDQN/logs/train/DQN20201015-215937/events.out.tfevents.1602770377.MacBook-Pro.local.21607.0 deleted file mode 100644 index 9a4f3f5..0000000 Binary files 
a/codes/DoubleDQN/logs/train/DQN20201015-215937/events.out.tfevents.1602770377.MacBook-Pro.local.21607.0 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770377.MacBook-Pro.local.21607.2 b/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770377.MacBook-Pro.local.21607.2 deleted file mode 100644 index 8eed693..0000000 Binary files a/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_moving_average/events.out.tfevents.1602770377.MacBook-Pro.local.21607.2 and /dev/null differ diff --git a/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770377.MacBook-Pro.local.21607.1 b/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770377.MacBook-Pro.local.21607.1 deleted file mode 100644 index 4322bc3..0000000 Binary files a/codes/DoubleDQN/logs/train/DQN20201015-215937/rewards_raw/events.out.tfevents.1602770377.MacBook-Pro.local.21607.1 and /dev/null differ diff --git a/codes/DoubleDQN/main.py b/codes/DoubleDQN/main.py index e531b29..88add9a 100644 --- a/codes/DoubleDQN/main.py +++ b/codes/DoubleDQN/main.py @@ -5,37 +5,58 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2020-12-22 15:39:46 +LastEditTime: 2021-03-17 20:11:19 @Discription: @Environment: python 3.7.7 ''' +import sys,os +sys.path.append(os.getcwd()) # add current terminal path import gym import torch -from torch.utils.tensorboard import SummaryWriter -import os -from agent import DQN -from params import SEQUENCE,SAVED_MODEL_PATH,RESULT_PATH -from params import get_args -from utils import save_results +import datetime +from DoubleDQN.agent import DoubleDQN +from common.plot import plot_rewards +from common.utils import save_results -def train(cfg): +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): + os.mkdir(RESULT_PATH) + +class DoubleDQNConfig: + def __init__(self): + self.algo = "Double DQN" # 算法名称 + self.gamma = 0.99 + self.epsilon_start = 0.9 # e-greedy策略的初始epsilon + self.epsilon_end = 0.01 + self.epsilon_decay = 200 + self.lr = 0.01 # 学习率 + self.memory_capacity = 10000 # Replay Memory容量 + self.batch_size = 128 + self.train_eps = 250 # 训练的episode数目 + self.train_steps = 200 # 训练每个episode的最大长度 + self.target_update = 2 # target net的更新频率 + self.eval_eps = 20 # 测试的episode数目 + self.eval_steps = 200 # 测试每个episode的最大长度 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu + self.hidden_dim = 128 # 神经网络隐藏层维度 + + +def train(cfg,env,agent): print('Start to train !') - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 - env.seed(1) # 设置env随机种子 - n_states = env.observation_space.shape[0] - n_actions = env.action_space.n - agent = DQN(n_states=n_states, 
n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start, - epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size) - rewards = [] - moving_average_rewards = [] + rewards,ma_rewards = [],[] ep_steps = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE - writer = SummaryWriter(log_dir) - for i_episode in range(1, cfg.train_eps+1): + for i_episode in range(cfg.train_eps): state = env.reset() # reset环境状态 ep_reward = 0 - for i_step in range(1, cfg.train_steps+1): + for i_step in range(cfg.train_steps): action = agent.choose_action(state) # 根据当前环境state选择action next_state, reward, done, _ = env.step(action) # 更新环境参数 ep_reward += reward @@ -47,80 +68,26 @@ def train(cfg): # 更新target network,复制DQN中的所有weights and biases if i_episode % cfg.target_update == 0: agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:', i_episode, ' Reward: %i' % - int(ep_reward), 'n_steps:', i_step, 'done: ', done,' Explore: %.2f' % agent.epsilon) + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step,done)) ep_steps.append(i_step) rewards.append(ep_reward) # 计算滑动窗口的reward - if i_episode == 1: - moving_average_rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) - writer.add_scalar('steps_of_each_episode', - ep_steps[-1], i_episode) - writer.close() + ma_rewards.append(ep_reward) print('Complete training!') - ''' 保存模型 ''' - if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 - os.mkdir(SAVED_MODEL_PATH) - agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth') - print('model saved!') - '''存储reward等相关结果''' - save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path=RESULT_PATH) + return rewards,ma_rewards - -def eval(cfg, saved_model_path = SAVED_MODEL_PATH): - print('start to eval !') - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu +if __name__ == "__main__": + cfg = DoubleDQNConfig() env = gym.make('CartPole-v0').unwrapped # 可google为什么unwrapped gym,此处一般不需要 env.seed(1) # 设置env随机种子 n_states = env.observation_space.shape[0] n_actions = env.action_space.n - agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start, - epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size) - agent.load_model(saved_model_path+'checkpoint.pth') - rewards = [] - moving_average_rewards = [] - ep_steps = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE - writer = SummaryWriter(log_dir) - for i_episode in range(1, cfg.eval_eps+1): - state = env.reset() # reset环境状态 - ep_reward = 0 - for i_step in range(1, cfg.eval_steps+1): - action = agent.choose_action(state,train=False) # 根据当前环境state选择action - next_state, reward, done, _ = env.step(action) # 更新环境参数 - ep_reward += reward - state = next_state # 跳转到下一个状态 - if done: - break - print('Episode:', i_episode, ' Reward: %i' % - int(ep_reward), 'n_steps:', i_step, 'done: ', done) - - ep_steps.append(i_step) - rewards.append(ep_reward) - # 计算滑动窗口的reward - if i_episode == 1: - 
moving_average_rewards.append(ep_reward) - else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) - writer.add_scalar('steps_of_each_episode', - ep_steps[-1], i_episode) - writer.close() - '''存储reward等相关结果''' - save_results(rewards,moving_average_rewards,ep_steps,tag='eval',result_path=RESULT_PATH) - print('Complete evaling!') - -if __name__ == "__main__": - cfg = get_args() - if cfg.train: - train(cfg) - eval(cfg) - else: - model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/" - eval(cfg,saved_model_path=model_path) + agent = DoubleDQN(n_states,n_actions,cfg) + rewards,ma_rewards = train(cfg,env,agent) + agent.save(path=SAVED_MODEL_PATH) + save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) + plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) diff --git a/codes/DoubleDQN/memory.py b/codes/DoubleDQN/memory.py index 6339754..52394a5 100644 --- a/codes/DoubleDQN/memory.py +++ b/codes/DoubleDQN/memory.py @@ -5,12 +5,11 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-10 15:27:16 @LastEditor: John -LastEditTime: 2020-12-22 12:56:27 +LastEditTime: 2021-01-20 18:58:37 @Discription: @Environment: python 3.7.7 ''' import random -import numpy as np class ReplayBuffer: diff --git a/codes/DoubleDQN/model.py b/codes/DoubleDQN/model.py index a4642d8..282fa83 100644 --- a/codes/DoubleDQN/model.py +++ b/codes/DoubleDQN/model.py @@ -12,13 +12,13 @@ LastEditTime: 2020-08-19 16:55:54 import torch.nn as nn import torch.nn.functional as F -class FCN(nn.Module): +class MLP(nn.Module): def __init__(self, n_states=4, n_actions=18): """ 初始化q网络,为全连接网络 n_states: 输入的feature即环境的state数目 n_actions: 输出的action总个数 """ - super(FCN, self).__init__() + super(MLP, self).__init__() self.fc1 = nn.Linear(n_states, 128) # 输入层 self.fc2 = nn.Linear(128, 128) # 隐藏层 self.fc3 = nn.Linear(128, n_actions) # 输出层 diff --git a/codes/DoubleDQN/params.py b/codes/DoubleDQN/params.py index 46eb499..75b9f24 100644 --- a/codes/DoubleDQN/params.py +++ b/codes/DoubleDQN/params.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-12-22 15:22:17 LastEditor: John -LastEditTime: 2020-12-22 15:26:09 +LastEditTime: 2021-01-21 14:30:38 Discription: Environment: ''' @@ -16,7 +16,10 @@ import argparse ALGO_NAME = 'Double DQN' SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/' +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' + +TRAIN_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE +EVAL_LOG_DIR=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE def get_args(): '''模型参数 diff --git a/codes/DoubleDQN/plot.py b/codes/DoubleDQN/plot.py index a1be9eb..1004285 100644 --- a/codes/DoubleDQN/plot.py +++ b/codes/DoubleDQN/plot.py @@ -24,14 +24,14 @@ def plot(item,ylabel='rewards_train', save_fig = True): plt.ylabel(ylabel) plt.xlabel('episodes') if save_fig: - plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png") + plt.savefig(os.path.dirname(__file__)+"/results/"+ylabel+".png") plt.show() # plt.show() if __name__ == "__main__": - output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/" + output_path = os.path.split(os.path.abspath(__file__))[0]+"/results/" tag = 'train' 
rewards=np.load(output_path+"rewards_"+tag+".npy", ) moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",) diff --git a/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_eval.npy b/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_eval.npy deleted file mode 100644 index d367d86..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_train.npy b/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_train.npy deleted file mode 100644 index 656ebc5..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/20201222-144524/rewards_eval.npy b/codes/DoubleDQN/result/20201222-144524/rewards_eval.npy deleted file mode 100644 index d367d86..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/20201222-144524/rewards_train.npy b/codes/DoubleDQN/result/20201222-144524/rewards_train.npy deleted file mode 100644 index 8eb2ccb..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/20201222-144524/steps_eval.npy b/codes/DoubleDQN/result/20201222-144524/steps_eval.npy deleted file mode 100644 index 83f995e..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/steps_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/20201222-144524/steps_train.npy b/codes/DoubleDQN/result/20201222-144524/steps_train.npy deleted file mode 100644 index 78abdad..0000000 Binary files a/codes/DoubleDQN/result/20201222-144524/steps_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_eval.npy b/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_eval.npy deleted file mode 100644 index 4d9dbaa..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_train.npy b/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_train.npy deleted file mode 100644 index 67c5579..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/rewards_eval.npy b/codes/DoubleDQN/result/DQN20201015-215937/rewards_eval.npy deleted file mode 100644 index b992efa..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/rewards_train.npy b/codes/DoubleDQN/result/DQN20201015-215937/rewards_train.npy deleted file mode 100644 index b4758a9..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/steps_eval.npy b/codes/DoubleDQN/result/DQN20201015-215937/steps_eval.npy deleted file mode 100644 index d10f0eb..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/steps_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/DQN20201015-215937/steps_train.npy b/codes/DoubleDQN/result/DQN20201015-215937/steps_train.npy deleted file mode 100644 index ccc81c7..0000000 Binary files a/codes/DoubleDQN/result/DQN20201015-215937/steps_train.npy and 
/dev/null differ diff --git a/codes/DoubleDQN/result/moving_average_rewards_eval.npy b/codes/DoubleDQN/result/moving_average_rewards_eval.npy deleted file mode 100644 index d367d86..0000000 Binary files a/codes/DoubleDQN/result/moving_average_rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/moving_average_rewards_eval.png b/codes/DoubleDQN/result/moving_average_rewards_eval.png deleted file mode 100644 index f5bea21..0000000 Binary files a/codes/DoubleDQN/result/moving_average_rewards_eval.png and /dev/null differ diff --git a/codes/DoubleDQN/result/moving_average_rewards_train.npy b/codes/DoubleDQN/result/moving_average_rewards_train.npy deleted file mode 100644 index 656ebc5..0000000 Binary files a/codes/DoubleDQN/result/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/moving_average_rewards_train.png b/codes/DoubleDQN/result/moving_average_rewards_train.png deleted file mode 100644 index 1398641..0000000 Binary files a/codes/DoubleDQN/result/moving_average_rewards_train.png and /dev/null differ diff --git a/codes/DoubleDQN/result/rewards_eval.npy b/codes/DoubleDQN/result/rewards_eval.npy deleted file mode 100644 index d367d86..0000000 Binary files a/codes/DoubleDQN/result/rewards_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/rewards_eval.png b/codes/DoubleDQN/result/rewards_eval.png deleted file mode 100644 index b516038..0000000 Binary files a/codes/DoubleDQN/result/rewards_eval.png and /dev/null differ diff --git a/codes/DoubleDQN/result/rewards_train.npy b/codes/DoubleDQN/result/rewards_train.npy deleted file mode 100644 index 8eb2ccb..0000000 Binary files a/codes/DoubleDQN/result/rewards_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/rewards_train.png b/codes/DoubleDQN/result/rewards_train.png deleted file mode 100644 index 4b71dc9..0000000 Binary files a/codes/DoubleDQN/result/rewards_train.png and /dev/null differ diff --git a/codes/DoubleDQN/result/steps_eval.npy b/codes/DoubleDQN/result/steps_eval.npy deleted file mode 100644 index 83f995e..0000000 Binary files a/codes/DoubleDQN/result/steps_eval.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/steps_eval.png b/codes/DoubleDQN/result/steps_eval.png deleted file mode 100644 index 0c6292d..0000000 Binary files a/codes/DoubleDQN/result/steps_eval.png and /dev/null differ diff --git a/codes/DoubleDQN/result/steps_train.npy b/codes/DoubleDQN/result/steps_train.npy deleted file mode 100644 index 78abdad..0000000 Binary files a/codes/DoubleDQN/result/steps_train.npy and /dev/null differ diff --git a/codes/DoubleDQN/result/steps_train.png b/codes/DoubleDQN/result/steps_train.png deleted file mode 100644 index c56a232..0000000 Binary files a/codes/DoubleDQN/result/steps_train.png and /dev/null differ diff --git a/codes/DoubleDQN/results/20210317-010120/ma_rewards_train.npy b/codes/DoubleDQN/results/20210317-010120/ma_rewards_train.npy new file mode 100644 index 0000000..a4e7516 Binary files /dev/null and b/codes/DoubleDQN/results/20210317-010120/ma_rewards_train.npy differ diff --git a/codes/DoubleDQN/results/20210317-010120/rewards_curve_train.png b/codes/DoubleDQN/results/20210317-010120/rewards_curve_train.png new file mode 100644 index 0000000..a776580 Binary files /dev/null and b/codes/DoubleDQN/results/20210317-010120/rewards_curve_train.png differ diff --git a/codes/DoubleDQN/results/20210317-010120/rewards_train.npy b/codes/DoubleDQN/results/20210317-010120/rewards_train.npy new file mode 100644 index 
0000000..c788230 Binary files /dev/null and b/codes/DoubleDQN/results/20210317-010120/rewards_train.npy differ diff --git a/codes/DoubleDQN/saved_model/20201222-144524/checkpoint.pth b/codes/DoubleDQN/saved_model/20201222-144524/checkpoint.pth deleted file mode 100644 index bcfa0a0..0000000 Binary files a/codes/DoubleDQN/saved_model/20201222-144524/checkpoint.pth and /dev/null differ diff --git a/codes/DoubleDQN/saved_model/20210317-010120/DoubleDQN_checkpoint.pth b/codes/DoubleDQN/saved_model/20210317-010120/DoubleDQN_checkpoint.pth new file mode 100644 index 0000000..8a43c12 Binary files /dev/null and b/codes/DoubleDQN/saved_model/20210317-010120/DoubleDQN_checkpoint.pth differ diff --git a/codes/DoubleDQN/saved_model/checkpoint.pth b/codes/DoubleDQN/saved_model/checkpoint.pth deleted file mode 100644 index bcfa0a0..0000000 Binary files a/codes/DoubleDQN/saved_model/checkpoint.pth and /dev/null differ diff --git a/codes/DoubleDQN/utils.py b/codes/DoubleDQN/utils.py index 0c75408..c5f5305 100644 --- a/codes/DoubleDQN/utils.py +++ b/codes/DoubleDQN/utils.py @@ -13,7 +13,7 @@ import os import numpy as np -def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./result'): +def save_results(rewards,moving_average_rewards,ep_steps,tag='train',result_path='./results'): if not os.path.exists(result_path): # 检测是否存在文件夹 os.mkdir(result_path) np.save(result_path+'rewards_'+tag+'.npy', rewards) diff --git a/codes/LICENSE b/codes/LICENSE new file mode 100644 index 0000000..673d927 --- /dev/null +++ b/codes/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 John Jim + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
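Note on the refactor above: `DoubleDQN/agent.py` and `DoubleDQN/main.py` now build their Q-networks from `common.model.MLP2` and read hyperparameters from a `DoubleDQNConfig` object, but the shared `common/` package is not part of this diff. Below is a minimal sketch of what `MLP2` presumably looks like, inferred from the call `MLP2(n_states, n_actions, hidden_dim=cfg.hidden_dim)` and from the single-hidden-layer `MLP` shown in `DoubleDQN/model.py`; the exact two-hidden-layer architecture is an assumption.

```python
# Hedged sketch only: common/model.py is not included in this diff, so this MLP2
# definition is an assumption inferred from how DoubleDQN/agent.py calls it.
import torch.nn as nn
import torch.nn.functional as F

class MLP2(nn.Module):
    def __init__(self, n_states, n_actions, hidden_dim=128):
        super(MLP2, self).__init__()
        self.fc1 = nn.Linear(n_states, hidden_dim)    # input layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
        self.fc3 = nn.Linear(hidden_dim, n_actions)   # output layer: one Q-value per action

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
```

Passing `hidden_dim` through the config object keeps the network width adjustable alongside the other hyperparameters instead of hard-coding it in the model file.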
diff --git a/codes/MonteCarlo/README.md b/codes/MonteCarlo/README.md index 6338644..4f90307 100644 --- a/codes/MonteCarlo/README.md +++ b/codes/MonteCarlo/README.md @@ -2,10 +2,10 @@ ## 环境说明 -见[环境说明](https://github.com/datawhalechina/leedeeprl-notes/blob/master/codes/env_info.md)中的The Racetrack +见[环境说明](https://github.com/JohnJim0816/reinforcement-learning-tutorials/blob/master/env_info.md)中的The Racetrack ## First-Visit MC 介绍 -伪代码: +### 伪代码 ![mc_control_algo](assets/mc_control_algo.png) \ No newline at end of file diff --git a/codes/MonteCarlo/agent.py b/codes/MonteCarlo/agent.py index 8efe36b..1484049 100644 --- a/codes/MonteCarlo/agent.py +++ b/codes/MonteCarlo/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 16:14:34 LastEditor: John -LastEditTime: 2021-03-12 16:15:12 +LastEditTime: 2021-03-17 12:35:06 Discription: Environment: ''' @@ -26,11 +26,13 @@ class FisrtVisitMC: def choose_action(self,state): ''' e-greed policy ''' - best_action = np.argmax(self.Q[state]) - # action = best_action - action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions - action_probs[best_action] += (1.0 - self.epsilon) - action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + if state in self.Q.keys(): + best_action = np.argmax(self.Q[state]) + action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions + action_probs[best_action] += (1.0 - self.epsilon) + action = np.random.choice(np.arange(len(action_probs)), p=action_probs) + else: + action = np.random.randint(0,self.n_actions) return action def update(self,one_ep_transition): # Find all (state, action) pairs we've visited in this one_ep_transition diff --git a/codes/MonteCarlo/assets/action_grid.png b/codes/MonteCarlo/assets/action_grid.png deleted file mode 100644 index 7759f8b..0000000 Binary files a/codes/MonteCarlo/assets/action_grid.png and /dev/null differ diff --git a/codes/MonteCarlo/assets/track_big.png b/codes/MonteCarlo/assets/track_big.png deleted file mode 100644 index f7b3dc1..0000000 Binary files a/codes/MonteCarlo/assets/track_big.png and /dev/null differ diff --git a/codes/MonteCarlo/main.py b/codes/MonteCarlo/main.py index 545c2a4..bdd5ca4 100644 --- a/codes/MonteCarlo/main.py +++ b/codes/MonteCarlo/main.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-11 14:26:44 LastEditor: John -LastEditTime: 2021-03-12 16:15:46 +LastEditTime: 2021-03-17 12:35:36 Discription: Environment: ''' @@ -35,7 +35,7 @@ class MCConfig: def __init__(self): self.epsilon = 0.15 # epsilon: The probability to select a random action . self.gamma = 0.9 # gamma: Gamma discount factor. 
- self.n_episodes = 300 + self.n_episodes = 150 self.n_steps = 2000 def get_mc_args(): @@ -58,8 +58,8 @@ def mc_train(cfg,env,agent): one_ep_transition = [] state = env.reset() ep_reward = 0 - # while True: - for t in range(cfg.n_steps): + while True: + # for t in range(cfg.n_steps): action = agent.choose_action(state) next_state, reward, done = env.step(action) ep_reward+=reward diff --git a/codes/MonteCarlo/results/20210312-161601/ma_rewards_train.npy b/codes/MonteCarlo/results/20210312-161601/ma_rewards_train.npy deleted file mode 100644 index 5734986..0000000 Binary files a/codes/MonteCarlo/results/20210312-161601/ma_rewards_train.npy and /dev/null differ diff --git a/codes/MonteCarlo/results/20210312-161601/rewards_curve_train.png b/codes/MonteCarlo/results/20210312-161601/rewards_curve_train.png deleted file mode 100644 index 7288729..0000000 Binary files a/codes/MonteCarlo/results/20210312-161601/rewards_curve_train.png and /dev/null differ diff --git a/codes/MonteCarlo/results/20210312-161601/rewards_train.npy b/codes/MonteCarlo/results/20210312-161601/rewards_train.npy deleted file mode 100644 index 5b43586..0000000 Binary files a/codes/MonteCarlo/results/20210312-161601/rewards_train.npy and /dev/null differ diff --git a/codes/MonteCarlo/results/20210317-123623/ma_rewards_train.npy b/codes/MonteCarlo/results/20210317-123623/ma_rewards_train.npy new file mode 100644 index 0000000..5b5a10a Binary files /dev/null and b/codes/MonteCarlo/results/20210317-123623/ma_rewards_train.npy differ diff --git a/codes/MonteCarlo/results/20210317-123623/rewards_curve_train.png b/codes/MonteCarlo/results/20210317-123623/rewards_curve_train.png new file mode 100644 index 0000000..08498fe Binary files /dev/null and b/codes/MonteCarlo/results/20210317-123623/rewards_curve_train.png differ diff --git a/codes/MonteCarlo/results/20210317-123623/rewards_train.npy b/codes/MonteCarlo/results/20210317-123623/rewards_train.npy new file mode 100644 index 0000000..1c8d034 Binary files /dev/null and b/codes/MonteCarlo/results/20210317-123623/rewards_train.npy differ diff --git a/codes/PolicyGradient/README.md b/codes/PolicyGradient/README.md index 43891bd..cc6edf3 100644 --- a/codes/PolicyGradient/README.md +++ b/codes/PolicyGradient/README.md @@ -1,38 +1,15 @@ # Policy Gradient 实现的是Policy Gradient最基本的REINFORCE方法 +## 使用说明 +直接运行```main.py```即可 ## 原理讲解 参考我的博客[Policy Gradient算法实战](https://blog.csdn.net/JohnJim0/article/details/110236851) ## 环境 - -python 3.7.9 - -pytorch 1.6.0 - -tensorboard 2.3.0 - -torchvision 0.7.0 - +python 3.7.9、pytorch 1.6.0 ## 程序运行方法 -train: - -```python -python main.py -``` - -eval: - -```python -python main.py --train 0 -``` -tensorboard: -```python -tensorboard --logdir logs -``` - - ## 参考 [REINFORCE和Reparameterization Trick](https://blog.csdn.net/JohnJim0/article/details/110230703) diff --git a/codes/PolicyGradient/agent.py b/codes/PolicyGradient/agent.py index 6adf217..e4c6e7c 100644 --- a/codes/PolicyGradient/agent.py +++ b/codes/PolicyGradient/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-22 23:27:44 LastEditor: John -LastEditTime: 2020-11-23 17:04:37 +LastEditTime: 2021-03-13 11:50:16 Discription: Environment: ''' @@ -14,24 +14,23 @@ from torch.distributions import Bernoulli from torch.autograd import Variable import numpy as np -from model import FCN +from common.model import MLP1 class PolicyGradient: - def __init__(self, state_dim,device='cpu',gamma = 0.99,lr = 0.01,batch_size=5): - self.gamma = gamma - self.policy_net = FCN(state_dim) - 
self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=lr) - self.batch_size = batch_size + def __init__(self, n_states,cfg): + self.gamma = cfg.gamma + self.policy_net = MLP1(n_states,hidden_dim=cfg.hidden_dim) + self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=cfg.lr) + self.batch_size = cfg.batch_size def choose_action(self,state): state = torch.from_numpy(state).float() state = Variable(state) probs = self.policy_net(state) - m = Bernoulli(probs) + m = Bernoulli(probs) # 伯努利分布 action = m.sample() - action = action.data.numpy().astype(int)[0] # 转为标量 return action @@ -67,6 +66,6 @@ class PolicyGradient: loss.backward() self.optimizer.step() def save_model(self,path): - torch.save(self.policy_net.state_dict(), path) + torch.save(self.policy_net.state_dict(), path+'pg_checkpoint.pth') def load_model(self,path): - self.policy_net.load_state_dict(torch.load(path)) \ No newline at end of file + self.policy_net.load_state_dict(torch.load(path+'pg_checkpoint.pth')) \ No newline at end of file diff --git a/codes/PolicyGradient/env.py b/codes/PolicyGradient/env.py deleted file mode 100644 index 0bf59eb..0000000 --- a/codes/PolicyGradient/env.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-11-22 23:23:10 -LastEditor: John -LastEditTime: 2020-11-23 11:55:24 -Discription: -Environment: -''' -import gym - -def env_init(): - env = gym.make('CartPole-v0') # 可google为什么unwrapped gym,此处一般不需要 - env.seed(1) # 设置env随机种子 - state_dim = env.observation_space.shape[0] - n_actions = env.action_space.n - return env,state_dim,n_actions \ No newline at end of file diff --git a/codes/PolicyGradient/logs/eval/20201123-170440/events.out.tfevents.1606122284.MacBook-Pro.local.78801.0 b/codes/PolicyGradient/logs/eval/20201123-170440/events.out.tfevents.1606122284.MacBook-Pro.local.78801.0 deleted file mode 100644 index 2569c0f..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201123-170440/events.out.tfevents.1606122284.MacBook-Pro.local.78801.0 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201123-170440/rewards_moving_average/events.out.tfevents.1606122284.MacBook-Pro.local.78801.2 b/codes/PolicyGradient/logs/eval/20201123-170440/rewards_moving_average/events.out.tfevents.1606122284.MacBook-Pro.local.78801.2 deleted file mode 100644 index 909ca5f..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201123-170440/rewards_moving_average/events.out.tfevents.1606122284.MacBook-Pro.local.78801.2 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201123-170440/rewards_raw/events.out.tfevents.1606122284.MacBook-Pro.local.78801.1 b/codes/PolicyGradient/logs/eval/20201123-170440/rewards_raw/events.out.tfevents.1606122284.MacBook-Pro.local.78801.1 deleted file mode 100644 index 70b7113..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201123-170440/rewards_raw/events.out.tfevents.1606122284.MacBook-Pro.local.78801.1 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201126-191039/events.out.tfevents.1606389059.MacBook-Pro.local.21663.3 b/codes/PolicyGradient/logs/eval/20201126-191039/events.out.tfevents.1606389059.MacBook-Pro.local.21663.3 deleted file mode 100644 index fa9c4e6..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191039/events.out.tfevents.1606389059.MacBook-Pro.local.21663.3 and /dev/null differ diff --git 
a/codes/PolicyGradient/logs/eval/20201126-191039/rewards_moving_average/events.out.tfevents.1606389059.MacBook-Pro.local.21663.5 b/codes/PolicyGradient/logs/eval/20201126-191039/rewards_moving_average/events.out.tfevents.1606389059.MacBook-Pro.local.21663.5 deleted file mode 100644 index f11d33e..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191039/rewards_moving_average/events.out.tfevents.1606389059.MacBook-Pro.local.21663.5 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201126-191039/rewards_raw/events.out.tfevents.1606389059.MacBook-Pro.local.21663.4 b/codes/PolicyGradient/logs/eval/20201126-191039/rewards_raw/events.out.tfevents.1606389059.MacBook-Pro.local.21663.4 deleted file mode 100644 index 1d052ea..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191039/rewards_raw/events.out.tfevents.1606389059.MacBook-Pro.local.21663.4 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201126-191145/events.out.tfevents.1606389139.MacBook-Pro.local.21831.3 b/codes/PolicyGradient/logs/eval/20201126-191145/events.out.tfevents.1606389139.MacBook-Pro.local.21831.3 deleted file mode 100644 index 646540e..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191145/events.out.tfevents.1606389139.MacBook-Pro.local.21831.3 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201126-191145/rewards_moving_average/events.out.tfevents.1606389139.MacBook-Pro.local.21831.5 b/codes/PolicyGradient/logs/eval/20201126-191145/rewards_moving_average/events.out.tfevents.1606389139.MacBook-Pro.local.21831.5 deleted file mode 100644 index b58a943..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191145/rewards_moving_average/events.out.tfevents.1606389139.MacBook-Pro.local.21831.5 and /dev/null differ diff --git a/codes/PolicyGradient/logs/eval/20201126-191145/rewards_raw/events.out.tfevents.1606389139.MacBook-Pro.local.21831.4 b/codes/PolicyGradient/logs/eval/20201126-191145/rewards_raw/events.out.tfevents.1606389139.MacBook-Pro.local.21831.4 deleted file mode 100644 index d7adc36..0000000 Binary files a/codes/PolicyGradient/logs/eval/20201126-191145/rewards_raw/events.out.tfevents.1606389139.MacBook-Pro.local.21831.4 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201123-135302/events.out.tfevents.1606110786.MacBook-Pro.local.75770.0 b/codes/PolicyGradient/logs/train/20201123-135302/events.out.tfevents.1606110786.MacBook-Pro.local.75770.0 deleted file mode 100644 index a7ee08a..0000000 Binary files a/codes/PolicyGradient/logs/train/20201123-135302/events.out.tfevents.1606110786.MacBook-Pro.local.75770.0 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201123-135302/rewards_moving_average/events.out.tfevents.1606110786.MacBook-Pro.local.75770.2 b/codes/PolicyGradient/logs/train/20201123-135302/rewards_moving_average/events.out.tfevents.1606110786.MacBook-Pro.local.75770.2 deleted file mode 100644 index 0043ce2..0000000 Binary files a/codes/PolicyGradient/logs/train/20201123-135302/rewards_moving_average/events.out.tfevents.1606110786.MacBook-Pro.local.75770.2 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201123-135302/rewards_raw/events.out.tfevents.1606110786.MacBook-Pro.local.75770.1 b/codes/PolicyGradient/logs/train/20201123-135302/rewards_raw/events.out.tfevents.1606110786.MacBook-Pro.local.75770.1 deleted file mode 100644 index 41392e8..0000000 Binary files 
a/codes/PolicyGradient/logs/train/20201123-135302/rewards_raw/events.out.tfevents.1606110786.MacBook-Pro.local.75770.1 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191039/events.out.tfevents.1606389044.MacBook-Pro.local.21663.0 b/codes/PolicyGradient/logs/train/20201126-191039/events.out.tfevents.1606389044.MacBook-Pro.local.21663.0 deleted file mode 100644 index 3ececd0..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191039/events.out.tfevents.1606389044.MacBook-Pro.local.21663.0 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191039/rewards_moving_average/events.out.tfevents.1606389044.MacBook-Pro.local.21663.2 b/codes/PolicyGradient/logs/train/20201126-191039/rewards_moving_average/events.out.tfevents.1606389044.MacBook-Pro.local.21663.2 deleted file mode 100644 index 5b10ae8..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191039/rewards_moving_average/events.out.tfevents.1606389044.MacBook-Pro.local.21663.2 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191039/rewards_raw/events.out.tfevents.1606389044.MacBook-Pro.local.21663.1 b/codes/PolicyGradient/logs/train/20201126-191039/rewards_raw/events.out.tfevents.1606389044.MacBook-Pro.local.21663.1 deleted file mode 100644 index a0940a6..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191039/rewards_raw/events.out.tfevents.1606389044.MacBook-Pro.local.21663.1 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191145/events.out.tfevents.1606389110.MacBook-Pro.local.21831.0 b/codes/PolicyGradient/logs/train/20201126-191145/events.out.tfevents.1606389110.MacBook-Pro.local.21831.0 deleted file mode 100644 index 5ffc5f9..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191145/events.out.tfevents.1606389110.MacBook-Pro.local.21831.0 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191145/rewards_moving_average/events.out.tfevents.1606389110.MacBook-Pro.local.21831.2 b/codes/PolicyGradient/logs/train/20201126-191145/rewards_moving_average/events.out.tfevents.1606389110.MacBook-Pro.local.21831.2 deleted file mode 100644 index 933661b..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191145/rewards_moving_average/events.out.tfevents.1606389110.MacBook-Pro.local.21831.2 and /dev/null differ diff --git a/codes/PolicyGradient/logs/train/20201126-191145/rewards_raw/events.out.tfevents.1606389110.MacBook-Pro.local.21831.1 b/codes/PolicyGradient/logs/train/20201126-191145/rewards_raw/events.out.tfevents.1606389110.MacBook-Pro.local.21831.1 deleted file mode 100644 index 0cfb773..0000000 Binary files a/codes/PolicyGradient/logs/train/20201126-191145/rewards_raw/events.out.tfevents.1606389110.MacBook-Pro.local.21831.1 and /dev/null differ diff --git a/codes/PolicyGradient/main.py b/codes/PolicyGradient/main.py index 6d8bc93..19a0484 100644 --- a/codes/PolicyGradient/main.py +++ b/codes/PolicyGradient/main.py @@ -5,34 +5,47 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-11-22 23:21:53 LastEditor: John -LastEditTime: 2020-11-24 19:52:40 +LastEditTime: 2021-03-13 11:50:32 Discription: Environment: ''' +import sys,os +sys.path.append(os.getcwd()) # 添加当前终端路径 from itertools import count -import torch -import os -from torch.utils.tensorboard import SummaryWriter +import datetime +import gym +from PolicyGradient.agent import PolicyGradient +from common.plot import plot_rewards +from common.utils import save_results -from env import env_init 
-from params import get_args -from agent import PolicyGradient -from params import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH -from utils import save_results,save_model -from plot import plot -def train(cfg): - env,state_dim,n_actions = env_init() - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr) +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹 + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹 + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 + os.mkdir(RESULT_PATH) + +class PGConfig: + def __init__(self): + self.train_eps = 300 # 训练的episode数目 + self.batch_size = 8 + self.lr = 0.01 # 学习率 + self.gamma = 0.99 + self.hidden_dim = 36 # 隐藏层维度 + +def train(cfg,env,agent): '''下面带pool都是存放的transition序列用于gradient''' state_pool = [] # 存放每batch_size个episode的state序列 action_pool = [] reward_pool = [] ''' 存储每个episode的reward用于绘图''' rewards = [] - moving_average_rewards = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE - writer = SummaryWriter(log_dir) # 使用tensorboard的writer + ma_rewards = [] for i_episode in range(cfg.train_eps): state = env.reset() ep_reward = 0 @@ -55,55 +68,22 @@ def train(cfg): action_pool = [] reward_pool = [] rewards.append(ep_reward) - if i_episode == 0: - moving_average_rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append( + 0.9*ma_rewards[-1]+0.1*ep_reward) else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode+1) - writer.close() - print('Complete training!') - save_model(agent,model_path=SAVED_MODEL_PATH) - '''存储reward等相关结果''' - save_results(rewards,moving_average_rewards,tag='train',result_path=RESULT_PATH) - plot(rewards) - plot(moving_average_rewards,ylabel='moving_average_rewards_train') - -def eval(cfg,saved_model_path = SAVED_MODEL_PATH): - env,state_dim,n_actions = env_init() - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测gpu - agent = PolicyGradient(state_dim,device = device,lr = cfg.policy_lr) - agent.load_model(saved_model_path+'checkpoint.pth') - rewards = [] - moving_average_rewards = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE - writer = SummaryWriter(log_dir) # 使用tensorboard的writer - for i_episode in range(cfg.eval_eps): - state = env.reset() - ep_reward = 0 - for _ in count(): - action = agent.choose_action(state) # 根据当前环境state选择action - next_state, reward, done, _ = env.step(action) - ep_reward += reward - state = next_state - if done: - print('Episode:', i_episode, ' Reward:', ep_reward) - break - rewards.append(ep_reward) - if i_episode == 0: - moving_average_rewards.append(ep_reward) - else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': 
moving_average_rewards[-1]}, i_episode+1) - writer.close() - print('Complete evaling!') - + ma_rewards.append(ep_reward) + print('complete training!') + return rewards, ma_rewards + if __name__ == "__main__": - cfg = get_args() - if cfg.train: - train(cfg) - eval(cfg) - else: - model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/" - eval(cfg,saved_model_path=model_path) + cfg = PGConfig() + env = gym.make('CartPole-v0') # 可google为什么unwrapped gym,此处一般不需要 + env.seed(1) # 设置env随机种子 + n_states = env.observation_space.shape[0] + n_actions = env.action_space.n + agent = PolicyGradient(n_states,cfg) + rewards, ma_rewards = train(cfg,env,agent) + agent.save_model(SAVED_MODEL_PATH) + save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) + plot_rewards(rewards,ma_rewards,tag="train",algo = "Policy Gradient",path=RESULT_PATH) diff --git a/codes/PolicyGradient/model.py b/codes/PolicyGradient/model.py deleted file mode 100644 index ce8b4d2..0000000 --- a/codes/PolicyGradient/model.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-11-22 23:18:46 -LastEditor: John -LastEditTime: 2020-11-27 16:55:25 -Discription: -Environment: -''' -import torch.nn as nn -import torch.nn.functional as F -class FCN(nn.Module): - ''' 全连接网络''' - def __init__(self,state_dim): - super(FCN, self).__init__() - # 24和36为hidden layer的层数,可根据state_dim, n_actions的情况来改变 - self.fc1 = nn.Linear(state_dim, 36) - self.fc2 = nn.Linear(36, 36) - self.fc3 = nn.Linear(36, 1) # Prob of Left - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = F.sigmoid(self.fc3(x)) - return x \ No newline at end of file diff --git a/codes/PolicyGradient/params.py b/codes/PolicyGradient/params.py deleted file mode 100644 index 3f34b20..0000000 --- a/codes/PolicyGradient/params.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-11-22 23:25:37 -LastEditor: John -LastEditTime: 2020-11-26 19:11:21 -Discription: 存储参数 -Environment: -''' -import argparse -import datetime -import os - -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") -SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/' - -def get_args(): - '''训练参数''' - parser = argparse.ArgumentParser() - parser.add_argument("--train", default=1, type=int) # 1 表示训练,0表示只进行eval - parser.add_argument("--train_eps", default=300, type=int) # 训练的最大episode数目 - parser.add_argument("--eval_eps", default=100, type=int) # 训练的最大episode数目 - parser.add_argument("--batch_size", default=4, type=int) # 用于gradient的episode数目 - parser.add_argument("--policy_lr", default=0.01, type=float) # 学习率 - config = parser.parse_args() - return config \ No newline at end of file diff --git a/codes/PolicyGradient/plot.py b/codes/PolicyGradient/plot.py deleted file mode 100644 index 2a9a65e..0000000 --- a/codes/PolicyGradient/plot.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-11-23 13:48:46 -LastEditor: John -LastEditTime: 2020-11-23 13:48:48 -Discription: -Environment: -''' -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -import os - -def plot(item,ylabel='rewards_train', save_fig = True): - '''plot using searborn to plot - ''' - sns.set() - plt.figure() - 
plt.plot(np.arange(len(item)), item) - plt.title(ylabel+' of DQN') - plt.ylabel(ylabel) - plt.xlabel('episodes') - if save_fig: - plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png") - plt.show() - -if __name__ == "__main__": - - output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/" - tag = 'train' - rewards=np.load(output_path+"rewards_"+tag+".npy", ) - moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",) - steps=np.load(output_path+"steps_"+tag+".npy") - plot(rewards) - plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag) - plot(steps,ylabel='steps_'+tag) - tag = 'eval' - rewards=np.load(output_path+"rewards_"+tag+".npy", ) - moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",) - steps=np.load(output_path+"steps_"+tag+".npy") - plot(rewards,ylabel='rewards_'+tag) - plot(moving_average_rewards,ylabel='moving_average_rewards_'+tag) - plot(steps,ylabel='steps_'+tag) \ No newline at end of file diff --git a/codes/PolicyGradient/result/20201123-135302/moving_average_rewards_train.npy b/codes/PolicyGradient/result/20201123-135302/moving_average_rewards_train.npy deleted file mode 100644 index 430dd9e..0000000 Binary files a/codes/PolicyGradient/result/20201123-135302/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/20201123-135302/rewards_train.npy b/codes/PolicyGradient/result/20201123-135302/rewards_train.npy deleted file mode 100644 index 1916910..0000000 Binary files a/codes/PolicyGradient/result/20201123-135302/rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/20201126-191039/moving_average_rewards_train.npy b/codes/PolicyGradient/result/20201126-191039/moving_average_rewards_train.npy deleted file mode 100644 index 661b070..0000000 Binary files a/codes/PolicyGradient/result/20201126-191039/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/20201126-191039/rewards_train.npy b/codes/PolicyGradient/result/20201126-191039/rewards_train.npy deleted file mode 100644 index 96f9738..0000000 Binary files a/codes/PolicyGradient/result/20201126-191039/rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/20201126-191145/moving_average_rewards_train.npy b/codes/PolicyGradient/result/20201126-191145/moving_average_rewards_train.npy deleted file mode 100644 index 784889c..0000000 Binary files a/codes/PolicyGradient/result/20201126-191145/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/20201126-191145/rewards_train.npy b/codes/PolicyGradient/result/20201126-191145/rewards_train.npy deleted file mode 100644 index 999029d..0000000 Binary files a/codes/PolicyGradient/result/20201126-191145/rewards_train.npy and /dev/null differ diff --git a/codes/PolicyGradient/result/moving_average_rewards_train.png b/codes/PolicyGradient/result/moving_average_rewards_train.png deleted file mode 100644 index b531cda..0000000 Binary files a/codes/PolicyGradient/result/moving_average_rewards_train.png and /dev/null differ diff --git a/codes/PolicyGradient/result/rewards_train.png b/codes/PolicyGradient/result/rewards_train.png deleted file mode 100644 index 007232d..0000000 Binary files a/codes/PolicyGradient/result/rewards_train.png and /dev/null differ diff --git a/codes/PolicyGradient/results/20210313-114904/ma_rewards_train.npy b/codes/PolicyGradient/results/20210313-114904/ma_rewards_train.npy new file mode 100644 index 
0000000..cd5c266 Binary files /dev/null and b/codes/PolicyGradient/results/20210313-114904/ma_rewards_train.npy differ diff --git a/codes/PolicyGradient/results/20210313-114904/rewards_curve_train.png b/codes/PolicyGradient/results/20210313-114904/rewards_curve_train.png new file mode 100644 index 0000000..6786b02 Binary files /dev/null and b/codes/PolicyGradient/results/20210313-114904/rewards_curve_train.png differ diff --git a/codes/PolicyGradient/results/20210313-114904/rewards_train.npy b/codes/PolicyGradient/results/20210313-114904/rewards_train.npy new file mode 100644 index 0000000..710328a Binary files /dev/null and b/codes/PolicyGradient/results/20210313-114904/rewards_train.npy differ diff --git a/codes/PolicyGradient/saved_model/20201123-135302/checkpoint.pth b/codes/PolicyGradient/saved_model/20201123-135302/checkpoint.pth deleted file mode 100644 index 1f13387..0000000 Binary files a/codes/PolicyGradient/saved_model/20201123-135302/checkpoint.pth and /dev/null differ diff --git a/codes/PolicyGradient/saved_model/20201126-191039/checkpoint.pth b/codes/PolicyGradient/saved_model/20201126-191039/checkpoint.pth deleted file mode 100644 index 12bfcce..0000000 Binary files a/codes/PolicyGradient/saved_model/20201126-191039/checkpoint.pth and /dev/null differ diff --git a/codes/PolicyGradient/saved_model/20201126-191145/checkpoint.pth b/codes/PolicyGradient/saved_model/20201126-191145/checkpoint.pth deleted file mode 100644 index 0f7ced9..0000000 Binary files a/codes/PolicyGradient/saved_model/20201126-191145/checkpoint.pth and /dev/null differ diff --git a/codes/PolicyGradient/saved_model/20210313-114904/pg_checkpoint.pth b/codes/PolicyGradient/saved_model/20210313-114904/pg_checkpoint.pth new file mode 100644 index 0000000..7f5a671 Binary files /dev/null and b/codes/PolicyGradient/saved_model/20210313-114904/pg_checkpoint.pth differ diff --git a/codes/PolicyGradient/saved_model/checkpoint.pth b/codes/PolicyGradient/saved_model/checkpoint.pth deleted file mode 100644 index 1f13387..0000000 Binary files a/codes/PolicyGradient/saved_model/checkpoint.pth and /dev/null differ diff --git a/codes/PolicyGradient/utils.py b/codes/PolicyGradient/utils.py deleted file mode 100644 index 887ca25..0000000 --- a/codes/PolicyGradient/utils.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-11-23 13:44:52 -LastEditor: John -LastEditTime: 2020-11-23 13:45:42 -Discription: -Environment: -''' -import os -import numpy as np - - -def save_results(rewards,moving_average_rewards,tag='train',result_path='./result'): - '''保存reward等结果 - ''' - if not os.path.exists(result_path): # 检测是否存在文件夹 - os.mkdir(result_path) - np.save(result_path+'rewards_'+tag+'.npy', rewards) - np.save(result_path+'moving_average_rewards_'+tag+'.npy', moving_average_rewards) - print('results saved!') - -def save_model(agent,model_path='./saved_model'): - if not os.path.exists(model_path): # 检测是否存在文件夹 - os.mkdir(model_path) - agent.save_model(model_path+'checkpoint.pth') - print('model saved!') \ No newline at end of file diff --git a/codes/QLearning/main.py b/codes/QLearning/main.py index 6fefb8a..27a0934 100644 --- a/codes/QLearning/main.py +++ b/codes/QLearning/main.py @@ -5,14 +5,13 @@ Author: John Email: johnjim0816@gmail.com Date: 2020-09-11 23:03:00 LastEditor: John -LastEditTime: 2021-03-12 16:52:26 +LastEditTime: 2021-03-12 21:16:50 Discription: Environment: ''' import sys,os sys.path.append(os.getcwd()) # 添加当前终端路径 -import argparse import gym 
import datetime @@ -108,7 +107,6 @@ if __name__ == "__main__": agent = QLearning(n_actions,cfg) rewards,ma_rewards = train(cfg,env,agent) agent.save(path=SAVED_MODEL_PATH) - # eval(cfg,env,agent) save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) plot_rewards(rewards,ma_rewards,tag="train",algo = "On-Policy First-Visit MC Control",path=RESULT_PATH) diff --git a/codes/QLearning/results/20210312-165244/ma_rewards_train.npy b/codes/QLearning/results/20210312-165244/ma_rewards_train.npy deleted file mode 100644 index 489cfb4..0000000 Binary files a/codes/QLearning/results/20210312-165244/ma_rewards_train.npy and /dev/null differ diff --git a/codes/QLearning/results/20210312-165244/rewards_curve_train.png b/codes/QLearning/results/20210312-165244/rewards_curve_train.png deleted file mode 100644 index 3353dee..0000000 Binary files a/codes/QLearning/results/20210312-165244/rewards_curve_train.png and /dev/null differ diff --git a/codes/QLearning/results/20210312-165244/rewards_train.npy b/codes/QLearning/results/20210312-165244/rewards_train.npy deleted file mode 100644 index 91008fd..0000000 Binary files a/codes/QLearning/results/20210312-165244/rewards_train.npy and /dev/null differ diff --git a/codes/QLearning/results/20210313-110213/ma_rewards_train.npy b/codes/QLearning/results/20210313-110213/ma_rewards_train.npy new file mode 100644 index 0000000..4f05a73 Binary files /dev/null and b/codes/QLearning/results/20210313-110213/ma_rewards_train.npy differ diff --git a/codes/QLearning/results/20210313-110213/rewards_curve_train.png b/codes/QLearning/results/20210313-110213/rewards_curve_train.png new file mode 100644 index 0000000..d6bbc01 Binary files /dev/null and b/codes/QLearning/results/20210313-110213/rewards_curve_train.png differ diff --git a/codes/QLearning/results/20210313-110213/rewards_train.npy b/codes/QLearning/results/20210313-110213/rewards_train.npy new file mode 100644 index 0000000..f1e8ba9 Binary files /dev/null and b/codes/QLearning/results/20210313-110213/rewards_train.npy differ diff --git a/codes/QLearning/saved_model/20210312-165244/Qleaning_model.pkl b/codes/QLearning/saved_model/20210312-165244/Qleaning_model.pkl deleted file mode 100644 index c70d88f..0000000 Binary files a/codes/QLearning/saved_model/20210312-165244/Qleaning_model.pkl and /dev/null differ diff --git a/codes/QLearning/saved_model/20210313-110213/Qleaning_model.pkl b/codes/QLearning/saved_model/20210313-110213/Qleaning_model.pkl new file mode 100644 index 0000000..9f71ab0 Binary files /dev/null and b/codes/QLearning/saved_model/20210313-110213/Qleaning_model.pkl differ diff --git a/codes/Sarsa/agent.py b/codes/Sarsa/agent.py index 9b8f443..3753381 100644 --- a/codes/Sarsa/agent.py +++ b/codes/Sarsa/agent.py @@ -5,7 +5,7 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 16:58:16 LastEditor: John -LastEditTime: 2021-03-12 17:03:05 +LastEditTime: 2021-03-13 11:02:50 Discription: Environment: ''' @@ -42,11 +42,11 @@ class Sarsa(object): import dill torch.save( obj=self.Q, - f=path+"Sarsa_model.pkl", + f=path+"sarsa_model.pkl", pickle_module=dill ) def load(self, path): '''从文件中读取数据到 Q表格 ''' import dill - self.Q =torch.load(f=path+'Sarsa_model.pkl',pickle_module=dill) \ No newline at end of file + self.Q =torch.load(f=path+'sarsa_model.pkl',pickle_module=dill) \ No newline at end of file diff --git a/codes/Sarsa/results/20210312-170254/ma_rewards_train.npy b/codes/Sarsa/results/20210312-170254/ma_rewards_train.npy deleted file mode 100644 index e48eb0f..0000000 Binary files 
a/codes/Sarsa/results/20210312-170254/ma_rewards_train.npy and /dev/null differ diff --git a/codes/Sarsa/results/20210312-170254/rewards_curve_train.png b/codes/Sarsa/results/20210312-170254/rewards_curve_train.png deleted file mode 100644 index 9423936..0000000 Binary files a/codes/Sarsa/results/20210312-170254/rewards_curve_train.png and /dev/null differ diff --git a/codes/Sarsa/results/20210312-170254/rewards_train.npy b/codes/Sarsa/results/20210312-170254/rewards_train.npy deleted file mode 100644 index cf36d6a..0000000 Binary files a/codes/Sarsa/results/20210312-170254/rewards_train.npy and /dev/null differ diff --git a/codes/Sarsa/results/20210313-110256/ma_rewards_train.npy b/codes/Sarsa/results/20210313-110256/ma_rewards_train.npy new file mode 100644 index 0000000..943a6d4 Binary files /dev/null and b/codes/Sarsa/results/20210313-110256/ma_rewards_train.npy differ diff --git a/codes/Sarsa/results/20210313-110256/rewards_curve_train.png b/codes/Sarsa/results/20210313-110256/rewards_curve_train.png new file mode 100644 index 0000000..ea31886 Binary files /dev/null and b/codes/Sarsa/results/20210313-110256/rewards_curve_train.png differ diff --git a/codes/Sarsa/results/20210313-110256/rewards_train.npy b/codes/Sarsa/results/20210313-110256/rewards_train.npy new file mode 100644 index 0000000..d0702e8 Binary files /dev/null and b/codes/Sarsa/results/20210313-110256/rewards_train.npy differ diff --git a/codes/Sarsa/saved_model/20210312-170254/Sarsa_model.pkl b/codes/Sarsa/saved_model/20210312-170254/Sarsa_model.pkl deleted file mode 100644 index 28efcf4..0000000 Binary files a/codes/Sarsa/saved_model/20210312-170254/Sarsa_model.pkl and /dev/null differ diff --git a/codes/Sarsa/saved_model/20210313-110256/sarsa_model.pkl b/codes/Sarsa/saved_model/20210313-110256/sarsa_model.pkl new file mode 100644 index 0000000..d19971c Binary files /dev/null and b/codes/Sarsa/saved_model/20210313-110256/sarsa_model.pkl differ diff --git a/codes/TD3/main.py b/codes/TD3/main.py new file mode 100644 index 0000000..304b509 --- /dev/null +++ b/codes/TD3/main.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python +# coding=utf-8 +''' +@Author: John +@Email: johnjim0816@gmail.com +@Date: 2020-06-11 23:38:13 +@LastEditor: John +@LastEditTime: 2020-06-11 23:38:31 +@Discription: +@Environment: python 3.7.7 +''' +import torch +if __name__ == "__main__": + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") \ No newline at end of file diff --git a/codes/ddpg/memory.py b/codes/TD3/memory.py similarity index 77% rename from codes/ddpg/memory.py rename to codes/TD3/memory.py index 34d31b7..1feb2c4 100644 --- a/codes/ddpg/memory.py +++ b/codes/TD3/memory.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-10 15:27:16 @LastEditor: John -@LastEditTime: 2020-06-13 00:29:45 +@LastEditTime: 2020-06-11 21:04:50 @Discription: @Environment: python 3.7.7 ''' @@ -27,8 +27,8 @@ class ReplayBuffer: def sample(self, batch_size): batch = random.sample(self.buffer, batch_size) - state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(np.stack, zip(*batch)) - return state_batch, action_batch, reward_batch, next_state_batch, done_batch + state, action, reward, next_state, done = map(np.stack, zip(*batch)) + return state, action, reward, next_state, done def __len__(self): return len(self.buffer) \ No newline at end of file diff --git a/codes/assets/action_grid.png b/codes/assets/action_grid.png deleted file mode 100644 index 7759f8b..0000000 Binary files a/codes/assets/action_grid.png and 
/dev/null differ diff --git a/codes/assets/track_big.png b/codes/assets/track_big.png deleted file mode 100644 index f7b3dc1..0000000 Binary files a/codes/assets/track_big.png and /dev/null differ diff --git a/codes/common/model.py b/codes/common/model.py index c99d4a3..0b069ea 100644 --- a/codes/common/model.py +++ b/codes/common/model.py @@ -5,12 +5,14 @@ Author: John Email: johnjim0816@gmail.com Date: 2021-03-12 21:14:12 LastEditor: John -LastEditTime: 2021-03-13 13:48:35 +LastEditTime: 2021-03-20 16:44:00 Discription: Environment: ''' +import torch import torch.nn as nn import torch.nn.functional as F +from torch.distributions import Categorical class MLP1(nn.Module): ''' 多层感知机 输入:state维度 @@ -45,4 +47,61 @@ class MLP2(nn.Module): # 各层对应的激活函数 x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) - return self.fc3(x) \ No newline at end of file + return self.fc3(x) + +class Critic(nn.Module): + def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + super(Critic, self).__init__() + + self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear3 = nn.Linear(hidden_size, 1) + # 随机初始化为较小的值 + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, state, action): + # 按维数1拼接 + x = torch.cat([state, action], 1) + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = self.linear3(x) + return x + +class Actor(nn.Module): + def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): + super(Actor, self).__init__() + self.linear1 = nn.Linear(n_obs, hidden_size) + self.linear2 = nn.Linear(hidden_size, hidden_size) + self.linear3 = nn.Linear(hidden_size, n_actions) + + self.linear3.weight.data.uniform_(-init_w, init_w) + self.linear3.bias.data.uniform_(-init_w, init_w) + + def forward(self, x): + x = F.relu(self.linear1(x)) + x = F.relu(self.linear2(x)) + x = F.tanh(self.linear3(x)) + return x + +class ActorCritic(nn.Module): + def __init__(self, n_states, n_actions, hidden_dim=256): + super(ActorCritic, self).__init__() + self.critic = nn.Sequential( + nn.Linear(n_states, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, 1) + ) + + self.actor = nn.Sequential( + nn.Linear(n_states, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, n_actions), + nn.Softmax(dim=1), + ) + + def forward(self, x): + value = self.critic(x) + probs = self.actor(x) + dist = Categorical(probs) + return dist, value \ No newline at end of file diff --git a/codes/ddpg/.vscode/settings.json b/codes/ddpg/.vscode/settings.json deleted file mode 100644 index be0f1ab..0000000 --- a/codes/ddpg/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python" -} \ No newline at end of file diff --git a/codes/ddpg/README.md b/codes/ddpg/README.md index 5a06c53..351615b 100644 --- a/codes/ddpg/README.md +++ b/codes/ddpg/README.md @@ -1,26 +1,5 @@ +# DDPG -python 3.7.9 +## 伪代码 -pytorch 1.6.0 - -tensorboard 2.3.0 - -torchvision 0.7.0 - -train: - -```python -python main.py -``` - -eval: - -```python -python main.py --train 0 -``` - -open tensorboard: - -```python -tensorboard --logdir logs -``` \ No newline at end of file +![image-20210320151900695](assets/image-20210320151900695.png) \ No newline at end of file diff --git a/codes/ddpg/agent.py b/codes/ddpg/agent.py index 3aded73..29f34d6 100644 --- a/codes/ddpg/agent.py +++ b/codes/ddpg/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-09 20:25:52 @LastEditor: John 
-LastEditTime: 2020-09-02 01:19:13 +LastEditTime: 2021-03-17 20:43:25 @Discription: @Environment: python 3.7.7 ''' @@ -14,18 +14,17 @@ import torch import torch.nn as nn import torch.optim as optim -from model import Actor, Critic -from memory import ReplayBuffer +from common.model import Actor, Critic +from common.memory import ReplayBuffer class DDPG: - def __init__(self, n_states, n_actions, hidden_dim=30, device="cpu", critic_lr=1e-3, - actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128): - self.device = device - self.critic = Critic(n_states, n_actions, hidden_dim).to(device) - self.actor = Actor(n_states, n_actions, hidden_dim).to(device) - self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device) - self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device) + def __init__(self, n_states, n_actions, cfg): + self.device = cfg.device + self.critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.target_critic = Critic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) + self.target_actor = Actor(n_states, n_actions, cfg.hidden_dim).to(cfg.device) for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) @@ -33,14 +32,14 @@ class DDPG: target_param.data.copy_(param.data) self.critic_optimizer = optim.Adam( - self.critic.parameters(), lr=critic_lr) - self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr) - self.memory = ReplayBuffer(memory_capacity) - self.batch_size = batch_size - self.soft_tau = soft_tau - self.gamma = gamma + self.critic.parameters(), lr=cfg.critic_lr) + self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr) + self.memory = ReplayBuffer(cfg.memory_capacity) + self.batch_size = cfg.batch_size + self.soft_tau = cfg.soft_tau + self.gamma = cfg.gamma - def select_action(self, state): + def choose_action(self, state): state = torch.FloatTensor(state).unsqueeze(0).to(self.device) action = self.actor(state) # torch.detach()用于切断反向传播 @@ -87,8 +86,8 @@ class DDPG: target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau ) - def save_model(self,path): - torch.save(self.target_actor.state_dict(), path) + def save(self,path): + torch.save(self.target_net.state_dict(), path+'DDPG_checkpoint.pth') - def load_model(self,path): - self.actor.load_state_dict(torch.load(path)) \ No newline at end of file + def load(self,path): + self.actor.load_state_dict(torch.load(path+'DDPG_checkpoint.pth')) \ No newline at end of file diff --git a/codes/ddpg/assets/image-20210320151900695.png b/codes/ddpg/assets/image-20210320151900695.png new file mode 100644 index 0000000..fd41201 Binary files /dev/null and b/codes/ddpg/assets/image-20210320151900695.png differ diff --git a/codes/ddpg/env.py b/codes/ddpg/env.py index 7e707cb..ad7bd0e 100644 --- a/codes/ddpg/env.py +++ b/codes/ddpg/env.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-10 15:28:30 @LastEditor: John -LastEditTime: 2020-09-01 10:57:36 +LastEditTime: 2021-03-19 19:56:46 @Discription: @Environment: python 3.7.7 ''' @@ -29,4 +29,33 @@ class NormalizedActions(gym.ActionWrapper): upper_bound = self.action_space.high action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 action = np.clip(action, low_bound, upper_bound) - return action \ No newline at end of file + return action + +class OUNoise(object): + '''Ornstein–Uhlenbeck + ''' + def __init__(self, 
action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): + self.mu = mu + self.theta = theta + self.sigma = max_sigma + self.max_sigma = max_sigma + self.min_sigma = min_sigma + self.decay_period = decay_period + self.n_actions = action_space.shape[0] + self.low = action_space.low + self.high = action_space.high + self.reset() + + def reset(self): + self.obs = np.ones(self.n_actions) * self.mu + + def evolve_obs(self): + x = self.obs + dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) + self.obs = x + dx + return self.obs + + def get_action(self, action, t=0): + ou_obs = self.evolve_obs() + self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) + return np.clip(action + ou_obs, self.low, self.high) \ No newline at end of file diff --git a/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3 b/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3 deleted file mode 100644 index be5de57..0000000 Binary files a/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3 and /dev/null differ diff --git a/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5 b/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5 deleted file mode 100644 index 3fda6d2..0000000 Binary files a/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5 and /dev/null differ diff --git a/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4 b/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4 deleted file mode 100644 index 485af31..0000000 Binary files a/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4 and /dev/null differ diff --git a/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0 b/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0 deleted file mode 100644 index 5a3a1d1..0000000 Binary files a/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0 and /dev/null differ diff --git a/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2 b/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2 deleted file mode 100644 index 70eb483..0000000 Binary files a/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2 and /dev/null differ diff --git a/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1 b/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1 deleted file mode 100644 index b131138..0000000 Binary files a/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1 and /dev/null differ diff --git a/codes/ddpg/main.py b/codes/ddpg/main.py index 2a2cc52..5308ec6 100644 --- a/codes/ddpg/main.py +++ b/codes/ddpg/main.py @@ -5,74 +5,60 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-11 20:58:21 @LastEditor: John -LastEditTime: 
2020-10-15 21:23:39 +LastEditTime: 2021-03-19 19:57:00 @Discription: @Environment: python 3.7.7 ''' -from token import NUMBER -from typing import Sequence +import sys,os +sys.path.append(os.getcwd()) # 添加当前终端路径 import torch import gym -from agent import DDPG -from env import NormalizedActions -from noise import OUNoise -import os import numpy as np -import argparse -from torch.utils.tensorboard import SummaryWriter import datetime +from DDPG.agent import DDPG +from DDPG.env import NormalizedActions,OUNoise +from common.plot import plot_rewards +from common.utils import save_results -SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") -SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/' +SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/' # 生成保存的模型路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"): # 检测是否存在文件夹 + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/saved_model/") +if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 + os.mkdir(SAVED_MODEL_PATH) +RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/results/"+SEQUENCE+'/' # 存储reward的路径 +if not os.path.exists(os.path.split(os.path.abspath(__file__))[0]+"/results/"): # 检测是否存在文件夹 + os.mkdir(os.path.split(os.path.abspath(__file__))[0]+"/results/") +if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 + os.mkdir(RESULT_PATH) -def get_args(): - '''模型建立好之后只需要在这里调参 - ''' - parser = argparse.ArgumentParser() - parser.add_argument("--train", default=1, type=int) # 1 表示训练,0表示只进行eval - parser.add_argument("--gamma", default=0.99, - type=float) # q-learning中的gamma - parser.add_argument("--critic_lr", default=1e-3, type=float) # critic学习率 - parser.add_argument("--actor_lr", default=1e-4, type=float) - parser.add_argument("--memory_capacity", default=10000, - type=int, help="capacity of Replay Memory") - parser.add_argument("--batch_size", default=128, type=int, - help="batch size of memory sampling") - parser.add_argument("--train_eps", default=200, type=int) - parser.add_argument("--train_steps", default=200, type=int) - parser.add_argument("--eval_eps", default=200, type=int) # 训练的最大episode数目 - parser.add_argument("--eval_steps", default=200, - type=int) # 训练每个episode的长度 - parser.add_argument("--target_update", default=4, type=int, - help="when(every default 10 eisodes) to update target net ") - config = parser.parse_args() - return config - - -def train(cfg): - print('Start to train ! \n') - env = NormalizedActions(gym.make("Pendulum-v0")) - - # 增加action噪声 - ou_noise = OUNoise(env.action_space) - - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - agent = DDPG(n_states, n_actions, device="cpu", critic_lr=1e-3, - actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128) +class DDPGConfig: + def __init__(self): + self.gamma = 0.99 + self.critic_lr = 1e-3 + self.actor_lr = 1e-4 + self.memory_capacity = 10000 + self.batch_size = 128 + self.train_eps =300 + self.train_steps = 200 + self.eval_eps = 200 + self.eval_steps = 200 + self.target_update = 4 + self.hidden_dim = 30 + self.soft_tau=1e-2 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +def train(cfg,env,agent): + print('Start to train ! 
') + ou_noise = OUNoise(env.action_space) # action noise rewards = [] - moving_average_rewards = [] + ma_rewards = [] # moving average rewards ep_steps = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE - writer = SummaryWriter(log_dir) - for i_episode in range(1, cfg.train_eps+1): + for i_episode in range(cfg.train_eps): state = env.reset() ou_noise.reset() ep_reward = 0 - for i_step in range(1, cfg.train_steps+1): - action = agent.select_action(state) + for i_step in range(cfg.train_steps): + action = agent.choose_action(state) action = ou_noise.get_action( action, i_step) # 即paper中的random process next_state, reward, done, _ = env.step(action) @@ -82,80 +68,25 @@ def train(cfg): state = next_state if done: break - print('Episode:', i_episode, ' Reward: %i' % - int(ep_reward), 'n_steps:', i_step) + print('Episode:{}/{}, Reward:{}, Steps:{}, Done:{}'.format(i_episode+1,cfg.train_eps,ep_reward,i_step+1,done)) ep_steps.append(i_step) rewards.append(ep_reward) - if i_episode == 1: - moving_average_rewards.append(ep_reward) + if ma_rewards: + ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) - writer.add_scalar('steps_of_each_episode', - ep_steps[-1], i_episode) - writer.close() + ma_rewards.append(ep_reward) print('Complete training!') - ''' 保存模型 ''' - if not os.path.exists(SAVED_MODEL_PATH): # 检测是否存在文件夹 - os.mkdir(SAVED_MODEL_PATH) - agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth') - '''存储reward等相关结果''' - if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 - os.mkdir(RESULT_PATH) - np.save(RESULT_PATH+'rewards_train.npy', rewards) - np.save(RESULT_PATH+'moving_average_rewards_train.npy', moving_average_rewards) - np.save(RESULT_PATH+'steps_train.npy', ep_steps) - -def eval(cfg, saved_model_path = SAVED_MODEL_PATH): - print('start to eval ! 
\n') - env = NormalizedActions(gym.make("Pendulum-v0")) - n_states = env.observation_space.shape[0] - n_actions = env.action_space.shape[0] - agent = DDPG(n_states, n_actions, critic_lr=1e-3, - actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128) - agent.load_model(saved_model_path+'checkpoint.pth') - rewards = [] - moving_average_rewards = [] - ep_steps = [] - log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE - writer = SummaryWriter(log_dir) - for i_episode in range(1, cfg.eval_eps+1): - state = env.reset() # reset环境状态 - ep_reward = 0 - for i_step in range(1, cfg.eval_steps+1): - action = agent.select_action(state) # 根据当前环境state选择action - next_state, reward, done, _ = env.step(action) # 更新环境参数 - ep_reward += reward - state = next_state # 跳转到下一个状态 - if done: - break - print('Episode:', i_episode, ' Reward: %i' % - int(ep_reward), 'n_steps:', i_step, 'done: ', done) - ep_steps.append(i_step) - rewards.append(ep_reward) - # 计算滑动窗口的reward - if i_episode == 1: - moving_average_rewards.append(ep_reward) - else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode) - writer.add_scalar('steps_of_each_episode', - ep_steps[-1], i_episode) - writer.close() - '''存储reward等相关结果''' - if not os.path.exists(RESULT_PATH): # 检测是否存在文件夹 - os.mkdir(RESULT_PATH) - np.save(RESULT_PATH+'rewards_eval.npy', rewards) - np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards) - np.save(RESULT_PATH+'steps_eval.npy', ep_steps) + return rewards,ma_rewards if __name__ == "__main__": - cfg = get_args() - if cfg.train: - train(cfg) - eval(cfg) - else: - model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/" - eval(cfg,saved_model_path=model_path) + cfg = DDPGConfig() + env = NormalizedActions(gym.make("Pendulum-v0")) + env.seed(1) # 设置env随机种子 + n_states = env.observation_space.shape[0] + n_actions = env.action_space.shape[0] + agent = DDPG(n_states,n_actions,cfg) + rewards,ma_rewards = train(cfg,env,agent) + agent.save(path=SAVED_MODEL_PATH) + save_results(rewards,ma_rewards,tag='train',path=RESULT_PATH) + plot_rewards(rewards,ma_rewards,tag="train",algo = cfg.algo,path=RESULT_PATH) + \ No newline at end of file diff --git a/codes/ddpg/model.py b/codes/ddpg/model.py deleted file mode 100644 index 96a7cdf..0000000 --- a/codes/ddpg/model.py +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-10 15:03:59 -@LastEditor: John -LastEditTime: 2020-08-22 19:09:54 -@Discription: -@Environment: python 3.7.7 -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - -class Critic(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): - super(Critic, self).__init__() - - self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) - self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, 1) - # 随机初始化为较小的值 - self.linear3.weight.data.uniform_(-init_w, init_w) - self.linear3.bias.data.uniform_(-init_w, init_w) - - def forward(self, state, action): - # 按维数1拼接 - x = torch.cat([state, action], 1) - x = F.relu(self.linear1(x)) - x = F.relu(self.linear2(x)) - x = self.linear3(x) - return x - -class Actor(nn.Module): - def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): - super(Actor, self).__init__() - self.linear1 = nn.Linear(n_obs, hidden_size) - 
self.linear2 = nn.Linear(hidden_size, hidden_size) - self.linear3 = nn.Linear(hidden_size, n_actions) - - self.linear3.weight.data.uniform_(-init_w, init_w) - self.linear3.bias.data.uniform_(-init_w, init_w) - - def forward(self, x): - x = F.relu(self.linear1(x)) - x = F.relu(self.linear2(x)) - x = F.tanh(self.linear3(x)) - return x - \ No newline at end of file diff --git a/codes/ddpg/noise.py b/codes/ddpg/noise.py deleted file mode 100644 index 50fcec8..0000000 --- a/codes/ddpg/noise.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 20:58:59 -@LastEditor: John -@LastEditTime: 2020-06-11 20:59:20 -@Discription: -@Environment: python 3.7.7 -''' -import numpy as np - -class OUNoise(object): - def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): - self.mu = mu - self.theta = theta - self.sigma = max_sigma - self.max_sigma = max_sigma - self.min_sigma = min_sigma - self.decay_period = decay_period - self.n_actions = action_space.shape[0] - self.low = action_space.low - self.high = action_space.high - self.reset() - - def reset(self): - self.obs = np.ones(self.n_actions) * self.mu - - def evolve_obs(self): - x = self.obs - dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions) - self.obs = x + dx - return self.obs - - def get_action(self, action, t=0): - ou_obs = self.evolve_obs() - self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) - return np.clip(action + ou_obs, self.low, self.high) \ No newline at end of file diff --git a/codes/ddpg/plot.py b/codes/ddpg/plot.py deleted file mode 100644 index f25efb0..0000000 --- a/codes/ddpg/plot.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 16:30:09 -@LastEditor: John -LastEditTime: 2020-10-15 21:32:05 -@Discription: -@Environment: python 3.7.7 -''' -import matplotlib.pyplot as plt -import seaborn as sns -import numpy as np -import os - -def plot_results(item,ylabel='rewards_train', save_fig = True): - '''plot using searborn to plot - ''' - sns.set() - plt.figure() - plt.plot(np.arange(len(item)), item) - plt.title(ylabel+' of DDPG') - plt.ylabel(ylabel) - plt.xlabel('episodes') - if save_fig: - plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png") - plt.show() - -if __name__ == "__main__": - - output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/" - tag = 'train' - rewards=np.load(output_path+"rewards_"+tag+".npy", ) - moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",) - steps=np.load(output_path+"steps_"+tag+".npy") - plot_results(rewards) - plot_results(moving_average_rewards,ylabel='moving_average_rewards_'+tag) - plot_results(steps,ylabel='steps_'+tag) - tag = 'eval' - rewards=np.load(output_path+"rewards_"+tag+".npy", ) - moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",) - steps=np.load(output_path+"steps_"+tag+".npy") - plot_results(rewards,ylabel='rewards_'+tag) - plot_results(moving_average_rewards,ylabel='moving_average_rewards_'+tag) - plot_results(steps,ylabel='steps_'+tag) diff --git a/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy b/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy deleted file mode 100644 index 892177b..0000000 Binary files a/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy and 
/dev/null differ diff --git a/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy b/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy deleted file mode 100644 index baae56c..0000000 Binary files a/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/ddpg/result/20201015-193308/rewards_eval copy.npy b/codes/ddpg/result/20201015-193308/rewards_eval copy.npy deleted file mode 100644 index 22c1d74..0000000 Binary files a/codes/ddpg/result/20201015-193308/rewards_eval copy.npy and /dev/null differ diff --git a/codes/ddpg/result/20201015-193308/rewards_eval.npy b/codes/ddpg/result/20201015-193308/rewards_eval.npy deleted file mode 100644 index 22c1d74..0000000 Binary files a/codes/ddpg/result/20201015-193308/rewards_eval.npy and /dev/null differ diff --git a/codes/ddpg/result/20201015-193308/rewards_train.npy b/codes/ddpg/result/20201015-193308/rewards_train.npy deleted file mode 100644 index 2f86826..0000000 Binary files a/codes/ddpg/result/20201015-193308/rewards_train.npy and /dev/null differ diff --git a/codes/ddpg/result/20201015-193308/steps_train.npy b/codes/ddpg/result/20201015-193308/steps_train.npy deleted file mode 100644 index 59825bb..0000000 Binary files a/codes/ddpg/result/20201015-193308/steps_train.npy and /dev/null differ diff --git a/codes/ddpg/result/moving_average_rewards_eval.npy b/codes/ddpg/result/moving_average_rewards_eval.npy deleted file mode 100644 index 892177b..0000000 Binary files a/codes/ddpg/result/moving_average_rewards_eval.npy and /dev/null differ diff --git a/codes/ddpg/result/moving_average_rewards_eval.png b/codes/ddpg/result/moving_average_rewards_eval.png deleted file mode 100644 index 3e9c92f..0000000 Binary files a/codes/ddpg/result/moving_average_rewards_eval.png and /dev/null differ diff --git a/codes/ddpg/result/moving_average_rewards_train.npy b/codes/ddpg/result/moving_average_rewards_train.npy deleted file mode 100644 index baae56c..0000000 Binary files a/codes/ddpg/result/moving_average_rewards_train.npy and /dev/null differ diff --git a/codes/ddpg/result/moving_average_rewards_train.png b/codes/ddpg/result/moving_average_rewards_train.png deleted file mode 100644 index 666e14d..0000000 Binary files a/codes/ddpg/result/moving_average_rewards_train.png and /dev/null differ diff --git a/codes/ddpg/result/rewards_eval.npy b/codes/ddpg/result/rewards_eval.npy deleted file mode 100644 index 22c1d74..0000000 Binary files a/codes/ddpg/result/rewards_eval.npy and /dev/null differ diff --git a/codes/ddpg/result/rewards_eval.png b/codes/ddpg/result/rewards_eval.png deleted file mode 100644 index f7b3c04..0000000 Binary files a/codes/ddpg/result/rewards_eval.png and /dev/null differ diff --git a/codes/ddpg/result/rewards_train.npy b/codes/ddpg/result/rewards_train.npy deleted file mode 100644 index 2f86826..0000000 Binary files a/codes/ddpg/result/rewards_train.npy and /dev/null differ diff --git a/codes/ddpg/result/rewards_train.png b/codes/ddpg/result/rewards_train.png deleted file mode 100644 index ee4862f..0000000 Binary files a/codes/ddpg/result/rewards_train.png and /dev/null differ diff --git a/codes/ddpg/result/steps_eval.npy b/codes/ddpg/result/steps_eval.npy deleted file mode 100644 index 59825bb..0000000 Binary files a/codes/ddpg/result/steps_eval.npy and /dev/null differ diff --git a/codes/ddpg/result/steps_eval.png b/codes/ddpg/result/steps_eval.png deleted file mode 100644 index d6d77d7..0000000 Binary files a/codes/ddpg/result/steps_eval.png and /dev/null 
differ diff --git a/codes/ddpg/result/steps_train.npy b/codes/ddpg/result/steps_train.npy deleted file mode 100644 index 59825bb..0000000 Binary files a/codes/ddpg/result/steps_train.npy and /dev/null differ diff --git a/codes/ddpg/result/steps_train.png b/codes/ddpg/result/steps_train.png deleted file mode 100644 index c6a9675..0000000 Binary files a/codes/ddpg/result/steps_train.png and /dev/null differ diff --git a/codes/ddpg/saved_model/20201015-193308/checkpoint.pth b/codes/ddpg/saved_model/20201015-193308/checkpoint.pth deleted file mode 100644 index e07405b..0000000 Binary files a/codes/ddpg/saved_model/20201015-193308/checkpoint.pth and /dev/null differ diff --git a/codes/ddpg/saved_model/checkpoint.pth b/codes/ddpg/saved_model/checkpoint.pth deleted file mode 100644 index b39ee07..0000000 Binary files a/codes/ddpg/saved_model/checkpoint.pth and /dev/null differ diff --git a/codes/ddpg/utils.py b/codes/ddpg/utils.py deleted file mode 100644 index f4cde57..0000000 --- a/codes/ddpg/utils.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: John -Email: johnjim0816@gmail.com -Date: 2020-10-15 21:31:19 -LastEditor: John -LastEditTime: 2020-10-15 21:31:25 -Discription: -Environment: -''' -import os -import numpy as np -RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/' - -def save_results(rewards,moving_average_rewards,ep_steps,path=RESULT_PATH): - if not os.path.exists(path): # 检测是否存在文件夹 - os.mkdir(path) - np.save(RESULT_PATH+'rewards_train.npy', rewards) - np.save(RESULT_PATH+'moving_average_rewards_train.npy', moving_average_rewards) - np.save(RESULT_PATH+'steps_train.npy',ep_steps ) \ No newline at end of file diff --git a/codes/dqn_cnn/dqn.py b/codes/dqn_cnn/dqn.py deleted file mode 100644 index 3da4f3e..0000000 --- a/codes/dqn_cnn/dqn.py +++ /dev/null @@ -1,107 +0,0 @@ -import random -import math -import torch -import torch.optim as optim -import torch.nn.functional as F -from memory import ReplayBuffer -from model import CNN - - -class DQN: - def __init__(self, screen_height=0, screen_width=0, n_actions=0, gamma=0.999, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, batch_size=128, device="cpu"): - self.actions_count = 0 - self.n_actions = n_actions - self.device = device - self.gamma = gamma - self.epsilon = 0 - self.epsilon_start = epsilon_start - self.epsilon_end = epsilon_end - self.epsilon_decay = epsilon_decay - self.batch_size = batch_size - self.policy_net = CNN(screen_height, screen_width, - n_actions).to(self.device) - self.target_net = CNN(screen_height, screen_width, - n_actions).to(self.device) - self.target_net.load_state_dict(self.policy_net.state_dict()) - self.target_net.eval() # 不启用 BatchNormalization 和 Dropout - self.optimizer = optim.RMSprop(self.policy_net.parameters()) - self.loss = 0 - self.memory = ReplayBuffer(memory_capacity) - - - def select_action(self, state): - '''choose_action [summary] - Args: - state [torch tensor]: [description] - Returns: - actions [torch tensor]: [description] - ''' - sample = random.random() - self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ - math.exp(-1. * self.actions_count / self.epsilon_decay) - self.actions_count += 1 - if sample > self.epsilon: - with torch.no_grad(): - # t.max(1) will return largest column value of each row. - # second column on max result is index of where max element was - # found, so we pick action with the larger expected reward. 
- - q_value = self.policy_net(state) # q_value比如tensor([[-0.2522, 0.3887]]) - action = q_value.max(1)[1].view(1, 1) # q_value最大对应的下标,注意该action是个张量,如tensor([1]) - return action - else: - return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long) - - def update(self): - if len(self.memory) < self.batch_size: - return - transitions = self.memory.sample(self.batch_size) - # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for - # detailed explanation). This converts batch-array of Transitions - # to Transition of batch-arrays. - batch = self.memory.Transition(*zip(*transitions)) - - # Compute a mask of non-final states and concatenate the batch elements - # (a final state would've been the one after which simulation ended) - non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, - batch.next_state)), device=self.device, dtype=torch.bool) - - non_final_next_states = torch.cat([s for s in batch.next_state - if s is not None]) - state_batch = torch.cat(batch.state) - action_batch = torch.cat(batch.action) - reward_batch = torch.cat(batch.reward) # tensor([1., 1.,...,]) - - - # Compute Q(s_t, a) - the model computes Q(s_t), then we select the - # columns of actions taken. These are the actions which would've been taken - # for each batch state according to policy_net - state_action_values = self.policy_net( - state_batch).gather(1, action_batch) #tensor([[ 1.1217],...,[ 0.8314]]) - - # Compute V(s_{t+1}) for all next states. - # Expected values of actions for non_final_next_states are computed based - # on the "older" target_net; selecting their best reward with max(1)[0]. - # This is merged based on the mask, such that we'll have either the expected - # state value or 0 in case the state was final. - next_state_values = torch.zeros(self.batch_size, device=self.device) - - next_state_values[non_final_mask] = self.target_net( - non_final_next_states).max(1)[0].detach() - - # Compute the expected Q values - expected_state_action_values = (next_state_values * self.gamma) + reward_batch # tensor([0.9685, 0.9683,...,]) - - # Compute Huber loss - self.loss = F.smooth_l1_loss( - state_action_values, expected_state_action_values.unsqueeze(1)) # .unsqueeze增加一个维度 - # Optimize the model - self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise you’d just accumulate the gradients from all loss.backward() calls). - self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation. - for param in self.policy_net.parameters(): # clip防止梯度爆炸 - param.grad.data.clamp_(-1, 1) - self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters. 
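The `update()` above is the standard one-step DQN backup: Q(s_t, a_t) is regressed toward r_t + gamma * max_a' Q_target(s_{t+1}, a'), with no bootstrap on terminal states and a Huber (smooth L1) loss. A minimal sketch of just the target-and-loss computation, written with an explicit `dones` vector instead of the non-final-state mask used above; `policy_net` and `target_net` stand for any modules mapping a batch of states to per-action Q-values.

```python
import torch
import torch.nn.functional as F

def dqn_loss(policy_net, target_net, states, actions, rewards, next_states, dones, gamma=0.999):
    """One-step DQN target with a frozen target network and Huber (smooth L1) loss.
    states: (B, state_dim), actions: (B, 1) int64, rewards/dones: (B,) float."""
    q_sa = policy_net(states).gather(1, actions)            # Q(s_t, a_t) for the actions actually taken
    with torch.no_grad():
        next_q = target_net(next_states).max(1)[0]          # max_a' Q_target(s_{t+1}, a')
        target = rewards + gamma * next_q * (1.0 - dones)   # zero bootstrap on terminal states
    return F.smooth_l1_loss(q_sa, target.unsqueeze(1))
```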
- - -if __name__ == "__main__": - dqn = DQN() diff --git a/codes/dqn_cnn/main.py b/codes/dqn_cnn/main.py deleted file mode 100644 index 5cacee3..0000000 --- a/codes/dqn_cnn/main.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 10:01:09 -@LastEditor: John -@LastEditTime: 2020-06-13 00:24:31 -@Discription: -@Environment: python 3.7.7 -''' -''' -应该是没有收敛,但是pytorch官方教程的结果也差不多 -''' -import gym -import torch - -from screen_state import get_screen -from dqn import DQN -from plot import plot - -import argparse - -def get_args(): - '''模型建立好之后只需要在这里调参 - ''' - parser = argparse.ArgumentParser() - - parser.add_argument("--gamma", default=0.999, type=float) # q-learning中的gamma - parser.add_argument("--epsilon_start", default=0.9, type=float) # 基于贪心选择action对应的参数epsilon - parser.add_argument("--epsilon_end", default=0.05, type=float) - parser.add_argument("--epsilon_decay", default=200, type=float) - - parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory") - - parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling") - parser.add_argument("--max_episodes", default=100, type=int) - parser.add_argument("--max_steps", default=200, type=int) - parser.add_argument("--target_update", default=4, type=int,help="when(every default 10 eisodes) to update target net ") - config = parser.parse_args() - - return config - -if __name__ == "__main__": - - cfg = get_args() - # if gpu is to be used - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - # Get screen size so that we can initialize layers correctly based on shape - # returned from AI gym. Typical dimensions at this point are close to 3x40x90 - # which is the result of a clamped and down-scaled render buffer in get_screen(env,device) - env = gym.make('CartPole-v0').unwrapped - env.reset() - init_screen = get_screen(env, device) - _, _, screen_height, screen_width = init_screen.shape - # Get number of actions from gym action space - n_actions = env.action_space.n - agent = DQN(screen_height=screen_height, screen_width=screen_width, - n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, memory_capacity=cfg.memory_capacity,batch_size=cfg.batch_size) - - rewards = [] - moving_average_rewards = [] - for i_episode in range(1,cfg.max_episodes+1): - # Initialize the environment and state - env.reset() - last_screen = get_screen(env, device) - current_screen = get_screen(env, device) - state = current_screen - last_screen - ep_reward = 0 - for t in range(1,cfg.max_steps+1): - # Select and perform an action - action = agent.select_action(state) - _, reward, done, _ = env.step(action.item()) - ep_reward += reward - reward = torch.tensor([reward], device=device) - # Observe new state - last_screen = current_screen - current_screen = get_screen(env, device) - - if done: break - next_state = current_screen - last_screen - - # Store the transition in memory - agent.memory.push(state, action, next_state, reward) - - # Move to the next state - state = next_state - - # Perform one step of the optimization (on the target network) - agent.update() - - # Update the target network, copying all weights and biases in DQN - if i_episode % cfg.target_update == 0: - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - print('Episode:', i_episode, ' Reward: %i' %int(ep_reward), 'Explore: 
%.2f' % agent.epsilon) - rewards.append(ep_reward) - if i_episode == 1: - moving_average_rewards.append(ep_reward) - else: - moving_average_rewards.append( - 0.9*moving_average_rewards[-1]+0.1*ep_reward) - - import os - import numpy as np - output_path = os.path.dirname(__file__)+"/result/" - if not os.path.exists(output_path): - os.mkdir(output_path) - np.save(output_path+"rewards.npy", rewards) - np.save(output_path+"moving_average_rewards.npy", moving_average_rewards) - print('Complete!') - plot(rewards) - plot(moving_average_rewards,ylabel="moving_average_rewards") - - diff --git a/codes/dqn_cnn/memory.py b/codes/dqn_cnn/memory.py deleted file mode 100644 index 4b9c59c..0000000 --- a/codes/dqn_cnn/memory.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 09:42:44 -@LastEditor: John -@LastEditTime: 2020-06-11 15:50:33 -@Discription: -@Environment: python 3.7.7 -''' -from collections import namedtuple -import random - - - -class ReplayBuffer(object): - - def __init__(self, capacity): - self.capacity = capacity - self.buffer = [] - self.position = 0 - self.Transition = namedtuple('Transition', - ('state', 'action', 'next_state', 'reward')) - - def push(self, *args): - """Saves a transition.""" - if len(self.buffer) < self.capacity: - self.buffer.append(None) - self.buffer[self.position] = self.Transition(*args) - self.position = (self.position + 1) % self.capacity - - def sample(self, batch_size): - return random.sample(self.buffer, batch_size) - - def __len__(self): - return len(self.buffer) diff --git a/codes/dqn_cnn/model.py b/codes/dqn_cnn/model.py deleted file mode 100644 index 71e67ca..0000000 --- a/codes/dqn_cnn/model.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 12:18:12 -@LastEditor: John -@LastEditTime: 2020-06-11 17:23:45 -@Discription: -@Environment: python 3.7.7 -''' -import torch.nn as nn -import torch.nn.functional as F - -class CNN(nn.Module): - - def __init__(self, h, w, n_outputs): - super(CNN, self).__init__() - self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2) - self.bn1 = nn.BatchNorm2d(16) - self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2) - self.bn2 = nn.BatchNorm2d(32) - self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2) - self.bn3 = nn.BatchNorm2d(32) - - # Number of Linear input connections depends on output of conv2d layers - # and therefore the input image size, so compute it. - def conv2d_size_out(size, kernel_size = 5, stride = 2): - return (size - (kernel_size - 1) - 1) // stride + 1 - convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) - convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) - linear_input_size = convw * convh * 32 - self.head = nn.Linear(linear_input_size, n_outputs) - - # Called with either one element to determine next action, or a batch - # during optimization. Returns tensor([[left0exp,right0exp]...]). 
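`conv2d_size_out` above is the usual valid-convolution output-size formula, (size - (kernel_size - 1) - 1) // stride + 1. As a worked check, assuming the roughly 3x40x90 screens mentioned in the deleted `main.py`, the three kernel-5 / stride-2 convolutions shrink the input to 2x8, so the linear head sees 2 * 8 * 32 = 512 features:

```python
def conv2d_size_out(size, kernel_size=5, stride=2):
    return (size - (kernel_size - 1) - 1) // stride + 1

h, w = 40, 90                  # assumed screen size (see the 3x40x90 note in main.py)
for _ in range(3):             # conv1 -> conv2 -> conv3, all kernel 5, stride 2
    h, w = conv2d_size_out(h), conv2d_size_out(w)
print(h, w, h * w * 32)        # 2 8 512 -> nn.Linear(512, n_outputs)
```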
- def forward(self, x): - x = F.relu(self.bn1(self.conv1(x))) - x = F.relu(self.bn2(self.conv2(x))) - x = F.relu(self.bn3(self.conv3(x))) - return self.head(x.view(x.size(0), -1)) \ No newline at end of file diff --git a/codes/dqn_cnn/plot.py b/codes/dqn_cnn/plot.py deleted file mode 100644 index 2579f86..0000000 --- a/codes/dqn_cnn/plot.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 16:30:09 -@LastEditor: John -@LastEditTime: 2020-06-11 22:27:24 -@Discription: -@Environment: python 3.7.7 -''' -import matplotlib.pyplot as plt -import numpy as np -import os - -def plot(item,ylabel='rewards'): - plt.figure() - plt.plot(np.arange(len(item)), item) - plt.title(ylabel+' of CnnDQN') - plt.ylabel('rewards') - plt.xlabel('episodes') - - plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png") - plt.show() diff --git a/codes/dqn_cnn/result/moving_average_rewards.npy b/codes/dqn_cnn/result/moving_average_rewards.npy deleted file mode 100644 index a132de3..0000000 Binary files a/codes/dqn_cnn/result/moving_average_rewards.npy and /dev/null differ diff --git a/codes/dqn_cnn/result/moving_average_rewards.png b/codes/dqn_cnn/result/moving_average_rewards.png deleted file mode 100644 index c2cf945..0000000 Binary files a/codes/dqn_cnn/result/moving_average_rewards.png and /dev/null differ diff --git a/codes/dqn_cnn/result/rewards.npy b/codes/dqn_cnn/result/rewards.npy deleted file mode 100644 index 551e671..0000000 Binary files a/codes/dqn_cnn/result/rewards.npy and /dev/null differ diff --git a/codes/dqn_cnn/result/rewards.png b/codes/dqn_cnn/result/rewards.png deleted file mode 100644 index 9cfbf4a..0000000 Binary files a/codes/dqn_cnn/result/rewards.png and /dev/null differ diff --git a/codes/dqn_cnn/screen_state.py b/codes/dqn_cnn/screen_state.py deleted file mode 100644 index 402eead..0000000 --- a/codes/dqn_cnn/screen_state.py +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -@Author: John -@Email: johnjim0816@gmail.com -@Date: 2020-06-11 10:02:35 -@LastEditor: John -@LastEditTime: 2020-06-11 16:57:34 -@Discription: -@Environment: python 3.7.7 -''' - -import numpy as np -import torch -import torchvision.transforms as T -from PIL import Image - -resize = T.Compose([T.ToPILImage(), - T.Resize(40, interpolation=Image.CUBIC), - T.ToTensor()]) - - -def get_cart_location(env,screen_width): - world_width = env.x_threshold * 2 - scale = screen_width / world_width - return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART - -def get_screen(env,device): - # Returned screen requested by gym is 400x600x3, but is sometimes larger - # such as 800x1200x3. Transpose it into torch order (CHW). 
- screen = env.render(mode='rgb_array').transpose((2, 0, 1)) - # Cart is in the lower half, so strip off the top and bottom of the screen - _, screen_height, screen_width = screen.shape - screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)] - view_width = int(screen_width * 0.6) - cart_location = get_cart_location(env,screen_width) - if cart_location < view_width // 2: - slice_range = slice(view_width) - elif cart_location > (screen_width - view_width // 2): - slice_range = slice(-view_width, None) - else: - slice_range = slice(cart_location - view_width // 2, - cart_location + view_width // 2) - # Strip off the edges, so that we have a square image centered on a cart - screen = screen[:, :, slice_range] - # Convert to float, rescale, convert to torch tensor - # (this doesn't require a copy) - screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 - screen = torch.from_numpy(screen) - # Resize, and add a batch dimension (BCHW) - return resize(screen).unsqueeze(0).to(device) - -if __name__ == "__main__": - - import gym - env = gym.make('CartPole-v0').unwrapped - # if gpu is to be used - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - env.reset() - import matplotlib.pyplot as plt - - plt.figure() - plt.imshow(get_screen(env,device).cpu().squeeze(0).permute(1, 2, 0).numpy(), - interpolation='none') - plt.title('Example extracted screen') - plt.show() \ No newline at end of file diff --git a/codes/env_info.md b/codes/env_info.md index 0491a37..0cafc2b 100644 --- a/codes/env_info.md +++ b/codes/env_info.md @@ -1,54 +1,13 @@ -# 环境说明 +## 环境说明 -## [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0) +### [CartPole v0](https://github.com/openai/gym/wiki/CartPole-v0) image-20200820174307301 通过向左或向右推车能够实现平衡,所以动作空间由两个动作组成。每进行一个step就会给一个reward,如果无法保持平衡那么done等于true,本次episode失败。理想状态下,每个episode至少能进行200个step,也就是说每个episode的reward总和至少为200,step数目至少为200 -## [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) +### [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) image-20200820174814084 -钟摆以随机位置开始,目标是将其摆动,使其保持向上直立。动作空间是连续的,值的区间为[-2,2]。每个step给的reward最低为-16.27,最高为0。目前最好的成绩是100个episode的reward之和为-123.11 ± 6.86。 - - - -## The Racetrack - -We have implemented a custom environment called "Racetrack" for you to use during this piece of coursework. It is inspired by the environment described in the course textbook (Reinforcement Learning, Sutton & Barto, 2018, Exercise 5.12), but is not exactly the same. - -### Environment Description - -Consider driving a race car around a turn on a racetrack. In order to complete the race as quickly as possible, you would want to drive as fast as you can but, to avoid running off the track, you must slow down while turning. - -In our simplified racetrack environment, the agent is at one of a discrete set of grid positions. The agent also has a discrete speed in two directions, $x$ and $y$. So the state is represented as follows: -$$(\text{position}_y, \text{position}_x, \text{velocity}_y, \text{velocity}_x)$$ - -The agent collects a reward of -1 at each time step, an additional -10 for leaving the track (i.e., ending up on a black grid square in the figure below), and an additional +10 for reaching the finish line (any of the red grid squares). The agent starts each episode in a randomly selected grid-square on the starting line (green grid squares) with a speed of zero in both directions. At each time step, the agent can change its speed in both directions. 
Each speed can be changed by +1, -1 or 0, giving a total of nine actions. For example, the agent may increase its speed in the $x$ direction by -1 and its speed in the $y$ direction by +1. The agent's speed cannot be greater than +10 or less than -10 in either direction. - -![track_big](assets/track_big.png) - - -The agent's next state is determined by its current grid square, its current speed in two directions, and the changes it makes to its speed in the two directions. This environment is stochastic. When the agent tries to change its speed, no change occurs (in either direction) with probability 0.2. In other words, 20% of the time, the agent's action is ignored and the car's speed remains the same in both directions. - -If the agent leaves the track, it is returned to a random start grid-square and has its speed set to zero in both directions; the episode continues. An episode ends only when the agent transitions to a goal grid-square. - - - -### Environment Implementation - -See `racetrack_env.py` file, We provide a `RacetrackEnv` class for your agents to interact with. The class has the following methods: - -- **`reset()`** - this method initialises the environment, chooses a random starting state, and returns it. This method should be called before the start of every episode. -- **`step(action)`** - this method takes an integer action (more on this later), and executes one time-step in the environment. It returns a tuple containing the next state, the reward collected, and whether the next state is a terminal state. -- **`render(sleep_time)`** - this method renders a matplotlib graph representing the environment. It takes an optional float parameter giving the number of seconds to display each time-step. This method is useful for testing and debugging, but should not be used during training since it is *very* slow. **Do not use this method in your final submission**. -- **`get_actions()`** - a simple method that returns the available actions in the current state. Always returns a list containing integers in the range [0-8] (more on this later). - -In our code, states are represented as Python tuples - specifically a tuple of four integers. For example, if the agent is in a grid square with coordinates ($Y = 2$, $X = 3$), and is moving zero cells vertically and one cell horizontally per time-step, the state is represented as `(2, 3, 0, 1)`. Tuples of this kind will be returned by the `reset()` and `step(action)` methods. - -There are nine actions available to the agent in each state, as described above. However, to simplify your code, we have represented each of the nine actions as an integer in the range [0-8]. The table below shows the index of each action, along with the corresponding changes it will cause to the agent's speed in each direction. - -action_grid - -For example, taking action 8 will increase the agent's speed in the $x$ direction, but decrease its speed in the $y$ direction. 
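Although this Racetrack write-up is being removed from `env_info.md`, the environment itself is added below as `codes/envs/racetrack_env.py`. A minimal random-policy rollout against the documented interface (`reset()`, `step(action)`, `get_actions()`); the import path is an assumption, and the 20% chance of an ignored action and the -1/-10/+10 rewards are handled inside the environment, not here.

```python
import random
from racetrack_env import RacetrackEnv  # assumes codes/envs is on the import path

env = RacetrackEnv()
state = env.reset()                      # (y_pos, x_pos, y_velocity, x_velocity), velocity starts at (0, 0)
ep_return = 0
for t in range(100000):                  # cap the rollout; a purely random policy can take very long
    action = random.choice(env.get_actions())   # an integer in [0-8], see the action table above
    state, reward, terminal = env.step(action)
    ep_return += reward
    if terminal:
        break
print("episode return:", ep_return)
```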
\ No newline at end of file +钟摆以随机位置开始,目标是将其摆动,使其保持向上直立。动作空间是连续的,值的区间为[-2,2]。每个step给的reward最低为-16.27,最高为0。目前最好的成绩是100个episode的reward之和为-123.11 ± 6.86。 \ No newline at end of file diff --git a/codes/envs/blackjack.py b/codes/envs/blackjack.py new file mode 100644 index 0000000..87f02d2 --- /dev/null +++ b/codes/envs/blackjack.py @@ -0,0 +1,122 @@ +import gym +from gym import spaces +from gym.utils import seeding + +def cmp(a, b): + return int((a > b)) - int((a < b)) + +# 1 = Ace, 2-10 = Number cards, Jack/Queen/King = 10 +deck = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10] + + +def draw_card(np_random): + return np_random.choice(deck) + + +def draw_hand(np_random): + return [draw_card(np_random), draw_card(np_random)] + + +def usable_ace(hand): # Does this hand have a usable ace? + return 1 in hand and sum(hand) + 10 <= 21 + + +def sum_hand(hand): # Return current hand total + if usable_ace(hand): + return sum(hand) + 10 + return sum(hand) + + +def is_bust(hand): # Is this hand a bust? + return sum_hand(hand) > 21 + + +def score(hand): # What is the score of this hand (0 if bust) + return 0 if is_bust(hand) else sum_hand(hand) + + +def is_natural(hand): # Is this hand a natural blackjack? + return sorted(hand) == [1, 10] + + +class BlackjackEnv(gym.Env): + """Simple blackjack environment + Blackjack is a card game where the goal is to obtain cards that sum to as + near as possible to 21 without going over. They're playing against a fixed + dealer. + Face cards (Jack, Queen, King) have point value 10. + Aces can either count as 11 or 1, and it's called 'usable' at 11. + This game is placed with an infinite deck (or with replacement). + The game starts with each (player and dealer) having one face up and one + face down card. + The player can request additional cards (hit=1) until they decide to stop + (stick=0) or exceed 21 (bust). + After the player sticks, the dealer reveals their facedown card, and draws + until their sum is 17 or greater. If the dealer goes bust the player wins. + If neither player nor dealer busts, the outcome (win, lose, draw) is + decided by whose sum is closer to 21. The reward for winning is +1, + drawing is 0, and losing is -1. + The observation of a 3-tuple of: the players current sum, + the dealer's one showing card (1-10 where 1 is ace), + and whether or not the player holds a usable ace (0 or 1). + This environment corresponds to the version of the blackjack problem + described in Example 5.1 in Reinforcement Learning: An Introduction + by Sutton and Barto (1998). 
+ https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html + """ + def __init__(self, natural=False): + self.action_space = spaces.Discrete(2) + self.observation_space = spaces.Tuple(( + spaces.Discrete(32), + spaces.Discrete(11), + spaces.Discrete(2))) + self._seed() + + # Flag to payout 1.5 on a "natural" blackjack win, like casino rules + # Ref: http://www.bicyclecards.com/how-to-play/blackjack/ + self.natural = natural + # Start the first game + self._reset() # Number of + self.n_actions = 2 + + def reset(self): + return self._reset() + + def step(self, action): + return self._step(action) + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, action): + assert self.action_space.contains(action) + if action: # hit: add a card to players hand and return + self.player.append(draw_card(self.np_random)) + if is_bust(self.player): + done = True + reward = -1 + else: + done = False + reward = 0 + else: # stick: play out the dealers hand, and score + done = True + while sum_hand(self.dealer) < 17: + self.dealer.append(draw_card(self.np_random)) + reward = cmp(score(self.player), score(self.dealer)) + if self.natural and is_natural(self.player) and reward == 1: + reward = 1.5 + return self._get_obs(), reward, done, {} + + def _get_obs(self): + return (sum_hand(self.player), self.dealer[0], usable_ace(self.player)) + + def _reset(self): + self.dealer = draw_hand(self.np_random) + self.player = draw_hand(self.np_random) + + # Auto-draw another card if the score is less than 12 + while sum_hand(self.player) < 12: + self.player.append(draw_card(self.np_random)) + + return self._get_obs() diff --git a/codes/envs/cliff_walking.py b/codes/envs/cliff_walking.py new file mode 100644 index 0000000..05b9b2e --- /dev/null +++ b/codes/envs/cliff_walking.py @@ -0,0 +1,84 @@ +import numpy as np +import sys +from gym.envs.toy_text import discrete + + +UP = 0 +RIGHT = 1 +DOWN = 2 +LEFT = 3 + +class CliffWalkingEnv(discrete.DiscreteEnv): + + metadata = {'render.modes': ['human', 'ansi']} + + def _limit_coordinates(self, coord): + coord[0] = min(coord[0], self.shape[0] - 1) + coord[0] = max(coord[0], 0) + coord[1] = min(coord[1], self.shape[1] - 1) + coord[1] = max(coord[1], 0) + return coord + + def _calculate_transition_prob(self, current, delta): + new_position = np.array(current) + np.array(delta) + new_position = self._limit_coordinates(new_position).astype(int) + new_state = np.ravel_multi_index(tuple(new_position), self.shape) + reward = -100.0 if self._cliff[tuple(new_position)] else -1.0 + is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (3,11)) + return [(1.0, new_state, reward, is_done)] + + def __init__(self): + self.shape = (4, 12) + + nS = np.prod(self.shape) + n_actions = 4 + + # Cliff Location + self._cliff = np.zeros(self.shape, dtype=np.bool) + self._cliff[3, 1:-1] = True + + # Calculate transition probabilities + P = {} + for s in range(nS): + position = np.unravel_index(s, self.shape) + P[s] = { a : [] for a in range(n_actions) } + P[s][UP] = self._calculate_transition_prob(position, [-1, 0]) + P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1]) + P[s][DOWN] = self._calculate_transition_prob(position, [1, 0]) + P[s][LEFT] = self._calculate_transition_prob(position, [0, -1]) + + # We always start in state (3, 0) + isd = np.zeros(nS) + isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 + + super(CliffWalkingEnv, self).__init__(nS, n_actions, P, isd) + + def render(self, mode='human', close=False): + 
self._render(mode, close) + + def _render(self, mode='human', close=False): + if close: + return + + outfile = StringIO() if mode == 'ansi' else sys.stdout + + for s in range(self.nS): + position = np.unravel_index(s, self.shape) + # print(self.s) + if self.s == s: + output = " x " + elif position == (3,11): + output = " T " + elif self._cliff[position]: + output = " C " + else: + output = " o " + + if position[1] == 0: + output = output.lstrip() + if position[1] == self.shape[1] - 1: + output = output.rstrip() + output += "\n" + + outfile.write(output) + outfile.write("\n") diff --git a/codes/envs/gridworld.py b/codes/envs/gridworld.py new file mode 100644 index 0000000..cf3aec2 --- /dev/null +++ b/codes/envs/gridworld.py @@ -0,0 +1,125 @@ +import io +import numpy as np +import sys +from gym.envs.toy_text import discrete + +UP = 0 +RIGHT = 1 +DOWN = 2 +LEFT = 3 + +class GridworldEnv(discrete.DiscreteEnv): + """ + Grid World environment from Sutton's Reinforcement Learning book chapter 4. + You are an agent on an MxN grid and your goal is to reach the terminal + state at the top left or the bottom right corner. + + For example, a 4x4 grid looks as follows: + + T o o o + o x o o + o o o o + o o o T + + x is your position and T are the two terminal states. + + You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3). + Actions going off the edge leave you in your current state. + You receive a reward of -1 at each step until you reach a terminal state. + """ + + metadata = {'render.modes': ['human', 'ansi']} + + def __init__(self, shape=[4,4]): + if not isinstance(shape, (list, tuple)) or not len(shape) == 2: + raise ValueError('shape argument must be a list/tuple of length 2') + + self.shape = shape + + nS = np.prod(shape) + n_actions = 4 + + MAX_Y = shape[0] + MAX_X = shape[1] + + P = {} + grid = np.arange(nS).reshape(shape) + it = np.nditer(grid, flags=['multi_index']) + + while not it.finished: + s = it.iterindex + y, x = it.multi_index + + # P[s][a] = (prob, next_state, reward, is_done) + P[s] = {a : [] for a in range(n_actions)} + + is_done = lambda s: s == 0 or s == (nS - 1) + reward = 0.0 if is_done(s) else -1.0 + + # We're stuck in a terminal state + if is_done(s): + P[s][UP] = [(1.0, s, reward, True)] + P[s][RIGHT] = [(1.0, s, reward, True)] + P[s][DOWN] = [(1.0, s, reward, True)] + P[s][LEFT] = [(1.0, s, reward, True)] + # Not a terminal state + else: + ns_up = s if y == 0 else s - MAX_X + ns_right = s if x == (MAX_X - 1) else s + 1 + ns_down = s if y == (MAX_Y - 1) else s + MAX_X + ns_left = s if x == 0 else s - 1 + P[s][UP] = [(1.0, ns_up, reward, is_done(ns_up))] + P[s][RIGHT] = [(1.0, ns_right, reward, is_done(ns_right))] + P[s][DOWN] = [(1.0, ns_down, reward, is_done(ns_down))] + P[s][LEFT] = [(1.0, ns_left, reward, is_done(ns_left))] + + it.iternext() + + # Initial state distribution is uniform + isd = np.ones(nS) / nS + + # We expose the model of the environment for educational purposes + # This should not be used in any model-free learning algorithm + self.P = P + + super(GridworldEnv, self).__init__(nS, n_actions, P, isd) + + def _render(self, mode='human', close=False): + """ Renders the current gridworld layout + + For example, a 4x4 grid with the mode="human" looks like: + T o o o + o x o o + o o o o + o o o T + where x is your position and T are the two terminal states. 
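These toy-text environments expose their full model as `P[s][a] = [(prob, next_state, reward, done), ...]` (the gridworld code below notes it is exposed for educational purposes), which is enough for dynamic programming. A short iterative-policy-evaluation sketch for the uniform random policy on `GridworldEnv`; the import path is an assumption.

```python
import numpy as np
from gridworld import GridworldEnv   # assumes codes/envs is on the import path

env = GridworldEnv()                  # 4x4 grid, terminal states in two corners
V = np.zeros(env.nS)
gamma, theta = 1.0, 1e-6
while True:                           # iterative policy evaluation, uniform random policy
    delta = 0.0
    for s in range(env.nS):
        v = 0.0
        for a in range(env.nA):
            for prob, next_s, reward, done in env.P[s][a]:
                v += (1.0 / env.nA) * prob * (reward + gamma * V[next_s])
        delta = max(delta, abs(v - V[s]))
        V[s] = v
    if delta < theta:
        break
print(V.reshape(env.shape))           # roughly 0, -14, -20, -22, ... as in Sutton & Barto ch. 4
```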
+ """ + if close: + return + + outfile = io.StringIO() if mode == 'ansi' else sys.stdout + + grid = np.arange(self.nS).reshape(self.shape) + it = np.nditer(grid, flags=['multi_index']) + while not it.finished: + s = it.iterindex + y, x = it.multi_index + + if self.s == s: + output = " x " + elif s == 0 or s == self.nS - 1: + output = " T " + else: + output = " o " + + if x == 0: + output = output.lstrip() + if x == self.shape[1] - 1: + output = output.rstrip() + + outfile.write(output) + + if x == self.shape[1] - 1: + outfile.write("\n") + + it.iternext() diff --git a/codes/envs/gridworld_env.py b/codes/envs/gridworld_env.py new file mode 100644 index 0000000..31d968f --- /dev/null +++ b/codes/envs/gridworld_env.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, 
x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + # env.render() # 渲染一帧图像 \ No newline at end of file diff --git a/codes/envs/racetrack_env.py b/codes/envs/racetrack_env.py new file mode 100644 index 0000000..d6684f5 --- /dev/null +++ b/codes/envs/racetrack_env.py @@ -0,0 +1,260 @@ +# Please do not make changes to this file - it will be overwritten with a clean +# version when your work is marked. +# +# This file contains code for the racetrack environment that you will be using +# as part of the second part of the CM50270: Reinforcement Learning coursework. + +import time +import random +import numpy as np +import os +import matplotlib.pyplot as plt +import matplotlib.patheffects as pe +from IPython.display import clear_output + +from matplotlib import colors + +class RacetrackEnv(object) : + """ + Class representing a race-track environment inspired by exercise 5.12 in Sutton & Barto 2018 (p.111). + Please do not make changes to this class - it will be overwritten with a clean version when it comes to marking. 
+ + The dynamics of this environment are detailed in this coursework exercise's jupyter notebook, although I have + included rather verbose comments here for those of you who are interested in how the environment has been + implemented (though this should not impact your solution code). + + If you find any *bugs* with this code, please let me know immediately - thank you for finding them, sorry that I didn't! + However, please do not suggest optimisations - some things have been purposely simplified for readability's sake. + """ + + + ACTIONS_DICT = { + 0 : (1, -1), # Acc Vert., Brake Horiz. + 1 : (1, 0), # Acc Vert., Hold Horiz. + 2 : (1, 1), # Acc Vert., Acc Horiz. + 3 : (0, -1), # Hold Vert., Brake Horiz. + 4 : (0, 0), # Hold Vert., Hold Horiz. + 5 : (0, 1), # Hold Vert., Acc Horiz. + 6 : (-1, -1), # Brake Vert., Brake Horiz. + 7 : (-1, 0), # Brake Vert., Hold Horiz. + 8 : (-1, 1) # Brake Vert., Acc Horiz. + } + + + CELL_TYPES_DICT = { + 0 : "track", + 1 : "wall", + 2 : "start", + 3 : "goal" + } + + + def __init__(self) : + # Load racetrack map from file. + self.track = np.flip(np.loadtxt(os.path.dirname(__file__)+"/track.txt", dtype = int), axis = 0) + + + # Discover start grid squares. + self.initial_states = [] + for y in range(self.track.shape[0]) : + for x in range(self.track.shape[1]) : + if (self.CELL_TYPES_DICT[self.track[y, x]] == "start") : + self.initial_states.append((y, x)) + + + self.is_reset = False + + #print("Racetrack Environment File Loaded Successfully.") + #print("Be sure to call .reset() before starting to initialise the environment and get an initial state!") + + + def step(self, action : int) : + """ + Takes a given action in the environment's current state, and returns a next state, + reward, and whether the next state is terminal or not. + + Arguments: + action {int} -- The action to take in the environment's current state. Should be an integer in the range [0-8]. + + Raises: + RuntimeError: Raised when the environment needs resetting.\n + TypeError: Raised when an action of an invalid type is given.\n + ValueError: Raised when an action outside the range [0-8] is given.\n + + Returns: + A tuple of:\n + {(int, int, int, int)} -- The next state, a tuple of (y_pos, x_pos, y_velocity, x_velocity).\n + {int} -- The reward earned by taking the given action in the current environment state.\n + {bool} -- Whether the environment's next state is terminal or not.\n + + """ + + # Check whether a reset is needed. + if (not self.is_reset) : + raise RuntimeError(".step() has been called when .reset() is needed.\n" + + "You need to call .reset() before using .step() for the first time, and after an episode ends.\n" + + ".reset() initialises the environment at the start of an episode, then returns an initial state.") + + # Check that action is the correct type (either a python integer or a numpy integer). + if (not (isinstance(action, int) or isinstance(action, np.integer))) : + raise TypeError("action should be an integer.\n" + + "action value {} of type {} was supplied.".format(action, type(action))) + + # Check that action is an allowed value. + if (action < 0 or action > 8) : + raise ValueError("action must be an integer in the range [0-8] corresponding to one of the legal actions.\n" + + "action value {} was supplied.".format(action)) + + + # Update Velocity. + # With probability, 0.85 update velocity components as intended. + if (np.random.uniform() < 0.8) : + (d_y, d_x) = self.ACTIONS_DICT[action] + # With probability, 0.15 Do not change velocity components. 
+ else : + (d_y, d_x) = (0, 0) + + self.velocity = (self.velocity[0] + d_y, self.velocity[1] + d_x) + + # Keep velocity within bounds (-10, 10). + if (self.velocity[0] > 10) : + self.velocity[0] = 10 + elif (self.velocity[0] < -10) : + self.velocity[0] = -10 + if (self.velocity[1] > 10) : + self.velocity[1] = 10 + elif (self.velocity[1] < -10) : + self.velocity[1] = -10 + + # Update Position. + new_position = (self.position[0] + self.velocity[0], self.position[1] + self.velocity[1]) + + reward = 0 + terminal = False + + # If position is out-of-bounds, return to start and set velocity components to zero. + if (new_position[0] < 0 or new_position[1] < 0 or new_position[0] >= self.track.shape[0] or new_position[1] >= self.track.shape[1]) : + self.position = random.choice(self.initial_states) + self.velocity = (0, 0) + reward -= 10 + # If position is in a wall grid-square, return to start and set velocity components to zero. + elif (self.CELL_TYPES_DICT[self.track[new_position]] == "wall") : + self.position = random.choice(self.initial_states) + self.velocity = (0, 0) + reward -= 10 + # If position is in a track grid-squre or a start-square, update position. + elif (self.CELL_TYPES_DICT[self.track[new_position]] in ["track", "start"]) : + self.position = new_position + # If position is in a goal grid-square, end episode. + elif (self.CELL_TYPES_DICT[self.track[new_position]] == "goal") : + self.position = new_position + reward += 10 + terminal = True + # If this gets reached, then the student has touched something they shouldn't have. Naughty! + else : + raise RuntimeError("You've met with a terrible fate, haven't you?\nDon't modify things you shouldn't!") + + # Penalise every timestep. + reward -= 1 + + # Require a reset if the current state is terminal. + if (terminal) : + self.is_reset = False + + # Return next state, reward, and whether the episode has ended. + return (self.position[0], self.position[1], self.velocity[0], self.velocity[1]), reward, terminal + + + def reset(self) : + """ + Resets the environment, ready for a new episode to begin, then returns an initial state. + The initial state will be a starting grid square randomly chosen using a uniform distribution, + with both components of the velocity being zero. + + Returns: + {(int, int, int, int)} -- an initial state, a tuple of (y_pos, x_pos, y_velocity, x_velocity). + """ + + # Pick random starting grid-square. + self.position = random.choice(self.initial_states) + + # Set both velocity components to zero. + self.velocity = (0, 0) + + self.is_reset = True + + return (self.position[0], self.position[1], self.velocity[0], self.velocity[1]) + + + def render(self, sleep_time : float = 0.1) : + """ + Renders a pretty matplotlib plot representing the current state of the environment. + Calling this method on subsequent timesteps will update the plot. + This is VERY VERY SLOW and wil slow down training a lot. Only use for debugging/testing. + + Arguments: + sleep_time {float} -- How many seconds (or partial seconds) you want to wait on this rendered frame. + + """ + # Turn interactive mode on. + plt.ion() + fig = plt.figure(num = "env_render") + ax = plt.gca() + ax.clear() + clear_output(wait = True) + + # Prepare the environment plot and mark the car's position. + env_plot = np.copy(self.track) + env_plot[self.position] = 4 + env_plot = np.flip(env_plot, axis = 0) + + # Plot the gridworld. 
+ cmap = colors.ListedColormap(["white", "black", "green", "red", "yellow"]) + bounds = list(range(6)) + norm = colors.BoundaryNorm(bounds, cmap.N) + ax.imshow(env_plot, cmap = cmap, norm = norm, zorder = 0) + + # Plot the velocity. + if (not self.velocity == (0, 0)) : + ax.arrow(self.position[1], self.track.shape[0] - 1 - self.position[0], self.velocity[1], -self.velocity[0], + path_effects=[pe.Stroke(linewidth=1, foreground='black')], color = "yellow", width = 0.1, length_includes_head = True, zorder = 2) + + # Set up axes. + ax.grid(which = 'major', axis = 'both', linestyle = '-', color = 'k', linewidth = 2, zorder = 1) + ax.set_xticks(np.arange(-0.5, self.track.shape[1] , 1)); + ax.set_xticklabels([]) + ax.set_yticks(np.arange(-0.5, self.track.shape[0], 1)); + ax.set_yticklabels([]) + + # Draw everything. + #fig.canvas.draw() + #fig.canvas.flush_events() + + plt.show() + + # Sleep if desired. + if (sleep_time > 0) : + time.sleep(sleep_time) + + + def get_actions(self) : + """ + Returns the available actions in the current state - will always be a list + of integers in the range [0-8]. + """ + return [*self.ACTIONS_DICT] + +# num_steps = 1000000 + +# env = RacetrackEnv() +# state = env.reset() +# print(state) + +# for _ in range(num_steps) : + +# next_state, reward, terminal = env.step(random.choice(env.get_actions())) +# print(next_state) +# env.render() + +# if (terminal) : +# _ = env.reset() diff --git a/codes/envs/track.txt b/codes/envs/track.txt new file mode 100644 index 0000000..4bbe230 --- /dev/null +++ b/codes/envs/track.txt @@ -0,0 +1,15 @@ +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 1 +1 1 1 1 1 1 0 0 0 0 0 0 0 3 3 3 3 3 1 +1 1 1 1 1 0 0 0 0 0 0 0 0 3 3 3 3 3 1 +1 1 1 1 0 0 0 0 0 0 0 0 0 3 3 3 3 3 1 +1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 +1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 +1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 +1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \ No newline at end of file diff --git a/codes/envs/windy_gridworld.py b/codes/envs/windy_gridworld.py new file mode 100644 index 0000000..2a9d4a4 --- /dev/null +++ b/codes/envs/windy_gridworld.py @@ -0,0 +1,82 @@ +import gym +import numpy as np +import sys +from gym.envs.toy_text import discrete + +UP = 0 +RIGHT = 1 +DOWN = 2 +LEFT = 3 + +class WindyGridworldEnv(discrete.DiscreteEnv): + + metadata = {'render.modes': ['human', 'ansi']} + + def _limit_coordinates(self, coord): + coord[0] = min(coord[0], self.shape[0] - 1) + coord[0] = max(coord[0], 0) + coord[1] = min(coord[1], self.shape[1] - 1) + coord[1] = max(coord[1], 0) + return coord + + def _calculate_transition_prob(self, current, delta, winds): + new_position = np.array(current) + np.array(delta) + np.array([-1, 0]) * winds[tuple(current)] + new_position = self._limit_coordinates(new_position).astype(int) + new_state = np.ravel_multi_index(tuple(new_position), self.shape) + is_done = tuple(new_position) == (3, 7) + return [(1.0, new_state, -1.0, is_done)] + + def __init__(self): + self.shape = (7, 10) + + nS = np.prod(self.shape) + n_actions = 4 + + # Wind strength + winds = np.zeros(self.shape) + winds[:,[3,4,5,8]] = 1 + winds[:,[6,7]] = 2 + + # Calculate transition probabilities + P = {} + for s in range(nS): + position = np.unravel_index(s, self.shape) + P[s] = { a : [] for a in range(n_actions) } + 
P[s][UP] = self._calculate_transition_prob(position, [-1, 0], winds) + P[s][RIGHT] = self._calculate_transition_prob(position, [0, 1], winds) + P[s][DOWN] = self._calculate_transition_prob(position, [1, 0], winds) + P[s][LEFT] = self._calculate_transition_prob(position, [0, -1], winds) + + # We always start in state (3, 0) + isd = np.zeros(nS) + isd[np.ravel_multi_index((3,0), self.shape)] = 1.0 + + super(WindyGridworldEnv, self).__init__(nS, n_actions, P, isd) + + def render(self, mode='human', close=False): + self._render(mode, close) + + def _render(self, mode='human', close=False): + if close: + return + + outfile = StringIO() if mode == 'ansi' else sys.stdout + + for s in range(self.nS): + position = np.unravel_index(s, self.shape) + # print(self.s) + if self.s == s: + output = " x " + elif position == (3,7): + output = " T " + else: + output = " o " + + if position[1] == 0: + output = output.lstrip() + if position[1] == self.shape[1] - 1: + output = output.rstrip() + output += "\n" + + outfile.write(output) + outfile.write("\n") diff --git a/codes/snake/README.md b/codes/snake/README.md index bd9019c..b49b4e8 100644 --- a/codes/snake/README.md +++ b/codes/snake/README.md @@ -1,8 +1,8 @@ -# 使用 Q-learning 实现贪吃蛇 +# 贪吃蛇 贪吃蛇是一个起源于1976年的街机游戏 Blockade,玩家控制蛇上下左右吃到食物并将身体增长,吃到食物后移动速度逐渐加快,直到碰到墙体或者蛇的身体算游戏结束。 -![image-20200901202636603](assets/image-20200901202636603.png) +![image-20200901202636603](img/image-20200901202636603.png) 如图,本次任务整个游戏版面大小为560X560,绿色部分就是我们的智能体贪吃蛇,红色方块就是食物,墙位于四周,一旦食物被吃掉,会在下一个随机位置刷出新的食物。蛇的每一节以及食物的大小为40X40,除开墙体(厚度也为40),蛇可以活动的范围为480X480,也就是12X12的栅格。环境的状态等信息如下: @@ -34,8 +34,5 @@ * reward:如果吃到食物给一个+1的reward,如果蛇没了就-1,其他情况给-0.1的reward -## 任务要求 -设计一个Q-learning agent用于学习snake游戏,并绘制reward以及滑动平均后的reward随episode的变化曲线图并记录超参数写成报告。 -[参考代码](https://github.com/datawhalechina/leedeeprl-notes/tree/master/codes/snake) \ No newline at end of file diff --git a/codes/snake/agent.py b/codes/snake/agent.py index 1c05b64..e514dc3 100644 --- a/codes/snake/agent.py +++ b/codes/snake/agent.py @@ -64,7 +64,7 @@ class Agent: return adjoining_wall_x, adjoining_wall_y, food_dir_x, food_dir_y, adjoining_body_top, adjoining_body_bottom, adjoining_body_left, adjoining_body_right - def update_tables(self, _state, points, dead): + def update(self, _state, points, dead): if self.s: maxq = max(self.Q[_state]) reward = self.R(points,dead) @@ -72,7 +72,7 @@ class Agent: self.Q[self.s][self.a] += alpha * (reward + self.gamma * maxq - self.Q[self.s][self.a]) self.N[self.s][self.a] += 1.0 - def act(self, state, points, dead): + def choose_action(self, state, points, dead): ''' :param state: a list of [snake_head_x, snake_head_y, snake_body, food_x, food_y] from environment. 
:param points: float, the current points from environment @@ -88,7 +88,7 @@ class Agent: Qs = self.Q[_state][:] if self._train: - self.update_tables(_state, points, dead) + self.update(_state, points, dead) if dead: self.reset() return diff --git a/codes/snake/assets/image-20200901202636603.png b/codes/snake/assets/image-20200901202636603.png deleted file mode 100644 index 3f7dff6..0000000 Binary files a/codes/snake/assets/image-20200901202636603.png and /dev/null differ diff --git a/codes/snake/checkpoint3.npy b/codes/snake/checkpoint3.npy index 20085c9..8737b4c 100644 Binary files a/codes/snake/checkpoint3.npy and b/codes/snake/checkpoint3.npy differ diff --git a/codes/snake/example_assignment_and_report2.pdf b/codes/snake/example_assignment_and_report2.pdf new file mode 100644 index 0000000..84008c0 Binary files /dev/null and b/codes/snake/example_assignment_and_report2.pdf differ diff --git a/codes/snake/main.py b/codes/snake/main.py index c407491..3edb094 100644 --- a/codes/snake/main.py +++ b/codes/snake/main.py @@ -7,135 +7,10 @@ from snake_env import SnakeEnv import utils import time -class Application: - def __init__(self, args): - self.args = args - self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y) - self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma) - - def execute(self): - if not self.args.human: - if self.args.train_eps != 0: - self.train() - self.test() - self.show_games() - - def train(self): - print("Train Phase:") - self.agent.train() - window = self.args.window - self.points_results = [] - first_eat = True - start = time.time() - - for game in range(1, self.args.train_eps + 1): - state = self.env.get_state() - dead = False - action = self.agent.act(state, 0, dead) - while not dead: - state, points, dead = self.env.step(action) - - # For debug convenience, you can check if your Q-table mathches ours for given setting of parameters - # (see Debug Convenience part on homework 4 web page) - if first_eat and points == 1: - self.agent.save_model(utils.CHECKPOINT) - first_eat = False - - action = self.agent.act(state, points, dead) - - - points = self.env.get_points() - self.points_results.append(points) - if game % self.args.window == 0: - print( - "Games:", len(self.points_results) - window, "-", len(self.points_results), - "Points (Average:", sum(self.points_results[-window:])/window, - "Max:", max(self.points_results[-window:]), - "Min:", min(self.points_results[-window:]),")", - ) - self.env.reset() - print("Training takes", time.time() - start, "seconds") - self.agent.save_model(self.args.model_name) - - def test(self): - print("Test Phase:") - self.agent.eval() - self.agent.load_model(self.args.model_name) - points_results = [] - start = time.time() - - for game in range(1, self.args.test_eps + 1): - state = self.env.get_state() - dead = False - action = self.agent.act(state, 0, dead) - while not dead: - state, points, dead = self.env.step(action) - action = self.agent.act(state, points, dead) - points = self.env.get_points() - points_results.append(points) - self.env.reset() - - print("Testing takes", time.time() - start, "seconds") - print("Number of Games:", len(points_results)) - print("Average Points:", sum(points_results)/len(points_results)) - print("Max Points:", max(points_results)) - print("Min Points:", min(points_results)) - - def show_games(self): - print("Display Games") - self.env.display() - pygame.event.pump() - self.agent.eval() - points_results = [] - end = False - for game in range(1, 
self.args.show_eps + 1): - state = self.env.get_state() - dead = False - action = self.agent.act(state, 0, dead) - count = 0 - while not dead: - count +=1 - pygame.event.pump() - keys = pygame.key.get_pressed() - if keys[K_ESCAPE] or self.check_quit(): - end = True - break - state, points, dead = self.env.step(action) - # Qlearning agent - if not self.args.human: - action = self.agent.act(state, points, dead) - # for human player - else: - for event in pygame.event.get(): - if event.type == pygame.KEYDOWN: - if event.key == pygame.K_UP: - action = 2 - elif event.key == pygame.K_DOWN: - action = 3 - elif event.key == pygame.K_LEFT: - action = 1 - elif event.key == pygame.K_RIGHT: - action = 0 - if end: - break - self.env.reset() - points_results.append(points) - print("Game:", str(game)+"/"+str(self.args.show_eps), "Points:", points) - if len(points_results) == 0: - return - print("Average Points:", sum(points_results)/len(points_results)) - - def check_quit(self): - for event in pygame.event.get(): - if event.type == pygame.QUIT: - return True - return False - - -def main(): +def get_args(): parser = argparse.ArgumentParser(description='CS440 MP4 Snake') - parser.add_argument('--human', default = False, action="store_true", + parser.add_argument('--human', default = True, action="store_true", help='making the game human playable - default False') parser.add_argument('--model_name', dest="model_name", type=str, default="checkpoint3.npy", @@ -173,10 +48,137 @@ def main(): parser.add_argument('--food_y', dest="food_y", type=int, default=80, help='initialized y position of food - default 80') + cfg = parser.parse_args() + return cfg +class Application: + def __init__(self, args): + self.args = args + self.env = SnakeEnv(args.snake_head_x, args.snake_head_y, args.food_x, args.food_y) + self.agent = Agent(self.env.get_actions(), args.Ne, args.C, args.gamma) + + def execute(self): + if not self.args.human: + if self.args.train_eps != 0: + self.train() + self.eval() + self.show_games() - args = parser.parse_args() - app = Application(args) + def train(self): + print("Train Phase:") + self.agent.train() + window = self.args.window + self.points_results = [] + first_eat = True + start = time.time() + + for game in range(1, self.args.train_eps + 1): + state = self.env.get_state() + dead = False + action = self.agent.choose_action(state, 0, dead) + while not dead: + state, points, dead = self.env.step(action) + + # For debug convenience, you can check if your Q-table mathches ours for given setting of parameters + # (see Debug Convenience part on homework 4 web page) + if first_eat and points == 1: + self.agent.save_model(utils.CHECKPOINT) + first_eat = False + + action = self.agent.choose_action(state, points, dead) + + + points = self.env.get_points() + self.points_results.append(points) + if game % self.args.window == 0: + print( + "Games:", len(self.points_results) - window, "-", len(self.points_results), + "Points (Average:", sum(self.points_results[-window:])/window, + "Max:", max(self.points_results[-window:]), + "Min:", min(self.points_results[-window:]),")", + ) + self.env.reset() + print("Training takes", time.time() - start, "seconds") + self.agent.save_model(self.args.model_name) + + def eval(self): + print("Evaling Phase:") + self.agent.eval() + self.agent.load_model(self.args.model_name) + points_results = [] + start = time.time() + + for game in range(1, self.args.test_eps + 1): + state = self.env.get_state() + dead = False + action = self.agent.choose_action(state, 0, dead) + while not 
dead: + state, points, dead = self.env.step(action) + action = self.agent.choose_action(state, points, dead) + points = self.env.get_points() + points_results.append(points) + self.env.reset() + + print("Testing takes", time.time() - start, "seconds") + print("Number of Games:", len(points_results)) + print("Average Points:", sum(points_results)/len(points_results)) + print("Max Points:", max(points_results)) + print("Min Points:", min(points_results)) + + def show_games(self): + print("Display Games") + self.env.display() + pygame.event.pump() + self.agent.eval() + points_results = [] + end = False + for game in range(1, self.args.show_eps + 1): + state = self.env.get_state() + dead = False + action = self.agent.choose_action(state, 0, dead) + count = 0 + while not dead: + count +=1 + pygame.event.pump() + keys = pygame.key.get_pressed() + if keys[K_ESCAPE] or self.check_quit(): + end = True + break + state, points, dead = self.env.step(action) + # Qlearning agent + if not self.args.human: + action = self.agent.choose_action(state, points, dead) + # for human player + else: + for event in pygame.event.get(): + if event.type == pygame.KEYDOWN: + if event.key == pygame.K_UP: + action = 2 + elif event.key == pygame.K_DOWN: + action = 3 + elif event.key == pygame.K_LEFT: + action = 1 + elif event.key == pygame.K_RIGHT: + action = 0 + if end: + break + self.env.reset() + points_results.append(points) + print("Game:", str(game)+"/"+str(self.args.show_eps), "Points:", points) + if len(points_results) == 0: + return + print("Average Points:", sum(points_results)/len(points_results)) + + def check_quit(self): + for event in pygame.event.get(): + if event.type == pygame.QUIT: + return True + return False + + +def main(): + cfg = get_args() + app = Application(cfg) app.execute() if __name__ == "__main__": diff --git a/codes/snake/snake_env.py b/codes/snake/snake_env.py index d70051e..a4afe0a 100644 --- a/codes/snake/snake_env.py +++ b/codes/snake/snake_env.py @@ -23,6 +23,7 @@ class SnakeEnv: state, points, dead = self.game.step(action) if self.render: self.draw(state, points, dead) + # return state, reward, done return state, points, dead def draw(self, state, points, dead): @@ -99,22 +100,16 @@ class SnakeEnv: self.render = True class Snake: - ''' 定义贪吃蛇的类 - ''' def __init__(self, snake_head_x, snake_head_y, food_x, food_y): - # 初始化蛇头的位置 - self.init_snake_head_x, self.init_snake_head_y = snake_head_x, snake_head_y - # 初始化食物的位置 - self.init_food_x, self.init_food_y = food_x, food_y + self.init_snake_head_x,self.init_snake_head_y = snake_head_x,snake_head_y # 蛇头初始位置 + self.init_food_x, self.init_food_y = food_x, food_y # 食物初始位置 self.reset() def reset(self): self.points = 0 - self.snake_head_x = self.init_snake_head_x - self.snake_head_y = self.init_snake_head_y - self.snake_body = [] - self.food_x = self.init_food_x - self.food_y = self.init_food_y + self.snake_head_x, self.snake_head_y = self.init_snake_head_x, self.init_snake_head_y + self.food_x, self.food_y = self.init_food_x, self.init_food_y + self.snake_body = [] # 蛇身的位置集合 def get_points(self): return self.points @@ -132,8 +127,10 @@ class Snake: ] def move(self, action): + '''根据action指令移动蛇头,并返回是否撞死 + ''' delta_x = delta_y = 0 - if action == 0: + if action == 0: # 上 delta_x = utils.GRID_SIZE elif action == 1: delta_x = - utils.GRID_SIZE @@ -141,33 +138,31 @@ class Snake: delta_y = - utils.GRID_SIZE elif action == 3: delta_y = utils.GRID_SIZE - old_body_head = None if len(self.snake_body) == 1: old_body_head = self.snake_body[0] + 
self.snake_body.append((self.snake_head_x, self.snake_head_y)) self.snake_head_x += delta_x self.snake_head_y += delta_y - if len(self.snake_body) > self.points: + if len(self.snake_body) > self.points: # 说明没有吃到食物 del(self.snake_body[0]) self.handle_eatfood() - # colliding with the snake body or going backwards while its body length - # greater than 1 + # 蛇长大于1时,蛇头与蛇身任一位置重叠则看作蛇与自身相撞 if len(self.snake_body) >= 1: for seg in self.snake_body: if self.snake_head_x == seg[0] and self.snake_head_y == seg[1]: return True - # moving towards body direction, not allowing snake to go backwards while - # its body length is 1 + # 蛇长为1时,如果蛇头与之前的位置重复则看作蛇与自身相撞 if len(self.snake_body) == 1: if old_body_head == (self.snake_head_x, self.snake_head_y): return True - # collide with the wall + # 蛇头是否撞墙 if (self.snake_head_x < utils.GRID_SIZE or self.snake_head_y < utils.GRID_SIZE or self.snake_head_x + utils.GRID_SIZE > utils.DISPLAY_SIZE-utils.GRID_SIZE or self.snake_head_y + utils.GRID_SIZE > utils.DISPLAY_SIZE-utils.GRID_SIZE): return True @@ -183,15 +178,16 @@ class Snake: self.random_food() self.points += 1 - def random_food(self): + '''生成随机位置的食物 + ''' max_x = (utils.DISPLAY_SIZE - utils.WALL_SIZE - utils.GRID_SIZE) max_y = (utils.DISPLAY_SIZE - utils.WALL_SIZE - utils.GRID_SIZE) self.food_x = random.randint(utils.WALL_SIZE, max_x)//utils.GRID_SIZE * utils.GRID_SIZE self.food_y = random.randint(utils.WALL_SIZE, max_y)//utils.GRID_SIZE * utils.GRID_SIZE - while self.check_food_on_snake(): + while self.check_food_on_snake(): # 食物不能生成在蛇身上 self.food_x = random.randint(utils.WALL_SIZE, max_x)//utils.GRID_SIZE * utils.GRID_SIZE self.food_y = random.randint(utils.WALL_SIZE, max_y)//utils.GRID_SIZE * utils.GRID_SIZE
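For reference, the renamed `update()` in `codes/snake/agent.py` above is a tabular Q-learning backup, Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)), driven by the snake rewards described in the README (+1 for eating food, -1 for dying, -0.1 otherwise). A minimal standalone sketch of that backup; the dictionary-of-arrays table, the fixed learning rate, and the terminal-state handling here are simplifications, not the repo's exact scheme.

```python
from collections import defaultdict
import numpy as np

n_actions = 4                                   # snake actions: 0 right, 1 left, 2 up, 3 down (see main.py key mapping)
Q = defaultdict(lambda: np.zeros(n_actions))    # assumed Q-table layout, keyed by the discretised state tuple

def q_learning_update(s, a, reward, s_next, done, alpha=0.1, gamma=0.9):
    """One tabular Q-learning backup (cf. Agent.update() above)."""
    target = reward if done else reward + gamma * np.max(Q[s_next])
    Q[s][a] += alpha * (target - Q[s][a])

# e.g. the snake just ate food (+1 reward) after taking action 0 in state s:
q_learning_update(s=("s",), a=0, reward=1.0, s_next=("s_next",), done=False)
```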