diff --git a/codes/GAE/task0_train.py b/codes/GAE/task0_train.py
new file mode 100644
index 0000000..961816c
--- /dev/null
+++ b/codes/GAE/task0_train.py
@@ -0,0 +1,167 @@
+import math
+import random
+
+import gym
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import torch.nn.functional as F
+from torch.distributions import Normal
+import matplotlib.pyplot as plt
+import seaborn as sns
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
+parent_path = os.path.dirname(curr_path) # 父路径
+sys.path.append(parent_path) # 添加父路径到系统路径sys.path
+
+use_cuda = torch.cuda.is_available()
+device = torch.device("cuda" if use_cuda else "cpu")
+
+from common.multiprocessing_env import SubprocVecEnv
+
+num_envs = 16
+env_name = "Pendulum-v0"
+
+def make_env():
+    def _thunk():
+        env = gym.make(env_name)
+        return env
+
+    return _thunk
+
+envs = [make_env() for i in range(num_envs)]
+envs = SubprocVecEnv(envs)
+
+env = gym.make(env_name)
+
+def init_weights(m):
+    if isinstance(m, nn.Linear):
+        nn.init.normal_(m.weight, mean=0., std=0.1)
+        nn.init.constant_(m.bias, 0.1)
+
+class ActorCritic(nn.Module):
+    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
+        super(ActorCritic, self).__init__()
+
+        self.critic = nn.Sequential(
+            nn.Linear(num_inputs, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, 1)
+        )
+
+        self.actor = nn.Sequential(
+            nn.Linear(num_inputs, hidden_size),
+            nn.ReLU(),
+            nn.Linear(hidden_size, num_outputs),
+        )
+        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
+
+        self.apply(init_weights)
+
+    def forward(self, x):
+        value = self.critic(x)
+        mu = self.actor(x)
+        std = self.log_std.exp().expand_as(mu)
+        dist = Normal(mu, std)
+        return dist, value
+
+
+def plot(frame_idx, rewards):
+    plt.figure(figsize=(20,5))
+    plt.subplot(131)
+    plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
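Reviewer note: the `compute_gae` function added in this file implements the GAE(gamma, lambda) recursion A_t = delta_t + gamma*lambda*m_t*A_{t+1} with delta_t = r_t + gamma*V(s_{t+1})*m_t - V(s_t) (the code names lambda `tau`). A minimal standalone sketch with made-up numbers, handy for sanity-checking the recursion outside the training loop:

```python
# Toy check of the GAE recursion used in compute_gae (all numbers below are made up).
gamma, tau = 0.99, 0.95
rewards = [1.0, 0.5, 2.0]   # r_t
masks   = [1.0, 1.0, 0.0]   # 0 where the episode terminated
values  = [0.8, 0.9, 1.1]   # critic estimates V(s_t)
next_value = 0.0            # bootstrap value V(s_T)

vals = values + [next_value]
gae, advantages = 0.0, []
for t in reversed(range(len(rewards))):
    delta = rewards[t] + gamma * vals[t + 1] * masks[t] - vals[t]  # TD residual
    gae = delta + gamma * tau * masks[t] * gae                     # GAE recursion
    advantages.insert(0, gae)

returns = [a + v for a, v in zip(advantages, values)]  # critic targets, as in the script
print(advantages, returns)
```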
+    plt.plot(rewards)
+    plt.show()
+
+def test_env(vis=False):
+    state = env.reset()
+    if vis: env.render()
+    done = False
+    total_reward = 0
+    while not done:
+        state = torch.FloatTensor(state).unsqueeze(0).to(device)
+        dist, _ = model(state)
+        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
+        state = next_state
+        if vis: env.render()
+        total_reward += reward
+    return total_reward
+
+def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
+    values = values + [next_value]
+    gae = 0
+    returns = []
+    for step in reversed(range(len(rewards))):
+        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
+        gae = delta + gamma * tau * masks[step] * gae
+        returns.insert(0, gae + values[step])
+    return returns
+
+num_inputs = envs.observation_space.shape[0]
+num_outputs = envs.action_space.shape[0]
+
+#Hyper params:
+hidden_size = 256
+lr = 3e-2
+num_steps = 20
+
+model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
+optimizer = optim.Adam(model.parameters())
+
+max_frames = 100000
+frame_idx = 0
+test_rewards = []
+
+state = envs.reset()
+
+while frame_idx < max_frames:
+
+    log_probs = []
+    values = []
+    rewards = []
+    masks = []
+    entropy = 0
+
+    for _ in range(num_steps):
+        state = torch.FloatTensor(state).to(device)
+        dist, value = model(state)
+
+        action = dist.sample()
+        next_state, reward, done, _ = envs.step(action.cpu().numpy())
+
+        log_prob = dist.log_prob(action)
+        entropy += dist.entropy().mean()
+
+        log_probs.append(log_prob)
+        values.append(value)
+        rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
+        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
+
+        state = next_state
+        frame_idx += 1
+
+        if frame_idx % 1000 == 0:
+            test_rewards.append(np.mean([test_env() for _ in range(10)]))
+            print(test_rewards[-1])
+            # plot(frame_idx, test_rewards)
+
+    next_state = torch.FloatTensor(next_state).to(device)
+    _, next_value = model(next_state)
+    returns = compute_gae(next_value, rewards, masks, values)
+
+    log_probs = torch.cat(log_probs)
+    returns = torch.cat(returns).detach()
+    values = torch.cat(values)
+
+    advantage = returns - values
+
+    actor_loss = -(log_probs * advantage.detach()).mean()
+    critic_loss = advantage.pow(2).mean()
+
+    loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
+
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
diff --git a/codes/PPO/agent.py b/codes/PPO/agent.py
index c0bfd0c..28b2861 100644
--- a/codes/PPO/agent.py
+++ b/codes/PPO/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-23 15:17:42
 LastEditor: John
-LastEditTime: 2021-04-28 10:11:09
+LastEditTime: 2021-09-26 22:02:00
 Discription: 
 Environment: 
 '''
@@ -41,10 +41,8 @@ class PPO:
 
     def update(self):
         for _ in range(self.n_epochs):
-            state_arr, action_arr, old_prob_arr, vals_arr,\
-                reward_arr, dones_arr, batches = \
-                    self.memory.sample()
-            values = vals_arr
+            state_arr, action_arr, old_prob_arr, vals_arr,reward_arr, dones_arr, batches = self.memory.sample()
+            values = vals_arr[:]
             ### compute advantage ###
             advantage = np.zeros(len(reward_arr), dtype=np.float32)
             for t in range(len(reward_arr)-1):
diff --git a/codes/PPO/memory.py b/codes/PPO/memory.py
index 605fe03..c47fbc8 100644
--- a/codes/PPO/memory.py
+++ b/codes/PPO/memory.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-23 15:30:46
 LastEditor: John
-LastEditTime: 2021-03-23 15:30:55
+LastEditTime: 2021-09-26 22:00:07
 Discription: 
 Environment: 
 '''
@@ -24,14 +24,9 @@ class PPOMemory:
         indices = np.arange(len(self.states), dtype=np.int64)
         np.random.shuffle(indices)
         batches = [indices[i:i+self.batch_size] for i in batch_step]
-        return np.array(self.states),\
-                np.array(self.actions),\
-                np.array(self.probs),\
-                np.array(self.vals),\
-                np.array(self.rewards),\
-                np.array(self.dones),\
-                batches
-
+        return np.array(self.states),np.array(self.actions),np.array(self.probs),\
+            np.array(self.vals),np.array(self.rewards),np.array(self.dones),batches
+
     def push(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
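Reviewer note: the condensed `return` in `PPOMemory.sample()` keeps the same batching scheme as before: shuffle all transition indices once, then slice them into `batch_size`-sized chunks. `batch_step` itself is not visible in this hunk, so its `np.arange` construction below is an assumption; the sizes are placeholders for illustration:

```python
import numpy as np

# Standalone illustration of the mini-batch index generation in PPOMemory.sample().
# n_transitions and batch_size are placeholder values, not taken from the repo config.
n_transitions, batch_size = 10, 4
batch_step = np.arange(0, n_transitions, batch_size)        # assumed: start index of each batch -> [0, 4, 8]
indices = np.arange(n_transitions, dtype=np.int64)
np.random.shuffle(indices)                                  # shuffle once per update
batches = [indices[i:i + batch_size] for i in batch_step]   # shuffled index slices, e.g. sizes 4, 4, 2
print(batches)
```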
diff --git a/codes/PPO/task0_train.py b/codes/PPO/task0_train.py
index a4600e4..ccca805 100644
--- a/codes/PPO/task0_train.py
+++ b/codes/PPO/task0_train.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-22 16:18:10
 LastEditor: John
-LastEditTime: 2021-05-06 00:43:36
+LastEditTime: 2021-09-26 22:05:00
 Discription: 
 Environment: 
 '''
@@ -17,6 +17,7 @@ sys.path.append(parent_path) # add current terminal path to sys.path
 import gym
 import torch
 import datetime
+import tqdm
 from PPO.agent import PPO
 from common.plot import plot_rewards
 from common.utils import save_results,make_dir
@@ -51,7 +52,7 @@ def env_agent_config(cfg,seed=1):
     return env,agent
 
 def train(cfg,env,agent):
-    print('Start to train !')
+    print('开始训练!')
     print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}')
     rewards= []
     ma_rewards = [] # moving average rewards
@@ -75,7 +76,7 @@ def train(cfg,env,agent):
                 0.9*ma_rewards[-1]+0.1*ep_reward)
             else:
                 ma_rewards.append(ep_reward)
-        print(f"Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.3f}")
+        print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.2f}")
     print('Complete training!')
     return rewards,ma_rewards
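Reviewer note: the context around the reworded episode log shows the moving-average reward update `0.9*ma_rewards[-1]+0.1*ep_reward`. For reference, a tiny standalone version of that smoothing (the episode rewards below are invented):

```python
# Exponential moving average of episode rewards, mirroring the update in train() above.
def smooth(ma_rewards, ep_reward):
    if ma_rewards:
        ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
    else:
        ma_rewards.append(ep_reward)

ma_rewards = []
for ep_reward in [10.0, 12.0, 8.0]:   # invented episode rewards
    smooth(ma_rewards, ep_reward)
print(ma_rewards)                     # approximately [10.0, 10.2, 9.98]
```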
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl
new file mode 100644
index 0000000..45dce51
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/models/Qleaning_model.pkl differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy
new file mode 100644
index 0000000..3a8bde0
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_ma_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy
new file mode 100644
index 0000000..36de6fc
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png
new file mode 100644
index 0000000..3226b8a
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211343/results/train_rewards_curve_cn.png differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl
new file mode 100644
index 0000000..5c46ec6
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/models/Qleaning_model.pkl differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy
new file mode 100644
index 0000000..1d6b889
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_ma_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy
new file mode 100644
index 0000000..6e6ccf0
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png
new file mode 100644
index 0000000..e1cd04e
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211442/results/train_rewards_curve_cn.png differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl
new file mode 100644
index 0000000..6986805
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/models/Qleaning_model.pkl differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy
new file mode 100644
index 0000000..e6793df
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_ma_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy
new file mode 100644
index 0000000..e6793df
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards.npy differ
diff --git a/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png
new file mode 100644
index 0000000..9c98cc9
Binary files /dev/null and b/codes/QLearning/outputs/CliffWalking-v0/20210919-211456/results/train_rewards_curve_cn.png differ
diff --git a/codes/QLearning/task0_train.py b/codes/QLearning/task0_train.py
index 7eba2de..6e616ab 100644
--- a/codes/QLearning/task0_train.py
+++ b/codes/QLearning/task0_train.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-09-20 00:32:59
+LastEditTime: 2021-09-23 12:22:58
 Discription: 
 Environment: 
 '''
@@ -34,7 +34,7 @@ class QlearningConfig:
         self.train_eps = 400 # 训练的回合数
         self.eval_eps = 30 # 测试的回合数
         self.gamma = 0.9 # reward的衰减率
-        self.epsilon_start = 0.99 # e-greedy策略中初始epsilon
+        self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
         self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
         self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
         self.lr = 0.1 # 学习率
@@ -53,14 +53,15 @@ def env_agent_config(cfg,seed=1):
 def train(cfg,env,agent):
     print('开始训练!')
     print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
-    rewards = []
-    ma_rewards = [] # 滑动平均奖励
+    rewards = [] # 记录奖励
+    ma_rewards = [] # 记录滑动平均奖励
     for i_ep in range(cfg.train_eps):
         ep_reward = 0 # 记录每个回合的奖励
         state = env.reset() # 重置环境,即开始新的回合
         while True:
             action = agent.choose_action(state) # 根据算法选择一个动作
             next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互
+            print(reward)
             agent.update(state, action, reward, next_state, done) # Q学习算法更新
             state = next_state # 更新状态
             ep_reward += reward
@@ -78,6 +79,8 @@ def train(cfg,env,agent):
 def eval(cfg,env,agent):
     print('开始测试!')
     print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
+    for item in agent.Q_table.items():
+        print(item)
     rewards = [] # 记录所有回合的奖励
     ma_rewards = [] # 滑动平均的奖励
     for i_ep in range(cfg.eval_eps):
@@ -86,7 +89,7 @@ def eval(cfg,env,agent):
         while True:
             action = agent.predict(state) # 根据算法选择一个动作
             next_state, reward, done, _ = env.step(action) # 与环境进行一个交互
-            state = next_state # 存储上一个观察值
+            state = next_state # 更新状态
             ep_reward += reward
             if done:
                 break
@@ -103,10 +106,12 @@ if __name__ == "__main__":
     cfg = QlearningConfig()
     # 训练
-    env,agent = env_agent_config(cfg,seed=1)
+    env,agent = env_agent_config(cfg,seed=0)
     rewards,ma_rewards = train(cfg,env,agent)
     make_dir(cfg.result_path,cfg.model_path) # 创建文件夹
     agent.save(path=cfg.model_path) # 保存模型
+    for item in agent.Q_table.items():
+        print(item)
     save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) # 保存结果
     plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
@@ -114,6 +119,7 @@ if __name__ == "__main__":
     env,agent = env_agent_config(cfg,seed=10)
     agent.load(path=cfg.model_path) # 加载模型
     rewards,ma_rewards = eval(cfg,env,agent)
+    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
     plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
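Reviewer note on the `epsilon_start` change (0.99 to 0.95): together with `epsilon_end` and `epsilon_decay`, this field typically drives an exponential epsilon-greedy schedule. The actual formula lives in the agent class, which this diff does not touch, so the sketch below is an assumption for illustration only:

```python
import math

# Assumed exponential epsilon decay driven by the QlearningConfig fields above;
# the agent's real schedule is defined elsewhere and may differ.
epsilon_start, epsilon_end, epsilon_decay = 0.95, 0.01, 300

def epsilon(sample_count):
    return epsilon_end + (epsilon_start - epsilon_end) * math.exp(-sample_count / epsilon_decay)

print(epsilon(0), epsilon(300), epsilon(3000))  # roughly 0.95, 0.36, 0.01
```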
diff --git a/codes/common/plot.py b/codes/common/plot.py
index 4f0f0b9..6707ff8 100644
--- a/codes/common/plot.py
+++ b/codes/common/plot.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-10-07 20:57:11
 LastEditor: John
-LastEditTime: 2021-09-19 23:00:36
+LastEditTime: 2021-09-23 12:23:01
 Discription: 
 Environment: 
 '''
diff --git a/docs/chapter3/chapter3_questions&keywords.md b/docs/chapter3/chapter3_questions&keywords.md
index 5ced147..a1d0638 100644
--- a/docs/chapter3/chapter3_questions&keywords.md
+++ b/docs/chapter3/chapter3_questions&keywords.md
@@ -77,7 +77,7 @@
 
   答:
 
-  1. 生成policy上的差异:前者随机,后者确定。Value-Base中的 action-value估计值最终会收敛到对应的true values(通常是不同的有限数,可以转化为0到1之间的概率),因此通常会获得一个确定的策略(deterministic policy);而Policy-Based不会收敛到一个确定性的值,另外他们会趋向于生成optimal stochastic policy。如果optimal policy是deterministic的,那么optimal action对应的性能函数将远大于suboptimal actions对应的性能函数,性能函数的大小代表了概率的大小。
+  1. 生成policy上的差异:前者确定,后者随机。Value-Base中的 action-value估计值最终会收敛到对应的true values(通常是不同的有限数,可以转化为0到1之间的概率),因此通常会获得一个确定的策略(deterministic policy);而Policy-Based不会收敛到一个确定性的值,另外他们会趋向于生成optimal stochastic policy。如果optimal policy是deterministic的,那么optimal action对应的性能函数将远大于suboptimal actions对应的性能函数,性能函数的大小代表了概率的大小。
   2. 动作空间是否连续,前者离散,后者连续。Value-Base,对于连续动作空间问题,虽然可以将动作空间离散化处理,但离散间距的选取不易确定。过大的离散间距会导致算法取不到最优action,会在这附近徘徊,过小的离散间距会使得action的维度增大,会和高维度动作空间一样导致维度灾难,影响算法的速度;而Policy-Based适用于连续的动作空间,在连续的动作空间中,可以不用计算每个动作的概率,而是通过Gaussian distribution (正态分布)选择action。
   3. value-based,例如Q-learning,是通过求解最优值函数间接的求解最优策略;policy-based,例如REINFORCE,Monte-Carlo Policy Gradient,等方法直接将策略参数化,通过策略搜索,策略梯度或者进化方法来更新策略的参数以最大化回报。基于值函数的方法不易扩展到连续动作空间,并且当同时采用非线性近似、自举和离策略时会有收敛性问题。策略梯度具有良好的收敛性证明。
   4. 补充:对于值迭代和策略迭代:策略迭代。它有两个循环,一个是在策略估计的时候,为了求当前策略的值函数需要迭代很多次。另外一个是外面的大循环,就是策略评估,策略提升这个循环。值迭代算法则是一步到位,直接估计最优值函数,因此没有策略提升环节。
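Reviewer note: item 2 of the answer shown in context above points out that, in a continuous action space, a policy-based method can sample actions from a Gaussian distribution instead of enumerating action probabilities; this is what the `Normal(mu, std)` policy in codes/GAE/task0_train.py does. A two-line illustration (the mean and standard deviation here are arbitrary, not from any trained model):

```python
import torch
from torch.distributions import Normal

mu = torch.tensor([0.3])    # mean action a policy network might output (arbitrary here)
std = torch.tensor([0.5])   # standard deviation (arbitrary here)
dist = Normal(mu, std)
action = dist.sample()                  # continuous action sampled from the Gaussian policy
print(action, dist.log_prob(action))    # log-probability used in the policy-gradient loss
```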