diff --git a/codes/DDPG/agent.py b/codes/DDPG/agent.py index 0a8fd30..528872e 100644 --- a/codes/DDPG/agent.py +++ b/codes/DDPG/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-09 20:25:52 @LastEditor: John -LastEditTime: 2021-05-04 14:50:17 +LastEditTime: 2021-09-16 00:55:30 @Discription: @Environment: python 3.7.7 ''' @@ -26,7 +26,7 @@ class DDPG: self.target_critic = Critic(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) self.target_actor = Actor(state_dim, action_dim, cfg.hidden_dim).to(cfg.device) - # copy parameters to target net + # 复制参数到目标网络 for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_(param.data) for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()): @@ -37,7 +37,7 @@ class DDPG: self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr) self.memory = ReplayBuffer(cfg.memory_capacity) self.batch_size = cfg.batch_size - self.soft_tau = cfg.soft_tau + self.soft_tau = cfg.soft_tau # 软更新参数 self.gamma = cfg.gamma def choose_action(self, state): @@ -46,11 +46,11 @@ class DDPG: return action.detach().cpu().numpy()[0, 0] def update(self): - if len(self.memory) < self.batch_size: + if len(self.memory) < self.batch_size: # 当 memory 中不满足一个批量时,不更新策略 return - state, action, reward, next_state, done = self.memory.sample( - self.batch_size) - # convert variables to Tensor + # 从经验回放中(replay memory)中随机采样一个批量的转移(transition) + state, action, reward, next_state, done = self.memory.sample(self.batch_size) + # 转变为张量 state = torch.FloatTensor(state).to(self.device) next_state = torch.FloatTensor(next_state).to(self.device) action = torch.FloatTensor(action).to(self.device) @@ -70,10 +70,10 @@ class DDPG: self.actor_optimizer.zero_grad() policy_loss.backward() self.actor_optimizer.step() - self.critic_optimizer.zero_grad() value_loss.backward() self.critic_optimizer.step() + # 软更新 for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()): target_param.data.copy_( target_param.data * (1.0 - self.soft_tau) + diff --git a/codes/DDPG/env.py b/codes/DDPG/env.py index 85ca81c..99da3c5 100644 --- a/codes/DDPG/env.py +++ b/codes/DDPG/env.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-10 15:28:30 @LastEditor: John -LastEditTime: 2021-03-19 19:56:46 +LastEditTime: 2021-09-16 00:52:30 @Discription: @Environment: python 3.7.7 ''' @@ -32,12 +32,12 @@ class NormalizedActions(gym.ActionWrapper): return action class OUNoise(object): - '''Ornstein–Uhlenbeck + '''Ornstein–Uhlenbeck噪声 ''' def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000): - self.mu = mu - self.theta = theta - self.sigma = max_sigma + self.mu = mu # OU噪声的参数 + self.theta = theta # OU噪声的参数 + self.sigma = max_sigma # OU噪声的参数 self.max_sigma = max_sigma self.min_sigma = min_sigma self.decay_period = decay_period @@ -45,17 +45,14 @@ class OUNoise(object): self.low = action_space.low self.high = action_space.high self.reset() - def reset(self): self.obs = np.ones(self.action_dim) * self.mu - def evolve_obs(self): x = self.obs dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim) self.obs = x + dx return self.obs - def get_action(self, action, t=0): ou_obs = self.evolve_obs() - self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) - return np.clip(action + ou_obs, self.low, self.high) \ No newline at end of file + self.sigma = self.max_sigma - 
(self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) # sigma会逐渐衰减 + return np.clip(action + ou_obs, self.low, self.high) # 动作加上噪声后进行剪切 \ No newline at end of file diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt new file mode 100644 index 0000000..2051294 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy new file mode 100644 index 0000000..936884c Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy new file mode 100644 index 0000000..4d497f4 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png new file mode 100644 index 0000000..a442aac Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy new file mode 100644 index 0000000..ab923ee Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy new file mode 100644 index 0000000..0374e2e Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png new file mode 100644 index 0000000..06f3dc8 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png differ diff --git a/codes/DDPG/task0_train.py b/codes/DDPG/task0_train.py index 50e2723..29437f4 100644 --- a/codes/DDPG/task0_train.py +++ b/codes/DDPG/task0_train.py @@ -5,14 +5,14 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-11 20:58:21 @LastEditor: John -LastEditTime: 2021-05-04 14:49:45 +LastEditTime: 2021-09-16 01:31:33 @Discription: @Environment: python 3.7.7 ''' import sys,os -curr_path = os.path.dirname(__file__) -parent_path = os.path.dirname(curr_path) -sys.path.append(parent_path) # add current terminal path to sys.path +curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 +parent_path = os.path.dirname(curr_path) # 父路径 +sys.path.append(parent_path) # 添加父路径到系统路径sys.path import datetime import gym @@ -21,49 +21,45 @@ import torch from DDPG.env import NormalizedActions, OUNoise from DDPG.agent import DDPG from common.utils import save_results,make_dir -from common.plot import plot_rewards - -curr_time = datetime.datetime.now().strftime( - "%Y%m%d-%H%M%S") # obtain current time +from common.plot import plot_rewards, plot_rewards_cn +curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") 
# 获取当前时间 class DDPGConfig: def __init__(self): - self.algo = 'DDPG' - self.env = 'Pendulum-v0' # env name + self.algo = 'DDPG' # 算法名称 + self.env = 'Pendulum-v0' # 环境名称 self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results + '/'+curr_time+'/results/' # 保存结果的路径 self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save results - self.gamma = 0.99 - self.critic_lr = 1e-3 - self.actor_lr = 1e-4 - self.memory_capacity = 10000 + '/'+curr_time+'/models/' # 保存模型的路径 + self.train_eps = 300 # 训练的回合数 + self.eval_eps = 50 # 测试的回合数 + self.gamma = 0.99 # 折扣因子 + self.critic_lr = 1e-3 # 评论家网络的学习率 + self.actor_lr = 1e-4 # 演员网络的学习率 + self.memory_capacity = 8000 self.batch_size = 128 - self.train_eps = 300 - self.eval_eps = 50 - self.eval_steps = 200 - self.target_update = 4 - self.hidden_dim = 30 - self.soft_tau = 1e-2 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") + self.target_update = 2 + self.hidden_dim = 256 + self.soft_tau = 1e-2 # 软更新参数 + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env)) - env.seed(seed) + env.seed(seed) # 随机种子 state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] agent = DDPG(state_dim,action_dim,cfg) return env,agent def train(cfg, env, agent): - print('Start to train ! ') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - ou_noise = OUNoise(env.action_space) # action noise - rewards = [] - ma_rewards = [] # moving average rewards - for i_episode in range(cfg.train_eps): + print('开始训练!') + print(f'环境:{cfg.env},算法:{cfg.algo},设备:{cfg.device}') + ou_noise = OUNoise(env.action_space) # 动作噪声 + rewards = [] # 记录奖励 + ma_rewards = [] # 记录滑动平均奖励 + for i_ep in range(cfg.train_eps): state = env.reset() ou_noise.reset() done = False @@ -72,29 +68,29 @@ def train(cfg, env, agent): while not done: i_step += 1 action = agent.choose_action(state) - action = ou_noise.get_action( - action, i_step) # 即paper中的random process + action = ou_noise.get_action(action, i_step) next_state, reward, done, _ = env.step(action) ep_reward += reward agent.memory.push(state, action, reward, next_state, done) agent.update() state = next_state - print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward)) + if (i_ep+1)%10 == 0: + print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('Complete training!') + print('完成训练!') return rewards, ma_rewards def eval(cfg, env, agent): - print('Start to Eval ! 
') - print(f'Env:{cfg.env}, Algorithm:{cfg.algo}, Device:{cfg.device}') - rewards = [] - ma_rewards = [] # moving average rewards - for i_episode in range(cfg.eval_eps): - state = env.reset() + print('开始测试!') + print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录奖励 + ma_rewards = [] # 记录滑动平均奖励 + for i_ep in range(cfg.eval_eps): + state = env.reset() done = False ep_reward = 0 i_step = 0 @@ -104,32 +100,29 @@ def eval(cfg, env, agent): next_state, reward, done, _ = env.step(action) ep_reward += reward state = next_state - print('Episode:{}/{}, Reward:{}'.format(i_episode+1, cfg.train_eps, ep_reward)) + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('Complete Eval!') + print('完成测试!') return rewards, ma_rewards if __name__ == "__main__": cfg = DDPGConfig() - - # train + # 训练 env,agent = env_agent_config(cfg,seed=1) rewards, ma_rewards = train(cfg, env, agent) make_dir(cfg.result_path, cfg.model_path) agent.save(path=cfg.model_path) save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) - plot_rewards(rewards, ma_rewards, tag="train", - algo=cfg.algo, path=cfg.result_path) - - # eval + plot_rewards_cn(rewards, ma_rewards, tag="train", env = cfg.env, algo=cfg.algo, path=cfg.result_path) + # 测试 env,agent = env_agent_config(cfg,seed=10) agent.load(path=cfg.model_path) rewards,ma_rewards = eval(cfg,env,agent) - save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path) - plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path) + save_results(rewards,ma_rewards,tag = 'eval',path = cfg.result_path) + plot_rewards_cn(rewards,ma_rewards,tag = "eval",env = cfg.env,algo = cfg.algo,path=cfg.result_path) diff --git a/codes/DQN/agent.py b/codes/DQN/agent.py index e0cb31c..66c11d7 100644 --- a/codes/DQN/agent.py +++ b/codes/DQN/agent.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2021-09-15 02:18:56 +LastEditTime: 2021-09-15 13:35:36 @Discription: @Environment: python 3.7.7 ''' @@ -50,7 +50,7 @@ class DQN: with torch.no_grad(): state = torch.tensor([state], device=self.device, dtype=torch.float32) q_values = self.policy_net(state) - action = q_values.max(1)[1].item() + action = q_values.max(1)[1].item() # 选择Q值最大的动作 else: action = random.randrange(self.action_dim) return action @@ -61,45 +61,33 @@ class DQN: action = q_values.max(1)[1].item() return action def update(self): - - if len(self.memory) < self.batch_size: + if len(self.memory) < self.batch_size: # 当memory中不满足一个批量时,不更新策略 return - # 从memory中随机采样transition + # 从经验回放中(replay memory)中随机采样一个批量的转移(transition) state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample( self.batch_size) - '''转为张量 - 例如tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])''' + # 转为张量 state_batch = torch.tensor( state_batch, device=self.device, dtype=torch.float) action_batch = torch.tensor(action_batch, device=self.device).unsqueeze( - 1) # 例如tensor([[1],...,[0]]) + 1) reward_batch = torch.tensor( - reward_batch, device=self.device, dtype=torch.float) # tensor([1., 1.,...,1]) + reward_batch, device=self.device, dtype=torch.float) next_state_batch = torch.tensor( next_state_batch, device=self.device, dtype=torch.float) done_batch = torch.tensor(np.float32( done_batch), device=self.device) - - 
'''计算当前(s_t,a)对应的Q(s_t, a)''' - '''torch.gather:对于a=torch.Tensor([[1,2],[3,4]]),那么a.gather(1,torch.Tensor([[0],[1]]))=torch.Tensor([[1],[3]])''' - q_values = self.policy_net(state_batch).gather( - dim=1, index=action_batch) # 等价于self.forward - # 计算所有next states的V(s_{t+1}),即通过target_net中选取reward最大的对应states - next_q_values = self.target_net(next_state_batch).max( - 1)[0].detach() # 比如tensor([ 0.0060, -0.0171,...,]) - # 计算 expected_q_value - # 对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward - expected_q_values = reward_batch + \ - self.gamma * next_q_values * (1-done_batch) - # self.loss = F.smooth_l1_loss(q_values,expected_q_values.unsqueeze(1)) # 计算 Huber loss - loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算 均方误差loss - # 优化模型 - self.optimizer.zero_grad() # zero_grad清除上一步所有旧的gradients from the last step - # loss.backward()使用backpropagation计算loss相对于所有parameters(需要gradients)的微分 + q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch) # 计算当前状态(s_t,a)对应的Q(s_t, a) + next_q_values = self.target_net(next_state_batch).max(1)[0].detach() # 计算下一时刻的状态(s_t_,a)对应的Q值 + # 计算期望的Q值,对于终止状态,此时done_batch[0]=1, 对应的expected_q_value等于reward + expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch) + loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1)) # 计算均方根损失 + # 优化更新模型 + self.optimizer.zero_grad() loss.backward() - # for param in self.policy_net.parameters(): # clip防止梯度爆炸 - # param.grad.data.clamp_(-1, 1) - self.optimizer.step() # 更新模型 + for param in self.policy_net.parameters(): # clip防止梯度爆炸 + param.grad.data.clamp_(-1, 1) + self.optimizer.step() def save(self, path): torch.save(self.target_net.state_dict(), path+'dqn_checkpoint.pth') diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth new file mode 100644 index 0000000..ecfc662 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/models/dqn_checkpoint.pth differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_ma_rewards.npy new file mode 100644 index 0000000..343fcc6 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards.npy new file mode 100644 index 0000000..343fcc6 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png new file mode 100644 index 0000000..1f55598 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/eval_rewards_curve_cn.png differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy new file mode 100644 index 0000000..65ead2d Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy new file mode 100644 index 0000000..5f8371f Binary files /dev/null and 
b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png new file mode 100644 index 0000000..617f693 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20210915-145623/results/train_rewards_curve_cn.png differ diff --git a/codes/DQN/task0_train.py b/codes/DQN/task0_train.py index 0d723b4..70b5b69 100644 --- a/codes/DQN/task0_train.py +++ b/codes/DQN/task0_train.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:48:57 @LastEditor: John -LastEditTime: 2021-09-15 02:19:54 +LastEditTime: 2021-09-15 15:34:13 @Discription: @Environment: python 3.7.7 ''' @@ -19,7 +19,7 @@ import torch import datetime from common.utils import save_results, make_dir -from common.plot import plot_rewards +from common.plot import plot_rewards,plot_rewards_cn from DQN.agent import DQN curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 @@ -29,21 +29,21 @@ class DQNConfig: self.algo = "DQN" # 算法名称 self.env = 'CartPole-v0' # 环境名称 self.result_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/results/' # path to save results + '/'+curr_time+'/results/' # 保存结果的路径 self.model_path = curr_path+"/outputs/" + self.env + \ - '/'+curr_time+'/models/' # path to save models + '/'+curr_time+'/models/' # 保存模型的路径 self.train_eps = 200 # 训练的回合数 self.eval_eps = 30 # 测试的回合数 - self.gamma = 0.95 + self.gamma = 0.95 # 强化学习中的折扣因子 self.epsilon_start = 0.90 # e-greedy策略中初始epsilon self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # capacity of Replay Memory - self.batch_size = 64 + self.memory_capacity = 100000 # 经验回放的容量 + self.batch_size = 64 # mini-batch SGD中的批量大小 self.target_update = 4 # 目标网络的更新频率 self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # jian che + "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU self.hidden_dim = 256 # hidden size of net def env_agent_config(cfg,seed=1): @@ -55,10 +55,10 @@ def env_agent_config(cfg,seed=1): return env,agent def train(cfg, env, agent): - print('Start to train !') - print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}') - rewards = [] - ma_rewards = [] # moveing average reward + print('开始训练!') + print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') + rewards = [] # 记录奖励 + ma_rewards = [] # 记录滑动平均奖励 for i_ep in range(cfg.train_eps): state = env.reset() done = False @@ -75,19 +75,19 @@ def train(cfg, env, agent): if (i_ep+1) % cfg.target_update == 0: agent.target_net.load_state_dict(agent.policy_net.state_dict()) if (i_ep+1)%10 == 0: - print('Episode:{}/{}, Reward:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) + print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) # save ma_rewards if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('Complete training!') + print('完成训练!') return rewards, ma_rewards def eval(cfg,env,agent): - print('Start to eval !') - print(f'Env: {cfg.env}, Algorithm: {cfg.algo}, Device: {cfg.device}') + print('开始测试!') + print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}') rewards = [] ma_rewards = [] # moving average rewards for i_ep in range(cfg.eval_eps): @@ -105,24 +105,23 @@ def eval(cfg,env,agent): ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) else: 
             ma_rewards.append(ep_reward)
-        print(f"Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}")
-    print('Complete evaling!')
+        print(f"回合:{i_ep+1}/{cfg.eval_eps}, 奖励:{ep_reward:.1f}")
+    print('完成测试!')
     return rewards,ma_rewards
 
 if __name__ == "__main__":
     cfg = DQNConfig()
-
-    # train
+    # 训练
     env,agent = env_agent_config(cfg,seed=1)
     rewards, ma_rewards = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)
     agent.save(path=cfg.model_path)
     save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
-    plot_rewards(rewards, ma_rewards, tag="train",
+    plot_rewards_cn(rewards, ma_rewards, tag="train",
                  algo=cfg.algo, path=cfg.result_path)
-    # eval
+    # 测试
    env,agent = env_agent_config(cfg,seed=10)
    agent.load(path=cfg.model_path)
    rewards,ma_rewards = eval(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
-    plot_rewards(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
+    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
+    plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
diff --git a/codes/Docs/assets/eval_rewards_curve_cn-1689282.png b/codes/Docs/assets/eval_rewards_curve_cn-1689282.png
new file mode 100644
index 0000000..1f55598
Binary files /dev/null and b/codes/Docs/assets/eval_rewards_curve_cn-1689282.png differ
diff --git a/codes/Docs/assets/eval_rewards_curve_cn-1760950.png b/codes/Docs/assets/eval_rewards_curve_cn-1760950.png
new file mode 100644
index 0000000..a442aac
Binary files /dev/null and b/codes/Docs/assets/eval_rewards_curve_cn-1760950.png differ
diff --git a/codes/Docs/assets/pendulum_1.png b/codes/Docs/assets/pendulum_1.png
new file mode 100644
index 0000000..5eec82e
Binary files /dev/null and b/codes/Docs/assets/pendulum_1.png differ
diff --git a/codes/Docs/assets/train_rewards_curve_cn-1689150.png b/codes/Docs/assets/train_rewards_curve_cn-1689150.png
new file mode 100644
index 0000000..617f693
Binary files /dev/null and b/codes/Docs/assets/train_rewards_curve_cn-1689150.png differ
diff --git a/codes/Docs/assets/train_rewards_curve_cn-1760758.png b/codes/Docs/assets/train_rewards_curve_cn-1760758.png
new file mode 100644
index 0000000..06f3dc8
Binary files /dev/null and b/codes/Docs/assets/train_rewards_curve_cn-1760758.png differ
diff --git a/codes/Docs/使用DDPG解决倒立摆问题.md b/codes/Docs/使用DDPG解决倒立摆问题.md
new file mode 100644
index 0000000..da815dc
--- /dev/null
+++ b/codes/Docs/使用DDPG解决倒立摆问题.md
@@ -0,0 +1,175 @@
+The environments in the previous projects all have discrete actions, but in practice many environments have continuous actions, for example the [Pendulum-v0](https://github.com/openai/gym/wiki/Pendulum-v0) environment in OpenAI Gym, which poses an inverted-pendulum problem. We first give a brief description of this environment.
+
+## Pendulum-v0 Overview
+
+If CartPole-v0 is the classic introductory environment for discrete actions, then Pendulum-v0 is its counterpart for continuous actions. As shown in the figure below, we apply a torque to swing the pendulum up and keep it upright.
+
+![pendulum_1](assets/pendulum_1.png)
+
+The environment has three state variables. Let $\theta \in [-\pi,\pi]$ be the clockwise angle of the pendulum measured from the upright position; the state is then $[\cos\theta,\sin\theta,\dot{\theta}]$, i.e. the angle and the angular velocity. The action is a torque between -2 and 2, a continuous quantity, so this environment cannot be solved with a discrete-action algorithm such as DQN. The reward follows from the physics of the system:
+$$
+-\left(\theta^{2}+0.1 * \dot{\theta}^{2}+0.001 * \text{action}^{2}\right)
+$$
+The lowest possible reward per step is $-\left(\pi^{2}+0.1 * 8^{2}+0.001 * 2^{2}\right)= -16.2736044$ and the highest is 0. As with CartPole-v0, an optimal policy could keep an episode going forever, so the maximum number of steps per episode is capped at 200 to make training practical.
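+
+Before moving on, a quick sanity check (not part of the project code) can confirm these numbers; the short sketch below only assumes that `gym` with the classic-control environments is installed:
+
+```python
+import math
+import gym
+
+env = gym.make('Pendulum-v0')
+print(env.observation_space)  # Box(3,): [cos(theta), sin(theta), theta_dot]
+print(env.action_space)       # Box(1,): a torque in [-2.0, 2.0]
+# worst-case per-step reward: theta = pi, |theta_dot| = 8, |action| = 2
+print(-(math.pi ** 2 + 0.1 * 8 ** 2 + 0.001 * 2 ** 2))  # about -16.2736
+```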
+
+## The Basic DDPG Interface
+
+We again use the idea of a common training interface and work from pseudocode to analyse and implement the DDPG training loop, as follows:
+
+> Initialize the critic network $Q\left(s, a \mid \theta^{Q}\right)$ and the actor network $\mu\left(s \mid \theta^{\mu}\right)$ with weights $\theta^{Q}$ and $\theta^{\mu}$
+>
+> Initialize the target networks $Q'$ and $\mu'$ by copying the weights: $\theta^{Q^{\prime}} \leftarrow \theta^{Q}, \theta^{\mu^{\prime}} \leftarrow \theta^{\mu}$
+>
+> Initialize the replay buffer $R$
+>
+> For each of $M$ episodes:
+>
+> * Initialize a random process (noise) $\mathcal{N}$ for action exploration
+>
+> * Initialize the state $s_1$
+>
+> For each timestep $t$ (up to $T$ steps):
+>
+> * Select an action from the current policy plus noise: $a_{t}=\mu\left(s_{t} \mid \theta^{\mu}\right)+\mathcal{N}_{t}$
+> * Execute $a_t$ and observe the reward $r_t$ and the next state $s_{t+1}$
+> * Store the transition $\left(s_{t}, a_{t}, r_{t}, s_{t+1}\right)$ in the replay buffer $R$
+> * (policy update) Sample a random minibatch of transitions from $R$
+> * (policy update) Compute the target Q value $y_{i}=r_{i}+\gamma Q^{\prime}\left(s_{i+1}, \mu^{\prime}\left(s_{i+1} \mid \theta^{\mu^{\prime}}\right) \mid \theta^{Q^{\prime}}\right)$
+> * (policy update) Update the critic by gradient descent on the loss $L=\frac{1}{N} \sum_{i}\left(y_{i}-Q\left(s_{i}, a_{i} \mid \theta^{Q}\right)\right)^{2}$
+> * (policy update) Update the actor with the sampled policy gradient: $\left.\left.\nabla_{\theta^{\mu}} J \approx \frac{1}{N} \sum_{i} \nabla_{a} Q\left(s, a \mid \theta^{Q}\right)\right|_{s=s_{i}, a=\mu\left(s_{i}\right)} \nabla_{\theta^{\mu}} \mu\left(s \mid \theta^{\mu}\right)\right|_{s_{i}}$
+> * (policy update) Soft-update the target networks: $\theta^{Q^{\prime}} \leftarrow \tau \theta^{Q}+(1-\tau) \theta^{Q^{\prime}}$, $\theta^{\mu^{\prime}} \leftarrow \tau \theta^{\mu}+(1-\tau) \theta^{\mu^{\prime}}$
+
+The corresponding training code is as follows:
+
+```python
+ou_noise = OUNoise(env.action_space)  # action noise
+rewards = []  # record episode rewards
+ma_rewards = []  # record moving-average rewards
+for i_ep in range(cfg.train_eps):
+    state = env.reset()
+    ou_noise.reset()
+    done = False
+    ep_reward = 0
+    i_step = 0
+    while not done:
+        i_step += 1
+        action = agent.choose_action(state)
+        action = ou_noise.get_action(action, i_step)  # add exploration noise
+        next_state, reward, done, _ = env.step(action)
+        ep_reward += reward
+        agent.memory.push(state, action, reward, next_state, done)
+        agent.update()
+        state = next_state
+    if (i_ep+1) % 10 == 0:
+        print('Episode:{}/{}, Reward:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+    rewards.append(ep_reward)
+    if ma_rewards:
+        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+    else:
+        ma_rewards.append(ep_reward)
+```
+
+Compared with DQN, DDPG mainly adds two things: noise applied to the actions, and the soft update of the target networks in the last step.
+
+## Ornstein-Uhlenbeck Noise
+
+OU noise suits systems with inertia, especially when the time discretization is fine. It is a stochastic process; skipping the derivation, its discretized update is:
+$$
+x(t+\Delta t)=x(t)-\theta(x(t)-\mu) \Delta t+\sigma W_t
+$$
+where $W_t$ is normally distributed. The implementation is as follows:
+
+```python
+class OUNoise(object):
+    '''Ornstein-Uhlenbeck noise
+    '''
+    def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
+        self.mu = mu  # OU noise parameter
+        self.theta = theta  # OU noise parameter
+        self.sigma = max_sigma  # OU noise parameter
+        self.max_sigma = max_sigma
+        self.min_sigma = min_sigma
+        self.decay_period = decay_period
+        self.action_dim = action_space.shape[0]
+        self.low = action_space.low
+        self.high = action_space.high
+        self.reset()
+    def reset(self):
+        self.obs = np.ones(self.action_dim) * self.mu
+    def evolve_obs(self):
+        x = self.obs
+        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
+        self.obs = x + dx
+        return self.obs
+    def get_action(self, action, t=0):
+        ou_obs = self.evolve_obs()
+        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)  # sigma decays over time
+        return np.clip(action + ou_obs, self.low, self.high)  # add the noise to the action, then clip
+```
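+
+To get a feel for the process, the following illustrative snippet (not part of the project code) adds OU noise to a fixed zero action for a few steps; the samples drift around `mu` and remain clipped to the action bounds:
+
+```python
+import gym
+import numpy as np
+
+env = gym.make('Pendulum-v0')
+ou_noise = OUNoise(env.action_space)  # the class defined above
+ou_noise.reset()
+zero_action = np.zeros(env.action_space.shape[0])
+for t in range(5):
+    print(t, ou_noise.get_action(zero_action, t))  # stays within [-2, 2]
+```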
+
+## The DDPG Algorithm
+
+The DDPG agent likewise consists of two main functions, choosing an action and updating the policy. First, action selection:
+
+```python
+def choose_action(self, state):
+    state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
+    action = self.actor(state)
+    return action.detach().cpu().numpy()[0, 0]
+```
+
+Since DDPG takes its action directly from the actor network, no $\epsilon$-greedy strategy is needed here. The update function also differs slightly from DQN and adds the soft update:
+
+```python
+def update(self):
+    if len(self.memory) < self.batch_size:  # do not update until the memory holds at least one batch
+        return
+    # sample a random minibatch of transitions from the replay memory
+    state, action, reward, next_state, done = self.memory.sample(self.batch_size)
+    # convert to tensors
+    state = torch.FloatTensor(state).to(self.device)
+    next_state = torch.FloatTensor(next_state).to(self.device)
+    action = torch.FloatTensor(action).to(self.device)
+    reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
+    done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
+
+    policy_loss = self.critic(state, self.actor(state))
+    policy_loss = -policy_loss.mean()
+    next_action = self.target_actor(next_state)
+    target_value = self.target_critic(next_state, next_action.detach())
+    expected_value = reward + (1.0 - done) * self.gamma * target_value
+    expected_value = torch.clamp(expected_value, -np.inf, np.inf)
+
+    value = self.critic(state, action)
+    value_loss = nn.MSELoss()(value, expected_value.detach())
+
+    self.actor_optimizer.zero_grad()
+    policy_loss.backward()
+    self.actor_optimizer.step()
+    self.critic_optimizer.zero_grad()
+    value_loss.backward()
+    self.critic_optimizer.step()
+    # soft update of the target networks
+    for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
+        target_param.data.copy_(
+            target_param.data * (1.0 - self.soft_tau) +
+            param.data * self.soft_tau
+        )
+    for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
+        target_param.data.copy_(
+            target_param.data * (1.0 - self.soft_tau) +
+            param.data * self.soft_tau
+        )
+```
+
+## Results
+
+With the algorithm implemented, let us first look at the training performance:
+
+![train_rewards_curve_cn](../../easy_rl_book/res/ch12/assets/train_rewards_curve_cn-1760758.png)
+
+Overall the algorithm converges, but the reward still fluctuates quite a bit at steady state, so there is room for improvement. This project only aims to give readers a basic working demonstration; interested readers are encouraged to tune the algorithm further. Now let us look at the evaluation results:
+
+![eval_rewards_curve_cn](../../easy_rl_book/res/ch12/assets/eval_rewards_curve_cn-1760950.png)
+
+The average evaluation reward is around -150, whereas the average steady-state reward during training is around -300; the difference comes from dropping the OU noise at evaluation time.
\ No newline at end of file
diff --git a/codes/Docs/使用DQN解决推车杆问题.md b/codes/Docs/使用DQN解决推车杆问题.md
index a20b05e..5889165 100644
--- a/codes/Docs/使用DQN解决推车杆问题.md
+++ b/codes/Docs/使用DQN解决推车杆问题.md
@@ -2,7 +2,7 @@
 Before working through this project, it helps to review the previous hands-on project, which solved the cliff-walking problem with Q-learning. This project implements the DQN algorithm to solve the cart-pole problem, using the [CartPole-v0](https://datawhalechina.github.io/easy-rl/#/chapter7/project2?id=cartpole-v0) environment from OpenAI Gym; again we first describe the environment briefly.
 
-## Introduction to the CartPole-v0 Environment
+## CartPole-v0 Overview
 
 CartPole-v0 is a classic introductory environment, shown in the figure below: the cart is pushed to the left (action = 0) or to the right (action = 1) to keep the pole upright. Every action that keeps the pole balanced yields a reward of +1; otherwise the pole falls over and the episode ends.
 
@@ -28,15 +28,64 @@ print(f"初始状态:{state}")
 初始状态:[ 0.03073904  0.00145001 -0.03088818 -0.03131252]
 ```
 
-The environment has four state variables: the cart position, the cart velocity, the pole angle, and the velocity at the tip of the pole; there are two discrete actions, push left or push right.
+The environment has four state variables: the cart position, the cart velocity, the pole angle, and the velocity at the tip of the pole; there are two discrete actions, push left or push right. In theory an optimal policy could balance the pole forever, i.e. an episode could run for an unbounded number of steps, which is inconvenient for training, so the environment caps each episode at 200 steps; ideally, training is considered done once the episode reward reaches 200.
 
 ## The Basic DQN Interface
 
 With the environment introduced, we again use the idea of a common interface and analyse pseudocode to implement the basic DQN training loop, together with its components, such as which networks and modules are needed. The commonly used DQN pseudocode is:
 
-![image-20210915020027615](assets/image-20210915020027615.png)
+> Initialize the replay memory $D$ with capacity $N$
+>
+> Initialize the action-value function, i.e. the $Q$ network, with random weights $\theta$
+>
+> Initialize the target action-value function, i.e. the $\hat{Q}$ network, with weights $\theta^-=\theta$
+>
+> For each of $M$ episodes:
+>
+> * Initialize the environment and obtain the initial state $s_1$
+> * For each timestep $t$ (up to $T$ steps):
+> * Select an action $a_t$ with an $\epsilon$-greedy strategy
+> * Execute $a_t$; the environment returns the reward $r_t$ and the next state $s_{t+1}$
+> * Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay memory $D$
+> * Update the current state: $s_t \leftarrow s_{t+1}$
+> * (policy update) Sample a random minibatch of transitions from $D$
+> * (policy update) Compute the target Q value $y_{j}=\left\{\begin{array}{cc}r_{j} & \text{if the episode terminates at step } j+1 \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(\phi_{j+1}, a^{\prime} ; \theta^{-}\right) & \text{otherwise}\end{array}\right.$
+> * (policy update) Perform gradient descent on the loss $\left(y_{j}-Q\left(\phi_{j}, a_{j} ; \theta\right)\right)^{2}$ with respect to the parameters $\theta$
+> * (policy update) Every $C$ steps reset $\hat{Q}=Q$
 
-Compared with the classic Q-learning algorithm, DQN uses a neural network instead of the Q table so that it can store much more information, and because a neural network is used we generally optimize the Q-value predictions with stochastic gradient descent. In addition there is a replay memory, and two networks are used, a target network and the current network.
+The code implementation looks like this:
+
+```python
+rewards = []  # record episode rewards
+ma_rewards = []  # record moving-average rewards
+for i_ep in range(cfg.train_eps):
+    state = env.reset()
+    done = False
+    ep_reward = 0
+    while True:
+        action = agent.choose_action(state)
+        next_state, reward, done, _ = env.step(action)
+        ep_reward += reward
+        agent.memory.push(state, action, reward, next_state, done)
+        state = next_state
+        agent.update()
+        if done:
+            break
+    if (i_ep+1) % cfg.target_update == 0:
+        agent.target_net.load_state_dict(agent.policy_net.state_dict())
+    if (i_ep+1) % 10 == 0:
+        print('Episode:{}/{}, Reward:{}'.format(i_ep+1, cfg.train_eps, ep_reward))
+    rewards.append(ep_reward)
+    # save ma_rewards
+    if ma_rewards:
+        ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward)
+    else:
+        ma_rewards.append(ep_reward)
+```
+
+As you can see, the DQN training loop follows the same pattern as most reinforcement learning algorithms. Compared with the classic Q-learning algorithm, however, DQN uses a neural network instead of the Q table so that it can store much more information; and because a neural network is used, we generally optimize the Q-value predictions with stochastic gradient descent. In addition, DQN adds a replay memory and uses two networks: a target network and the current policy network.
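+
+The pseudocode selects actions $\epsilon$-greedily. In this repository $\epsilon$ is decayed as more frames are sampled; the exact schedule is set in the agent's constructor, but a sketch of a typical exponential schedule, using the `DQNConfig` values (`epsilon_start=0.90`, `epsilon_end=0.01`, `epsilon_decay=500`), looks like this:
+
+```python
+import math
+
+epsilon_start, epsilon_end, epsilon_decay = 0.90, 0.01, 500
+# decay epsilon exponentially from epsilon_start towards epsilon_end
+epsilon = lambda frame_idx: epsilon_end + \
+    (epsilon_start - epsilon_end) * math.exp(-1. * frame_idx / epsilon_decay)
+for frame_idx in [0, 500, 2000, 10000]:
+    print(frame_idx, round(epsilon(frame_idx), 3))  # 0.9, 0.337, 0.026, 0.01
+```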
 
 ## The Experience Replay Buffer
@@ -62,5 +111,98 @@ class ReplayBuffer:
         batch = random.sample(self.buffer, batch_size) # randomly sample a minibatch of transitions
         state, action, reward, next_state, done = zip(*batch) # unzip into states, actions, etc.
         return state, action, reward, next_state, done
+    def __len__(self):
+        ''' return the current size of the buffer
+        '''
+        return len(self.buffer)
 ```
+
+## The Q Network
+
+In DQN we replace the Q table with a neural network, which can represent many more Q values and therefore more sophisticated policies for complex environments. Here we use a three-layer perceptron, i.e. a fully connected network:
+
+```python
+class MLP(nn.Module):
+    def __init__(self, input_dim, output_dim, hidden_dim=128):
+        """ Initialize the Q network as a fully connected network
+            input_dim: number of input features, i.e. the dimension of the state
+            output_dim: dimension of the output, i.e. the number of actions
+        """
+        super(MLP, self).__init__()
+        self.fc1 = nn.Linear(input_dim, hidden_dim)  # input layer
+        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # hidden layer
+        self.fc3 = nn.Linear(hidden_dim, output_dim)  # output layer
+
+    def forward(self, x):
+        # activation functions for each layer
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        return self.fc3(x)
+```
+
+Anyone who has studied deep learning will recognize this network. In reinforcement learning the input is usually the state and the output has one entry per action (its Q value); with two actions in total the output dimension here is 2, and the selected action is 0 or 1. ReLU is the usual choice of activation function. The architecture can be changed as needed; for example, with image inputs a convolutional neural network (CNN) could be used instead.
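+
+As a quick illustration (not part of the project code), the network can be instantiated for CartPole-v0, whose state has 4 dimensions and which has 2 actions; a greedy action is then the argmax over the predicted Q values. This assumes the `MLP` class above and its `torch` imports are available:
+
+```python
+import torch
+
+q_net = MLP(input_dim=4, output_dim=2)  # CartPole-v0: 4 state variables, 2 actions
+states = torch.randn(5, 4)              # a dummy batch of 5 states
+q_values = q_net(states)                # shape (5, 2): one Q value per action
+greedy_actions = q_values.max(1)[1]     # argmax over the action dimension
+print(q_values.shape, greedy_actions)
+```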
+
+## The DQN Algorithm
+
+As in the previous hands-on projects, the DQN agent consists of two main functions, choosing an action and updating the policy. First, action selection:
+
+```python
+def choose_action(self, state):
+    ''' choose an action
+    '''
+    self.frame_idx += 1
+    if random.random() > self.epsilon(self.frame_idx):
+        with torch.no_grad():
+            state = torch.tensor([state], device=self.device, dtype=torch.float32)
+            q_values = self.policy_net(state)
+            action = q_values.max(1)[1].item()  # pick the action with the largest Q value
+    else:
+        action = random.randrange(self.action_dim)
+    return action
+```
+
+This is the same $\epsilon$-greedy strategy as in the Q-learning algorithm; the only difference is that with a neural network we use PyTorch (or TensorFlow) to handle the corresponding data.
+
+The policy update of DQN is a little more involved and consists of three parts: random sampling, computing the target Q values, and gradient descent, as follows:
+
+```python
+def update(self):
+    if len(self.memory) < self.batch_size:  # do not update until the memory holds at least one batch
+        return
+    # sample a random minibatch of transitions from the replay memory
+    state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(
+        self.batch_size)
+    # convert to tensors
+    state_batch = torch.tensor(
+        state_batch, device=self.device, dtype=torch.float)
+    action_batch = torch.tensor(action_batch, device=self.device).unsqueeze(1)
+    reward_batch = torch.tensor(
+        reward_batch, device=self.device, dtype=torch.float)
+    next_state_batch = torch.tensor(
+        next_state_batch, device=self.device, dtype=torch.float)
+    done_batch = torch.tensor(np.float32(
+        done_batch), device=self.device)
+    q_values = self.policy_net(state_batch).gather(dim=1, index=action_batch)  # Q(s_t, a) for the actions actually taken
+    next_q_values = self.target_net(next_state_batch).max(1)[0].detach()  # max_a Q(s_{t+1}, a) from the target network
+    # compute the expected Q values; for terminal states done_batch=1, so the expected value equals the reward
+    expected_q_values = reward_batch + self.gamma * next_q_values * (1-done_batch)
+    loss = nn.MSELoss()(q_values, expected_q_values.unsqueeze(1))  # mean squared error loss
+    # optimize the model
+    self.optimizer.zero_grad()
+    loss.backward()
+    for param in self.policy_net.parameters():  # clip gradients to avoid gradient explosion
+        param.grad.data.clamp_(-1, 1)
+    self.optimizer.step()
+```
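+
+To see what the target computation does, here is a tiny worked example (illustrative only) with a batch of two transitions in which the second one is terminal; for the terminal transition the target reduces to the reward alone:
+
+```python
+import torch
+
+gamma = 0.95
+reward_batch = torch.tensor([1.0, 1.0])
+next_q_values = torch.tensor([10.0, 10.0])  # max_a Q(s_{t+1}, a) from the target net
+done_batch = torch.tensor([0.0, 1.0])       # the second transition ends the episode
+expected_q_values = reward_batch + gamma * next_q_values * (1 - done_batch)
+print(expected_q_values)  # tensor([10.5000, 1.0000])
+```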
+
+## Results
+
+With the code complete, let us first look at the training performance of DQN; the curve is shown below:
+
+![train_rewards_curve_cn](../../easy_rl_book/res/ch7/assets/train_rewards_curve_cn-1689150.png)
+
+The algorithm converges at around 60 episodes and then stays near the maximum reward of 200, with slight fluctuations because a small exploration rate (epsilon_end=0.01) is kept even after convergence. We can now load the saved model and look at the evaluation results:
+
+![eval_rewards_curve_cn](../../easy_rl_book/res/ch7/assets/eval_rewards_curve_cn-1689282.png)
+
+We evaluated the agent for 30 episodes and every episode stays around 200, which shows that the model has learned the task well!
\ No newline at end of file
diff --git a/codes/QLearning/agent.py b/codes/QLearning/agent.py
index 4dff2a9..a66128f 100644
--- a/codes/QLearning/agent.py
+++ b/codes/QLearning/agent.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-09-11 21:53:18
+LastEditTime: 2021-09-15 13:18:37
 Discription: use defaultdict to define Q table
 Environment: 
 '''
@@ -26,7 +26,6 @@ class QLearning(object):
         self.epsilon_end = cfg.epsilon_end
         self.epsilon_decay = cfg.epsilon_decay
         self.Q_table = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value)
-
     def choose_action(self, state):
         self.sample_count += 1
         self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
diff --git a/codes/QLearning/task0_train.py b/codes/QLearning/task0_train.py
index ebc4fa4..a9bc36d 100644
--- a/codes/QLearning/task0_train.py
+++ b/codes/QLearning/task0_train.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-09-12 01:29:40
+LastEditTime: 2021-09-15 14:44:25
 Discription: 
 Environment: 
 '''
@@ -57,11 +57,11 @@ def train(cfg,env,agent):
     ma_rewards = []  # 滑动平均奖励
     for i_ep in range(cfg.train_eps):
         ep_reward = 0  # 记录每个回合的奖励
-        state = env.reset()  # 重置环境, 重新开一局(即开始新的一个episode)
+        state = env.reset()  # 重置环境,即开始新的回合
         while True:
             action = agent.choose_action(state)  # 根据算法选择一个动作
             next_state, reward, done, _ = env.step(action)  # 与环境进行一次动作交互
-            agent.update(state, action, reward, next_state, done)  # Q-learning算法更新
+            agent.update(state, action, reward, next_state, done)  # Q学习算法更新
             state = next_state  # 更新状态
             ep_reward += reward
             if done:
diff --git a/codes/common/memory.py b/codes/common/memory.py
index d24061d..a238696 100644
--- a/codes/common/memory.py
+++ b/codes/common/memory.py
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-10 15:27:16
 @LastEditor: John
-LastEditTime: 2021-09-15 02:17:59
+LastEditTime: 2021-09-15 14:52:37
 @Discription: 
 @Environment: python 3.7.7
 '''
@@ -28,5 +28,9 @@ class ReplayBuffer:
         batch = random.sample(self.buffer, batch_size) # 随机采出小批量转移
         state, action, reward, next_state, done = zip(*batch) # 解压成状态,动作等
         return state, action, reward, next_state, done
-    
+
+    def __len__(self):
+        ''' 返回当前存储的量
+        '''
+        return len(self.buffer)
diff --git a/codes/common/model.py b/codes/common/model.py
index 257c33b..9800dbf 100644
--- a/codes/common/model.py
+++ b/codes/common/model.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 21:14:12
 LastEditor: John
-LastEditTime: 2021-05-04 02:45:27
+LastEditTime: 2021-09-15 13:21:03
 Discription: 
 Environment: 
 '''
@@ -17,8 +17,8 @@ from torch.distributions import Categorical
 class MLP(nn.Module):
     def __init__(self, input_dim,output_dim,hidden_dim=128):
         """ 初始化q网络,为全连接网络
-            input_dim: 输入的feature即环境的state数目
-            output_dim: 输出的action总个数
+            input_dim: 输入的特征数即环境的状态数
+            output_dim: 输出的动作维度
         """
         super(MLP, self).__init__()
         self.fc1 = nn.Linear(input_dim, hidden_dim) # 输入层
diff --git a/codes/common/plot.py b/codes/common/plot.py
index 5add0bf..df78d9e 100644
--- a/codes/common/plot.py
+++ b/codes/common/plot.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-10-07 20:57:11
 LastEditor: John
-LastEditTime: 2021-09-11 21:35:00
+LastEditTime: 2021-09-15 14:56:15
 Discription: 
 Environment: 
 '''
@@ -29,7 +29,7 @@ def plot_rewards_cn(rewards,ma_rewards,tag="train",env='CartPole-v0',algo = "DQN
     ''' 中文画图
     '''
     sns.set()
-    plt.title(u"{}环境下Q学习算法的学习曲线".format(env),fontproperties=chinese_font())
+    plt.title(u"{}环境下{}算法的学习曲线".format(env,algo),fontproperties=chinese_font())
     plt.xlabel(u'回合数',fontproperties=chinese_font())
     plt.plot(rewards)
     plt.plot(ma_rewards)