diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth
deleted file mode 100644
index a0b6ef9..0000000
Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/models/dqn_checkpoint.pth and /dev/null differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png
deleted file mode 100644
index a260f79..0000000
Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards_curve.png and /dev/null differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy
deleted file mode 100644
index 1e0ab6c..0000000
Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_ma_rewards.npy and /dev/null differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png
deleted file mode 100644
index 4c14b8d..0000000
Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards_curve.png and /dev/null differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth
new file mode 100644
index 0000000..7fcf736
Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy
similarity index 100%
rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_ma_rewards.npy
rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy
similarity index 100%
rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/eval_rewards.npy
rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy
diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png
new file mode 100644
index 0000000..bc60080
Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy
new file mode 100644
index 0000000..d81acd2
Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy
similarity index 55%
rename from codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy
rename to codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy
index 88c137f..900914d 100644
Binary files a/codes/DQN/outputs/CartPole-v0/20211111-165800/results/train_rewards.npy and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy differ
diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png
new file mode 100644
index 0000000..9df7664
Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png differ
diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py
index 871edf3..c7cd5da 100644
--- a/codes/DQN/task0.py
+++ b/codes/DQN/task0.py
@@ -25,6 +25,7 @@ class Config:
         self.env_name = 'CartPole-v0'  # 环境名称
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        self.seed = 10  # 随机种子,置0则不设置随机种子
         self.train_eps = 200  # 训练的回合数
         self.test_eps = 30  # 测试的回合数
         ################################################################################
@@ -41,7 +42,7 @@ class Config:
         self.hidden_dim = 256  # 网络隐藏层
         ################################################################################

-        ################################# 保存结果相关参数 ################################
+        ################################# 保存结果相关参数 ##############################
         self.result_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/results/'  # 保存结果的路径
         self.model_path = curr_path + "/outputs/" + self.env_name + \
@@ -50,17 +51,17 @@ class Config:
         ################################################################################


-def env_agent_config(cfg, seed=1):
+def env_agent_config(cfg):
     ''' 创建环境和智能体
     '''
     env = gym.make(cfg.env_name)  # 创建环境
     state_dim = env.observation_space.shape[0]  # 状态维度
     action_dim = env.action_space.n  # 动作维度
     agent = DQN(state_dim, action_dim, cfg)  # 创建智能体
-    if seed !=0: # 设置随机种子
-        torch.manual_seed(seed)
-        env.seed(seed)
-        np.random.seed(seed)
+    if cfg.seed !=0: # 设置随机种子
+        torch.manual_seed(cfg.seed)
+        env.seed(cfg.seed)
+        np.random.seed(cfg.seed)
     return env, agent


@@ -94,15 +95,17 @@ def train(cfg, env, agent):
         if (i_ep + 1) % 10 == 0:
             print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
     print('完成训练!')
+    env.close()
     return rewards, ma_rewards


 def test(cfg, env, agent):
     print('开始测试!')
     print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
+    ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
     cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
     cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
+    ################################################################################
     rewards = []  # 记录所有回合的奖励
     ma_rewards = []  # 记录所有回合的滑动平均奖励
     for i_ep in range(cfg.test_eps):
@@ -122,13 +125,14 @@ def test(cfg, env, agent):
             ma_rewards.append(ep_reward)
         print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
     print('完成测试!')
+    env.close()
     return rewards, ma_rewards


 if __name__ == "__main__":
     cfg = Config()
     # 训练
-    env, agent = env_agent_config(cfg, seed=1)
+    env, agent = env_agent_config(cfg)
     rewards, ma_rewards = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
     agent.save(path=cfg.model_path)  # 保存模型
@@ -136,7 +140,7 @@ if __name__ == "__main__":
                  path=cfg.result_path)  # 保存结果
     plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
     # 测试
-    env, agent = env_agent_config(cfg, seed=10)
+    env, agent = env_agent_config(cfg)
     agent.load(path=cfg.model_path)  # 导入模型
     rewards, ma_rewards = test(cfg, env, agent)
     save_results(rewards, ma_rewards, tag='test',
diff --git a/codes/DoubleDQN/agent.py b/codes/DoubleDQN/double_dqn.py
similarity index 100%
rename from codes/DoubleDQN/agent.py
rename to codes/DoubleDQN/double_dqn.py
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth
deleted file mode 100644
index fc1ca66..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/models/checkpoint.pth and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy
deleted file mode 100644
index b32c0a8..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_ma_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy
deleted file mode 100644
index 9ccf4e9..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png
deleted file mode 100644
index 3580ef9..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/test_rewards_curve.png and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy
deleted file mode 100644
index b0838ab..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_ma_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy
deleted file mode 100644
index 12e1347..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png
deleted file mode 100644
index d612a6a..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20211221-185355/results/train_rewards_curve.png and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth
new file mode 100644
index 0000000..2ec6bfd
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/models/checkpoint.pth differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy
new file mode 100644
index 0000000..81e0bba
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_ma_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy
new file mode 100644
index 0000000..e7b6307
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png
new file mode 100644
index 0000000..4fbd77c
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/test_rewards_curve.png differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy
new file mode 100644
index 0000000..a73bbde
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_ma_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy
new file mode 100644
index 0000000..3e707c5
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png
new file mode 100644
index 0000000..cb9dbeb
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20211229-145006/results/train_rewards_curve.png differ
diff --git a/codes/DoubleDQN/task0.py b/codes/DoubleDQN/task0.py
index 945753a..7657a88 100644
--- a/codes/DoubleDQN/task0.py
+++ b/codes/DoubleDQN/task0.py
@@ -5,7 +5,7 @@
 Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-11-07 18:10:37
 LastEditor: JiangJi
-LastEditTime: 2021-11-19 18:34:05
+LastEditTime: 2021-12-29 15:02:30
 Discription: 
 '''
@@ -20,20 +20,22 @@
 import datetime
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards
-from DoubleDQN.agent import DoubleDQN
-from DoubleDQN.train import train,test
+from DoubleDQN.double_dqn import DoubleDQN

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # 获取当前时间
-algo_name = 'DoubleDQN'  # 算法名称
-env_name = 'CartPole-v0'  # 环境名称
-class DoubleDQNConfig:
+
+class Config:
     def __init__(self):
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'DoubleDQN'  # 算法名称
+        self.env_name = 'CartPole-v0'  # 环境名称
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
         self.train_eps = 200  # 训练的回合数
         self.test_eps = 30  # 测试的回合数
+        ################################################################################
+
+        ################################## 算法超参数 ###################################
         self.gamma = 0.95  # 强化学习中的折扣因子
         self.epsilon_start = 0.95  # e-greedy策略中初始epsilon
         self.epsilon_end = 0.01  # e-greedy策略中的终止epsilon
@@ -43,20 +45,16 @@ class DoubleDQNConfig:
         self.batch_size = 64  # mini-batch SGD中的批量大小
         self.target_update = 2  # 目标网络的更新频率
         self.hidden_dim = 256  # 网络隐藏层
-class PlotConfig:
-    ''' 绘图相关参数设置
-    '''
+        ################################################################################

-    def __init__(self) -> None:
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # 检测GPU
+        ################################# 保存结果相关参数 ##############################
         self.result_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/results/'  # 保存结果的路径
         self.model_path = curr_path + "/outputs/" + self.env_name + \
             '/' + curr_time + '/models/'  # 保存模型的路径
-        self.save = True # 是否保存图片
+        self.save = True  # 是否保存图片
+        ################################################################################
+

 def env_agent_config(cfg,seed=1):
     env = gym.make(cfg.env_name)
@@ -66,18 +64,81 @@ def env_agent_config(cfg,seed=1):
     agent = DoubleDQN(state_dim,action_dim,cfg)
     return env,agent

-cfg = DoubleDQNConfig()
-plot_cfg = PlotConfig()
-# 训练
-env,agent = env_agent_config(cfg,seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path)  # 创建保存结果和模型路径的文件夹
-agent.save(path=plot_cfg.model_path)  # 保存模型
-save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path)  # 保存结果
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
-# 测试
-env,agent = env_agent_config(cfg,seed=10)
-agent.load(path=plot_cfg.model_path)  # 导入模型
-rewards,ma_rewards = test(cfg,env,agent)
-save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # 保存结果
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+def train(cfg,env,agent):
+    print('开始训练!')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # 记录一回合内的奖励
+        state = env.reset()  # 重置环境,返回初始状态
+        while True:
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            ep_reward += reward
+            agent.memory.push(state, action, reward, next_state, done)
+            state = next_state
+            agent.update()
+            if done:
+                break
+        if i_ep % cfg.target_update == 0:
+            agent.target_net.load_state_dict(agent.policy_net.state_dict())
+        if (i_ep+1)%10 == 0:
+            print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward}')
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(
+                0.9*ma_rewards[-1]+0.1*ep_reward)
+        else:
+            ma_rewards.append(ep_reward)
+    print('完成训练!')
+    env.close()
+    return rewards,ma_rewards
+
+def test(cfg,env,agent):
+    print('开始测试!')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
+    cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
+    cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
+    ################################################################################
+    rewards = []  # 记录所有回合的奖励
+    ma_rewards = []  # 记录所有回合的滑动平均奖励
+
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()
+        ep_reward = 0
+        while True:
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            state = next_state
+            ep_reward += reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
+        else:
+            ma_rewards.append(ep_reward)
+        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
+    print('完成测试!')
+    env.close()
+    return rewards,ma_rewards
+
+if __name__ == "__main__":
+    cfg = Config()
+    # 训练
+    env, agent = env_agent_config(cfg)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # 创建保存结果和模型路径的文件夹
+    agent.save(path=cfg.model_path)  # 保存模型
+    save_results(rewards, ma_rewards, tag='train',
+                 path=cfg.result_path)  # 保存结果
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # 画出结果
+    # 测试
+    env, agent = env_agent_config(cfg)
+    agent.load(path=cfg.model_path)  # 导入模型
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test',
+                 path=cfg.result_path)  # 保存结果
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # 画出结果
diff --git a/codes/DoubleDQN/train.py b/codes/DoubleDQN/train.py
deleted file mode 100644
index ff0a786..0000000
--- a/codes/DoubleDQN/train.py
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: JiangJi
-Email: johnjim0816@gmail.com
-Date: 2021-11-07 18:10:37
-LastEditor: JiangJi
-LastEditTime: 2021-11-19 18:34:05
-Discription: 
-'''
-
-import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__))  # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path)  # 父路径
-sys.path.append(parent_path)  # 添加路径到系统路径
-
-def train(cfg,env,agent):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    rewards = []  # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.train_eps):
-        ep_reward = 0  # 记录一回合内的奖励
-        state = env.reset()  # 重置环境,返回初始状态
-        while True:
-            action = agent.choose_action(state)
-            next_state, reward, done, _ = env.step(action)
-            ep_reward += reward
-            agent.memory.push(state, action, reward, next_state, done)
-            state = next_state
-            agent.update()
-            if done:
-                break
-        if i_ep % cfg.target_update == 0:
-            agent.target_net.load_state_dict(agent.policy_net.state_dict())
-        if (i_ep+1)%10 == 0:
-            print(f'回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward}')
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(
-                0.9*ma_rewards[-1]+0.1*ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-    print('完成训练!')
-    return rewards,ma_rewards
-
-def test(cfg,env,agent):
-    print('开始测试!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0
-    cfg.epsilon_start = 0.0  # e-greedy策略中初始epsilon
-    cfg.epsilon_end = 0.0  # e-greedy策略中的终止epsilon
-    rewards = []  # 记录所有回合的奖励
-    ma_rewards = []  # 记录所有回合的滑动平均奖励
-    for i_ep in range(cfg.test_eps):
-        state = env.reset()
-        ep_reward = 0
-        while True:
-            action = agent.choose_action(state)
-            next_state, reward, done, _ = env.step(action)
-            state = next_state
-            ep_reward += reward
-            if done:
-                break
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
-        else:
-            ma_rewards.append(ep_reward)
-        print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}")
-    print('完成测试!')
-    return rewards,ma_rewards
-
diff --git a/codes/Logs.md b/codes/Logs.md
index 4efc3cd..7dc6497 100644
--- a/codes/Logs.md
+++ b/codes/Logs.md
@@ -1,5 +1,7 @@
 ## 记录笔者更新的日志
 
+**2021.12.28-1**:将```task.py```中的两个Config类合并为一个,并加以注释便于阅读,从DQN算法开始更新
+
 **2021.12.22-3**:将```agent.py```更改为对应的算法名称,便于区分如```dqn```与```dqn_cnn```的情况
 **2021.12.22-2**:简化了代码结构,将原来的```train.py```和```task.py```等合并到```task.py```中
 **2021.12.22-1**:简化了代码结构,将原来的```model.py```和```memory.py```等合并到```agent.py```中,```plot.py```的内容合并到```common.utils.py```中
\ No newline at end of file
diff --git a/codes/QLearning/train.py b/codes/QLearning/train.py
index 40a7746..2c4aa09 100644
--- a/codes/QLearning/train.py
+++ b/codes/QLearning/train.py
@@ -19,7 +19,6 @@ def train(cfg,env,agent):
             ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
         else:
             ma_rewards.append(ep_reward)
-        if ()
         print("回合数:{}/{},奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
     print('完成训练!')
     return rewards,ma_rewards
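For reference, the `ma_rewards` update that recurs in every train/test loop above is an exponential moving average of episode rewards (each new reward is folded in with weight 0.1: `0.9*ma_rewards[-1] + 0.1*ep_reward`). A minimal standalone sketch of that smoothing is below; the function name `smooth_rewards` and the `alpha` parameter are illustrative only and are not part of the repository.

```python
def smooth_rewards(rewards, alpha=0.1):
    """Exponentially weighted moving average, matching the ma_rewards update in the scripts."""
    ma_rewards = []
    for r in rewards:
        if ma_rewards:
            # new average = (1 - alpha) * previous average + alpha * current reward
            ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * r)
        else:
            ma_rewards.append(r)  # first episode seeds the average
    return ma_rewards

if __name__ == "__main__":
    print(smooth_rewards([10, 20, 30]))  # [10, 11.0, 12.9]
```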