diff --git a/codes/A2C/task0.py b/codes/A2C/task0.py
index 98e8021..e29266b 100644
--- a/codes/A2C/task0.py
+++ b/codes/A2C/task0.py
@@ -123,14 +123,15 @@ def train(cfg,envs):
         loss.backward()
         optimizer.step()
     print('Finish training!')
-    return test_rewards, test_ma_rewards
+    return {'rewards':test_rewards,'ma_rewards':test_ma_rewards}
 if __name__ == "__main__":
     cfg = get_args()
     envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)]
     envs = SubprocVecEnv(envs)
     # training
-    rewards,ma_rewards = train(cfg,envs)
+    res_dic = train(cfg,envs)
     make_dir(cfg.result_path,cfg.model_path)
     save_args(cfg)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) # 保存结果
-    plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # plot results
diff --git a/codes/DDPG/task0.py b/codes/DDPG/task0.py
index d280abc..20688d3 100644
--- a/codes/DDPG/task0.py
+++ b/codes/DDPG/task0.py
@@ -5,7 +5,7 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2022-07-21 00:05:41
+LastEditTime: 2022-07-21 21:51:34
 @Discription:
 @Environment: python 3.7.7
 '''
@@ -86,7 +86,7 @@ def train(cfg, env, agent):
         else:
             ma_rewards.append(ep_reward)
     print('Finish training!')
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}

 def test(cfg, env, agent):
     print('Start testing')
@@ -111,21 +111,23 @@ def test(cfg, env, agent):
             ma_rewards.append(ep_reward)
         print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
     print('Finish testing!')
-    return rewards, ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}

 if __name__ == "__main__":
     cfg = get_args()
     # training
     env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)
     save_args(cfg)
     agent.save(path=cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
     # testing
     env,agent = env_agent_config(cfg,seed=10)
     agent.load(path=cfg.model_path)
-    rewards,ma_rewards = test(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")
+    res_dic = test(cfg,env,agent)
+    save_results(res_dic, tag='test',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")
diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py
index 02f9d83..04344aa 100644
--- a/codes/DQN/task0.py
+++ b/codes/DQN/task0.py
@@ -10,7 +10,7 @@
 import torch
 import datetime
 import numpy as np
 import argparse
-from common.utils import save_results_1, make_dir
+from common.utils import save_results, make_dir
 from common.utils import plot_rewards,save_args
 from dqn import DQN
@@ -95,8 +95,8 @@ def train(cfg, env, agent):


 def test(cfg, env, agent):
-    print('开始测试!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    print('Start testing!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
     ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ###############
     cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon
     cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon
@@ -123,7 +123,7 @@ def test(cfg, env, agent):
         else:
             ma_rewards.append(ep_reward)
         print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}')
-    print('完成测试!')
+    print('Finish testing')
     env.close()
     return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps}

@@ -133,16 +133,16 @@ if __name__ == "__main__":
     # 训练
     env, agent = env_agent_config(cfg)
     res_dic = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
-    save_args(cfg)
-    agent.save(path=cfg.model_path) # 保存模型
-    save_results_1(res_dic, tag='train',
-                   path=cfg.result_path) # 保存结果
-    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果
+    make_dir(cfg.result_path, cfg.model_path)
+    save_args(cfg) # save parameters
+    agent.save(path=cfg.model_path) # save model
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
     # 测试
     env, agent = env_agent_config(cfg)
     agent.load(path=cfg.model_path) # 导入模型
     res_dic = test(cfg, env, agent)
-    save_results_1(res_dic, tag='test',
+    save_results(res_dic, tag='test',
                    path=cfg.result_path) # 保存结果
     plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/models/checkpoint.pth
deleted file mode 100644
index a4901d2..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/models/checkpoint.pth and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/params.json b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/params.json
deleted file mode 100644
index dbdd76b..0000000
--- a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/params.json
+++ /dev/null
@@ -1 +0,0 @@
-{"algo_name": "DoubleDQN", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 500, "lr": 0.0001, "memory_capacity": 100000, "batch_size": 64, "target_update": 2, "hidden_dim": 256, "device": "cuda", "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-000842/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-000842/models/", "save_fig": true}
\ No newline at end of file
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_ma_rewards.npy
deleted file mode 100644
index f7d200d..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_ma_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards_curve.png
deleted file mode 100644
index 4efa0e1..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards_curve.png and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_ma_rewards.npy
deleted file mode 100644
index 2a2a816..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_ma_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards.npy
deleted file mode 100644
index 485de52..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards.npy and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards_curve.png
deleted file mode 100644
index 9d2ff8d..0000000
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/train_rewards_curve.png and /dev/null differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/models/checkpoint.pth b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/models/checkpoint.pth
new file mode 100644
index 0000000..2818144
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/models/checkpoint.pth differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/params.json b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/params.json
new file mode 100644
index 0000000..abc1877
--- /dev/null
+++ b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/params.json
@@ -0,0 +1,19 @@
+{
+    "algo_name": "DoubleDQN",
+    "env_name": "CartPole-v0",
+    "train_eps": 200,
+    "test_eps": 20,
+    "gamma": 0.99,
+    "epsilon_start": 0.95,
+    "epsilon_end": 0.01,
+    "epsilon_decay": 500,
+    "lr": 0.0001,
+    "memory_capacity": 100000,
+    "batch_size": 64,
+    "target_update": 2,
+    "hidden_dim": 256,
+    "device": "cuda",
+    "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/results/",
+    "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DoubleDQN/outputs/CartPole-v0/20220721-215416/models/",
+    "save_fig": true
+}
\ No newline at end of file
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_ma_rewards.npy
new file mode 100644
index 0000000..da15b7f
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_ma_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards.npy
similarity index 55%
rename from codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards.npy
rename to codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards.npy
index 7325942..ce7e7be 100644
Binary files a/codes/DoubleDQN/outputs/CartPole-v0/20220721-000842/results/test_rewards.npy and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards_curve.png
new file mode 100644
index 0000000..9123a84
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/test_rewards_curve.png differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_ma_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_ma_rewards.npy
new file mode 100644
index 0000000..b44206b
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_ma_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards.npy b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards.npy
new file mode 100644
index 0000000..d9b5730
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards.npy differ
diff --git a/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards_curve.png b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards_curve.png
new file mode 100644
index 0000000..d07d996
Binary files /dev/null and b/codes/DoubleDQN/outputs/CartPole-v0/20220721-215416/results/train_rewards_curve.png differ
diff --git a/codes/DoubleDQN/task0.py b/codes/DoubleDQN/task0.py
index 5bb4fe6..66dfcd9 100644
--- a/codes/DoubleDQN/task0.py
+++ b/codes/DoubleDQN/task0.py
@@ -5,7 +5,7 @@ Author: JiangJi
 Email: johnjim0816@gmail.com
 Date: 2021-11-07 18:10:37
 LastEditor: JiangJi
-LastEditTime: 2022-07-21 00:08:38
+LastEditTime: 2022-07-21 21:52:31
 Discription:
 '''
 import sys,os
@@ -86,7 +86,7 @@ def train(cfg,env,agent):
         else:
             ma_rewards.append(ep_reward)
     print('Finish training!')
-    return rewards,ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}

 def test(cfg,env,agent):
     print('Start testing')
@@ -115,22 +115,24 @@ def test(cfg,env,agent):
             ma_rewards.append(ep_reward)
         print(f"Epside:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
     print('Finish testing!')
-    return rewards,ma_rewards
+    return {'rewards':rewards,'ma_rewards':ma_rewards}

 if __name__ == "__main__":
     cfg = get_args()
     print(cfg.device)
     # training
     env,agent = env_agent_config(cfg,seed=1)
-    rewards, ma_rewards = train(cfg, env, agent)
+    res_dic = train(cfg, env, agent)
     make_dir(cfg.result_path, cfg.model_path)
     save_args(cfg)
     agent.save(path=cfg.model_path)
-    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")
+    save_results(res_dic, tag='train',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train")
     # testing
     env,agent = env_agent_config(cfg,seed=10)
     agent.load(path=cfg.model_path)
-    rewards,ma_rewards = test(cfg,env,agent)
-    save_results(rewards,ma_rewards,tag = 'test',path = cfg.result_path)
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")
+    res_dic = test(cfg,env,agent)
+    save_results(res_dic, tag='test',
+                 path=cfg.result_path)
+    plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="test")
diff --git a/codes/PolicyGradient/task0.py b/codes/PolicyGradient/task0.py
index c676fe3..b9e11a0 100644
--- a/codes/PolicyGradient/task0.py
+++ b/codes/PolicyGradient/task0.py
@@ -5,56 +5,47 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:21:53
 LastEditor: John
-LastEditTime: 2022-02-10 06:13:21
+LastEditTime: 2022-07-21 21:44:00
 Discription:
 Environment:
 '''
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
-parent_path = os.path.dirname(curr_path) # 父路径
-sys.path.append(parent_path) # 添加路径到系统路径
+import sys,os
+curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
+parent_path = os.path.dirname(curr_path) # parent path
+sys.path.append(parent_path) # add to system path

 import gym
 import torch
 import datetime
+import argparse
 from itertools import count
 from pg import PolicyGradient
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards

-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间

-class Config:
-    '''超参数
-    '''
-
-    def __init__(self):
-        ################################## 环境超参数 ###################################
-        self.algo_name = "PolicyGradient" # 算法名称
-        self.env_name = 'CartPole-v0' # 环境名称
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
-        self.seed = 10 # 随机种子,置0则不设置随机种子
-        self.train_eps = 300 # 训练的回合数
-        self.test_eps = 30 # 测试的回合数
-        ################################################################################
-
-        ################################## 算法超参数 ###################################
-        self.batch_size = 8 # mini-batch SGD中的批量大小
-        self.lr = 0.01 # 学习率
-        self.gamma = 0.99 # 强化学习中的折扣因子
-        self.hidden_dim = 36 # 网络隐藏层
-        ################################################################################
-
-        ################################# 保存结果相关参数 ################################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/' # 保存结果的路径
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/' # 保存模型的路径
-        self.save = True # 是否保存图片
-        ################################################################################
+def get_args():
+    """ Hyperparameters
+    """
+    curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time
+    parser = argparse.ArgumentParser(description="hyperparameters")
+    parser.add_argument('--algo_name',default='PolicyGradient',type=str,help="name of algorithm")
+    parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
+    parser.add_argument('--train_eps',default=300,type=int,help="episodes of training")
+    parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
+    parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
+    parser.add_argument('--batch_size',default=8,type=int)
+    parser.add_argument('--hidden_dim',default=36,type=int)
+    parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
+    parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/results/' )
+    parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
+            '/' + curr_time + '/models/' ) # path to save models
+    parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
+    args = parser.parse_args()
+    return args

 def env_agent_config(cfg,seed=1):
@@ -65,9 +56,9 @@ def env_agent_config(cfg,seed=1):
     return env,agent

 def train(cfg,env,agent):
-    print('开始训练!')
-    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
-    state_pool = [] # 存放每batch_size个episode的state序列
+    print('Start training!')
+    print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}')
+    state_pool = [] # temp states pool per several episodes
     action_pool = []
     reward_pool = []
     rewards = []
@@ -86,11 +77,11 @@ def train(cfg,env,agent):
             reward_pool.append(reward)
             state = next_state
             if done:
-                print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
+                print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}')
                 break
         if i_ep > 0 and i_ep % cfg.batch_size == 0:
             agent.update(reward_pool,state_pool,action_pool)
-            state_pool = [] # 每个episode的state
+            state_pool = []
             action_pool = []
             reward_pool = []
         rewards.append(ep_reward)
@@ -99,8 +90,8 @@
                 0.9*ma_rewards[-1]+0.1*ep_reward)
         else:
             ma_rewards.append(ep_reward)
-    print('完成训练!')
-    env.close()
+    print('Finish training!')
+    env.close() # close environment
     return rewards, ma_rewards


diff --git a/codes/common/utils.py b/codes/common/utils.py
index dbee61a..654b73c 100644
--- a/codes/common/utils.py
+++ b/codes/common/utils.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 16:02:24
 LastEditor: John
-LastEditTime: 2022-07-20 23:53:34
+LastEditTime: 2022-07-21 21:45:33
 Discription:
 Environment:
 '''
@@ -69,19 +69,19 @@ def plot_losses(losses, algo="DQN", save=True, path='./'):
     plt.savefig(path+"losses_curve")
     plt.show()

-def save_results_1(dic, tag='train', path='./results'):
+def save_results(dic, tag='train', path='./results'):
     ''' 保存奖励
     '''
     for key,value in dic.items():
         np.save(path+'{}_{}.npy'.format(tag,key),value)
     print('Results saved!')

-def save_results(rewards, ma_rewards, tag='train', path='./results'):
-    ''' 保存奖励
-    '''
-    np.save(path+'{}_rewards.npy'.format(tag), rewards)
-    np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
-    print('Result saved!')
+# def save_results(rewards, ma_rewards, tag='train', path='./results'):
+#     ''' 保存奖励
+#     '''
+#     np.save(path+'{}_rewards.npy'.format(tag), rewards)
+#     np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
+#     print('Result saved!')

 def make_dir(*paths):