diff --git a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_ma_rewards.npy b/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_ma_rewards.npy deleted file mode 100644 index 57f4174..0000000 Binary files a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards.npy b/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards.npy deleted file mode 100644 index bdb3fce..0000000 Binary files a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards.npy and /dev/null differ diff --git a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards_curve.png b/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards_curve.png deleted file mode 100644 index 5f1cf9a..0000000 Binary files a/codes/A2C/outputs/CartPole-v0/20210503-224814/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy deleted file mode 100644 index 6537afd..0000000 Binary files a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy b/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy deleted file mode 100644 index 56f779b..0000000 Binary files a/codes/A2C/outputs/CartPole-v0/20211221-165620/results/train_rewards.npy and /dev/null differ diff --git a/codes/A2C/outputs/CartPole-v0/20220713-221850/results/params.txt b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/params.txt new file mode 100644 index 0000000..2daca8c --- /dev/null +++ b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/params.txt @@ -0,0 +1,14 @@ +------------------ start ------------------ +algo_name : A2C +env_name : CartPole-v0 +n_envs : 8 +max_steps : 30000 +n_steps : 5 +gamma : 0.99 +lr : 0.001 +hidden_dim : 256 +result_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/results/ +model_path : c:\Users\24438\Desktop\rl-tutorials\codes\A2C/outputs/CartPole-v0/20220713-221850/models/ +save_fig : True +device : cuda +------------------- end ------------------- \ No newline at end of file diff --git a/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_ma_rewards.npy b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_ma_rewards.npy new file mode 100644 index 0000000..66091a2 Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_ma_rewards.npy differ diff --git a/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards.npy b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards.npy new file mode 100644 index 0000000..5e6ea3f Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards.npy differ diff --git a/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards_curve.png b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards_curve.png new file mode 100644 index 0000000..b8c0921 Binary files /dev/null and b/codes/A2C/outputs/CartPole-v0/20220713-221850/results/train_rewards_curve.png differ diff --git a/codes/A2C/task0.py b/codes/A2C/task0.py index 8e3cd0f..bfea4d7 100644 --- a/codes/A2C/task0.py +++ b/codes/A2C/task0.py @@ -1,45 +1,43 @@ -import sys -import os -curr_path = 
os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 +import sys,os +curr_path = os.path.dirname(os.path.abspath(__file__)) # current path +parent_path = os.path.dirname(curr_path) # parent path +sys.path.append(parent_path) # add to system path import gym import numpy as np import torch import torch.optim as optim import datetime +import argparse from common.multiprocessing_env import SubprocVecEnv from a2c import ActorCritic from common.utils import save_results, make_dir -from common.utils import plot_rewards +from common.utils import plot_rewards, save_args -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -algo_name = 'A2C' # 算法名称 -env_name = 'CartPole-v0' # 环境名称 -class A2CConfig: - def __init__(self) -> None: - self.algo_name = algo_name# 算法名称 - self.env_name = env_name # 环境名称 - self.n_envs = 8 # 异步的环境数目 - self.gamma = 0.99 # 强化学习中的折扣因子 - self.hidden_dim = 256 - self.lr = 1e-3 # learning rate - self.max_frames = 30000 - self.n_steps = 5 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -class PlotConfig: - def __init__(self) -> None: - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/results/' # 保存结果的路径 - self.model_path = curr_path+"/outputs/" + self.env_name + \ - '/'+curr_time+'/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - +def get_args(): + """ Hyperparameters + """ + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time + parser = argparse.ArgumentParser(description="hyperparameters") + parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm") + parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") + parser.add_argument('--n_envs',default=8,type=int,help="numbers of environments") + + parser.add_argument('--max_steps',default=20000,type=int,help="max steps of training") + parser.add_argument('--n_steps',default=5,type=int,help="number of steps per update") + parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor") + parser.add_argument('--lr',default=1e-3,type=float,help="learning rate") + parser.add_argument('--hidden_dim',default=256,type=int) + parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/results/' ) + parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/models/' ) # path to save models + parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") + args = parser.parse_args() + args.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # check GPU + return args def make_envs(env_name): def _thunk(): @@ -60,6 +58,7 @@ def test_env(env,model,vis=False): if vis: env.render() total_reward += reward return total_reward + def compute_returns(next_value, rewards, masks, gamma=0.99): R = next_value returns = [] @@ -70,19 +69,19 @@ def compute_returns(next_value, rewards, masks, gamma=0.99): def train(cfg,envs): - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo}, 设备:{cfg.device}') + print('Start training!') + print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}') env = gym.make(cfg.env_name) # a single env env.seed(10) n_states = 
envs.observation_space.shape[0] n_actions = envs.action_space.n model = ActorCritic(n_states, n_actions, cfg.hidden_dim).to(cfg.device) optimizer = optim.Adam(model.parameters()) - frame_idx = 0 + step_idx = 0 test_rewards = [] test_ma_rewards = [] state = envs.reset() - while frame_idx < cfg.max_frames: + while step_idx < cfg.max_steps: log_probs = [] values = [] rewards = [] @@ -101,16 +100,16 @@ def train(cfg,envs): rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(cfg.device)) masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(cfg.device)) state = next_state - frame_idx += 1 - if frame_idx % 100 == 0: + step_idx += 1 + if step_idx % 100 == 0: test_reward = np.mean([test_env(env,model) for _ in range(10)]) - print(f"frame_idx:{frame_idx}, test_reward:{test_reward}") + print(f"step_idx:{step_idx}, test_reward:{test_reward}") test_rewards.append(test_reward) if test_ma_rewards: test_ma_rewards.append(0.9*test_ma_rewards[-1]+0.1*test_reward) else: test_ma_rewards.append(test_reward) - # plot(frame_idx, test_rewards) + # plot(step_idx, test_rewards) next_state = torch.FloatTensor(next_state).to(cfg.device) _, next_value = model(next_state) returns = compute_returns(next_value, rewards, masks) @@ -124,15 +123,15 @@ def train(cfg,envs): optimizer.zero_grad() loss.backward() optimizer.step() - print('完成训练!') + print('Finish training!') return test_rewards, test_ma_rewards if __name__ == "__main__": - cfg = A2CConfig() - plot_cfg = PlotConfig() + cfg = get_args() envs = [make_envs(cfg.env_name) for i in range(cfg.n_envs)] envs = SubprocVecEnv(envs) - # 训练 + # training rewards,ma_rewards = train(cfg,envs) - make_dir(plot_cfg.result_path,plot_cfg.model_path) - save_results(rewards, ma_rewards, tag='train', path=plot_cfg.result_path) # 保存结果 - plot_rewards(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 + make_dir(cfg.result_path,cfg.model_path) + save_args(cfg) + save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) # 保存结果 + plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果 diff --git a/codes/DDPG/README.md b/codes/DDPG/README.md deleted file mode 100644 index bbcedcc..0000000 --- a/codes/DDPG/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# DDPG - -#TODO - -## 伪代码 - -![image-20210320151900695](assets/image-20210320151900695.png) \ No newline at end of file diff --git a/codes/DDPG/assets/image-20210320151900695.png b/codes/DDPG/assets/image-20210320151900695.png deleted file mode 100644 index fd41201..0000000 Binary files a/codes/DDPG/assets/image-20210320151900695.png and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt deleted file mode 100644 index 2051294..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/models/checkpoint.pt and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy deleted file mode 100644 index 936884c..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_ma_rewards.npy and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy deleted file mode 100644 index 4d497f4..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards.npy and 
/dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png deleted file mode 100644 index a442aac..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/eval_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy deleted file mode 100644 index ab923ee..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy deleted file mode 100644 index 0374e2e..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards.npy and /dev/null differ diff --git a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png b/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png deleted file mode 100644 index 06f3dc8..0000000 Binary files a/codes/DDPG/assets/outputs/Pendulum-v0/20210916-013138/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/models/checkpoint.pt b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/models/checkpoint.pt new file mode 100644 index 0000000..f245d72 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/models/checkpoint.pt differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/params.txt b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/params.txt new file mode 100644 index 0000000..95a5a55 --- /dev/null +++ b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/params.txt @@ -0,0 +1,18 @@ +------------------ start ------------------ +algo_name : DDPG +env_name : Pendulum-v1 +train_eps : 300 +test_eps : 20 +gamma : 0.99 +critic_lr : 0.001 +actor_lr : 0.0001 +memory_capacity : 8000 +batch_size : 128 +target_update : 2 +soft_tau : 0.01 +hidden_dim : 256 +result_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/results/ +model_path : c:\Users\24438\Desktop\rl-tutorials\codes\DDPG/outputs/Pendulum-v1/20220713-225402/models/ +save_fig : True +device : cuda +------------------- end ------------------- \ No newline at end of file diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_ma_rewards.npy new file mode 100644 index 0000000..5c72032 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards.npy b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards.npy new file mode 100644 index 0000000..3508874 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards_curve.png b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards_curve.png new file mode 100644 index 0000000..8d7fbd2 Binary files /dev/null and 
b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/test_rewards_curve.png differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_ma_rewards.npy b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_ma_rewards.npy new file mode 100644 index 0000000..c3dd9ad Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_ma_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards.npy b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards.npy new file mode 100644 index 0000000..48e4157 Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards.npy differ diff --git a/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards_curve.png b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards_curve.png new file mode 100644 index 0000000..ec6038f Binary files /dev/null and b/codes/DDPG/outputs/Pendulum-v1/20220713-225402/results/train_rewards_curve.png differ diff --git a/codes/DDPG/task0.py b/codes/DDPG/task0.py index f3d5bc2..861d7f3 100644 --- a/codes/DDPG/task0.py +++ b/codes/DDPG/task0.py @@ -5,59 +5,51 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-11 20:58:21 @LastEditor: John -LastEditTime: 2022-06-09 19:05:20 +LastEditTime: 2022-07-13 22:53:11 @Discription: @Environment: python 3.7.7 ''' import sys,os -os.environ['KMP_DUPLICATE_LIB_OK']='True' -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径sys.path +curr_path = os.path.dirname(os.path.abspath(__file__)) # current path +parent_path = os.path.dirname(curr_path) # parent path +sys.path.append(parent_path) # add to system path import datetime import gym import torch +import argparse from env import NormalizedActions,OUNoise from ddpg import DDPG from common.utils import save_results,make_dir -from common.utils import plot_rewards +from common.utils import plot_rewards,save_args -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -class Config: - '''超参数 - ''' - - def __init__(self): - ################################## 环境超参数 ################################### - self.algo_name = 'DDPG' # 算法名称 - self.env_name = 'Pendulum-v1' # 环境名称,gym新版本(约0.21.0之后)中Pendulum-v0改为Pendulum-v1 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十 - self.seed = 10 # 随机种子,置0则不设置随机种子 - self.train_eps = 300 # 训练的回合数 - self.test_eps = 20 # 测试的回合数 - ################################################################################ - - ################################## 算法超参数 ################################### - self.gamma = 0.99 # 折扣因子 - self.critic_lr = 1e-3 # 评论家网络的学习率 - self.actor_lr = 1e-4 # 演员网络的学习率 - self.memory_capacity = 8000 # 经验回放的容量 - self.batch_size = 128 # mini-batch SGD中的批量大小 - self.target_update = 2 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层维度 - self.soft_tau = 1e-2 # 软更新参数 - ################################################################################ - - ################################# 保存结果相关参数 ################################ - self.result_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/results/' # 保存结果的路径 - self.model_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - ################################################################################ +def get_args(): + """ 
Hyperparameters + """ + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time + parser = argparse.ArgumentParser(description="hyperparameters") + parser.add_argument('--algo_name',default='DDPG',type=str,help="name of algorithm") + parser.add_argument('--env_name',default='Pendulum-v1',type=str,help="name of environment") + parser.add_argument('--train_eps',default=300,type=int,help="episodes of training") + parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") + parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor") + parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic") + parser.add_argument('--actor_lr',default=1e-4,type=float,help="learning rate of actor") + parser.add_argument('--memory_capacity',default=8000,type=int,help="memory capacity") + parser.add_argument('--batch_size',default=128,type=int) + parser.add_argument('--target_update',default=2,type=int) + parser.add_argument('--soft_tau',default=1e-2,type=float) + parser.add_argument('--hidden_dim',default=256,type=int) + parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/results/' ) + parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/models/' ) # path to save models + parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") + args = parser.parse_args() + args.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # check GPU + return args def env_agent_config(cfg,seed=1): env = NormalizedActions(gym.make(cfg.env_name)) # 装饰action噪声 @@ -67,9 +59,9 @@ def env_agent_config(cfg,seed=1): agent = DDPG(n_states,n_actions,cfg) return env,agent def train(cfg, env, agent): - print('开始训练!') - print(f'环境:{cfg.env_name},算法:{cfg.algo_name},设备:{cfg.device}') - ou_noise = OUNoise(env.action_space) # 动作噪声 + print('Start training!') + print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}') + ou_noise = OUNoise(env.action_space) # noise of action rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.train_eps): @@ -88,18 +80,18 @@ def train(cfg, env, agent): agent.update() state = next_state if (i_ep+1)%10 == 0: - print('回合:{}/{},奖励:{:.2f}'.format(i_ep+1, cfg.train_eps, ep_reward)) + print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}') rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print('完成训练!') + print('Finish training!') return rewards, ma_rewards def test(cfg, env, agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + print('Start testing!') + print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}') rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 for i_ep in range(cfg.test_eps): @@ -113,25 +105,25 @@ def test(cfg, env, agent): next_state, reward, done, _ = env.step(action) ep_reward += reward state = next_state - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) rewards.append(ep_reward) if ma_rewards: ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) else: ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") - print('完成测试!') + print(f"Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}") + print('Finish testing!') return rewards, ma_rewards if __name__ == "__main__": - cfg = 
Config() - # 训练 + cfg = get_args() + # training env,agent = env_agent_config(cfg,seed=1) rewards, ma_rewards = train(cfg, env, agent) make_dir(cfg.result_path, cfg.model_path) + save_args(cfg) agent.save(path=cfg.model_path) save_results(rewards, ma_rewards, tag='train', path=cfg.result_path) plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果 - # 测试 + # testing env,agent = env_agent_config(cfg,seed=10) agent.load(path=cfg.model_path) rewards,ma_rewards = test(cfg,env,agent) diff --git a/codes/DQN/dqn.py b/codes/DQN/dqn.py index 8e74e37..0fa0d94 100644 --- a/codes/DQN/dqn.py +++ b/codes/DQN/dqn.py @@ -5,7 +5,7 @@ @Email: johnjim0816@gmail.com @Date: 2020-06-12 00:50:49 @LastEditor: John -LastEditTime: 2022-03-02 11:05:11 +LastEditTime: 2022-07-13 00:08:18 @Discription: @Environment: python 3.7.7 ''' @@ -20,7 +20,22 @@ import random import math import numpy as np - +class MLP(nn.Module): + def __init__(self, n_states,n_actions,hidden_dim=128): + """ 初始化q网络,为全连接网络 + n_states: 输入的特征数即环境的状态维度 + n_actions: 输出的动作维度 + """ + super(MLP, self).__init__() + self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 + self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 + self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 + + def forward(self, x): + # 各层对应的激活函数 + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return self.fc3(x) class ReplayBuffer: def __init__(self, capacity): @@ -47,7 +62,7 @@ class ReplayBuffer: return len(self.buffer) class DQN: - def __init__(self, n_actions,model,cfg): + def __init__(self, n_states,n_actions,cfg): self.n_actions = n_actions # 总的动作个数 self.device = cfg.device # 设备,cpu或gpu等 @@ -58,8 +73,8 @@ class DQN: (cfg.epsilon_start - cfg.epsilon_end) * \ math.exp(-1. * frame_idx / cfg.epsilon_decay) self.batch_size = cfg.batch_size - self.policy_net = model.to(self.device) - self.target_net = model.to(self.device) + self.policy_net = MLP(n_states,n_actions).to(self.device) + self.target_net = MLP(n_states,n_actions).to(self.device) for target_param, param in zip(self.target_net.parameters(),self.policy_net.parameters()): # 复制参数到目标网路targe_net target_param.data.copy_(param.data) self.optimizer = optim.Adam(self.policy_net.parameters(), lr=cfg.lr) # 优化器 diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth deleted file mode 100644 index 7fcf736..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy deleted file mode 100644 index 343fcc6..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy deleted file mode 100644 index 343fcc6..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png deleted file mode 100644 index bc60080..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/test_rewards_curve.png and /dev/null differ diff --git 
a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy deleted file mode 100644 index d81acd2..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy deleted file mode 100644 index 900914d..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png deleted file mode 100644 index 9df7664..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20211229-144313/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20220302-111332/models/dqn_checkpoint.pth deleted file mode 100644 index 6eb0130..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20220302-111332/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_ma_rewards.npy deleted file mode 100644 index d43b263..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards.npy deleted file mode 100644 index 303e570..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards_curve.png deleted file mode 100644 index 012be04..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_rewards_curve.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_steps.npy b/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_steps.npy deleted file mode 100644 index 3d25f8f..0000000 Binary files a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/train_steps.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v0/20220713-211653/models/dqn_checkpoint.pth new file mode 100644 index 0000000..237eedd Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20220713-211653/models/dqn_checkpoint.pth differ diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/results/params.txt b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/params.txt new file mode 100644 index 0000000..40eac02 --- /dev/null +++ b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/params.txt @@ -0,0 +1,19 @@ +------------------ start ------------------ +algo_name : DQN +env_name : CartPole-v0 +train_eps : 200 +test_eps : 20 +gamma : 0.95 +epsilon_start : 0.95 +epsilon_end : 0.01 +epsilon_decay : 500 +lr : 0.0001 +memory_capacity : 100000 +batch_size : 64 +target_update : 4 +hidden_dim : 256 +result_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/results/ 
+model_path : C:\Users\24438\Desktop\rl-tutorials\codes\DQN/outputs/CartPole-v0/20220713-211653/models/ +save_fig : True +device : cuda +------------------- end ------------------- \ No newline at end of file diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_ma_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_ma_rewards.npy rename to codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_ma_rewards.npy diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_rewards.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards.npy rename to codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_rewards.npy diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_rewards_curve.png similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_rewards_curve.png rename to codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_rewards_curve.png diff --git a/codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_steps.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_steps.npy similarity index 100% rename from codes/DQN/outputs/CartPole-v0/20220302-111332/results/test_steps.npy rename to codes/DQN/outputs/CartPole-v0/20220713-211653/results/test_steps.npy diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_ma_rewards.npy new file mode 100644 index 0000000..017fcb6 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_ma_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards.npy new file mode 100644 index 0000000..877f53a Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards.npy differ diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards_curve.png b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards_curve.png new file mode 100644 index 0000000..8b2aa59 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_rewards_curve.png differ diff --git a/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_steps.npy b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_steps.npy new file mode 100644 index 0000000..76fb505 Binary files /dev/null and b/codes/DQN/outputs/CartPole-v0/20220713-211653/results/train_steps.npy differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/models/dqn_checkpoint.pth b/codes/DQN/outputs/CartPole-v1/20220618-201318/models/dqn_checkpoint.pth deleted file mode 100644 index a7b6b70..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/models/dqn_checkpoint.pth and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_ma_rewards.npy b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_ma_rewards.npy deleted file mode 100644 index 4907824..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_ma_rewards.npy and /dev/null differ diff --git 
a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards.npy b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards.npy deleted file mode 100644 index 2a74d8c..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards_curve_cn.png deleted file mode 100644 index 31f434c..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/test_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_ma_rewards.npy b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_ma_rewards.npy deleted file mode 100644 index 4485f7a..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_ma_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards.npy b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards.npy deleted file mode 100644 index b79f659..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards.npy and /dev/null differ diff --git a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards_curve_cn.png b/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards_curve_cn.png deleted file mode 100644 index 9ab8a0b..0000000 Binary files a/codes/DQN/outputs/CartPole-v1/20220618-201318/results/train_rewards_curve_cn.png and /dev/null differ diff --git a/codes/DQN/task0.py b/codes/DQN/task0.py index 49a97a4..9ccf26f 100644 --- a/codes/DQN/task0.py +++ b/codes/DQN/task0.py @@ -1,5 +1,7 @@ +from lib2to3.pytree import type_repr import sys import os +from parso import parse import torch.nn as nn import torch.nn.functional as F curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 @@ -10,86 +12,58 @@ import gym import torch import datetime import numpy as np +import argparse from common.utils import save_results_1, make_dir -from common.utils import plot_rewards +from common.utils import plot_rewards,save_args from dqn import DQN -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 +def get_args(): + """ Hyperparameters + """ + curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # Obtain current time + parser = argparse.ArgumentParser(description="hyperparameters") + parser.add_argument('--algo_name',default='DQN',type=str,help="name of algorithm") + parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment") + parser.add_argument('--train_eps',default=200,type=int,help="episodes of training") + parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") + parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor") + parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") + parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") + parser.add_argument('--epsilon_decay',default=500,type=int,help="decay rate of epsilon") + parser.add_argument('--lr',default=0.0001,type=float,help="learning rate") + parser.add_argument('--memory_capacity',default=100000,type=int,help="memory capacity") + parser.add_argument('--batch_size',default=64,type=int) + parser.add_argument('--target_update',default=4,type=int) + parser.add_argument('--hidden_dim',default=256,type=int) 
+ parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/results/' ) + parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \ + '/' + curr_time + '/models/' ) # path to save models + parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") + args = parser.parse_args() + args.device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu") # check GPU + return args -class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): - """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态维度 - n_actions: 输出的动作维度 - """ - super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 - self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 - - def forward(self, x): - # 各层对应的激活函数 - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return self.fc3(x) - -class Config: - '''超参数 - ''' - - def __init__(self): - ############################### hyperparameters ################################ - self.algo_name = 'DQN' # algorithm name - self.env_name = 'CartPole-v0' # environment name - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # check GPU - self.seed = 10 # 随机种子,置0则不设置随机种子 - self.train_eps = 200 # 训练的回合数 - self.test_eps = 20 # 测试的回合数 - ################################################################################ - - ################################## 算法超参数 ################################### - self.gamma = 0.95 # 强化学习中的折扣因子 - self.epsilon_start = 0.90 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 - self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # 经验回放的容量 - self.batch_size = 64 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层 - ################################################################################ - - ################################# 保存结果相关参数 ################################ - self.result_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/results/' # 保存结果的路径 - self.model_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - ################################################################################ - - -def env_agent_config(cfg): +def env_agent_config(cfg,seed=1): ''' 创建环境和智能体 ''' env = gym.make(cfg.env_name) # 创建环境 n_states = env.observation_space.shape[0] # 状态维度 n_actions = env.action_space.n # 动作维度 print(f"n states: {n_states}, n actions: {n_actions}") - model = MLP(n_states,n_actions) - agent = DQN(n_actions, model, cfg) # 创建智能体 - if cfg.seed !=0: # 设置随机种子 - torch.manual_seed(cfg.seed) - env.seed(cfg.seed) - np.random.seed(cfg.seed) + agent = DQN(n_states,n_actions, cfg) # 创建智能体 + if seed !=0: # 设置随机种子 + torch.manual_seed(seed) + env.seed(seed) + np.random.seed(seed) return env, agent - def train(cfg, env, agent): - ''' 训练 + ''' Training ''' - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') + print('Start training!') + print(f'Env:{cfg.env_name}, Algorithm:{cfg.algo_name}, Device:{cfg.device}') rewards = [] # 记录所有回合的奖励 ma_rewards = [] # 记录所有回合的滑动平均奖励 steps = [] @@ -117,7 +91,7 @@ def train(cfg, env, agent): else: ma_rewards.append(ep_reward) if (i_ep + 1) % 1 == 0: - print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} 
Epislon:{agent.epsilon(agent.frame_idx):.3f}') + print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f} Epislon:{agent.epsilon(agent.frame_idx):.3f}') print('Finish training!') env.close() res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps} @@ -152,18 +126,19 @@ def test(cfg, env, agent): ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1) else: ma_rewards.append(ep_reward) - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}') + print(f'Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.2f}, Step:{ep_step:.2f}') print('完成测试!') env.close() return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps} if __name__ == "__main__": - cfg = Config() + cfg = get_args() # 训练 env, agent = env_agent_config(cfg) res_dic = train(cfg, env, agent) make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹 + save_args(cfg) agent.save(path=cfg.model_path) # 保存模型 save_results_1(res_dic, tag='train', path=cfg.result_path) # 保存结果 diff --git a/codes/DQN/task1.py b/codes/DQN/task1.py deleted file mode 100644 index 872f30d..0000000 --- a/codes/DQN/task1.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-12-22 11:14:17 -LastEditor: JiangJi -LastEditTime: 2022-06-18 20:12:20 -Discription: 使用 Nature DQN 训练 CartPole-v1 -''' -import sys -import os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime -import torch.nn as nn -import torch.nn.functional as F - -from common.utils import save_results, make_dir -from common.utils import plot_rewards, plot_rewards_cn -from dqn import DQN - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -algo_name = "DQN" # 算法名称 -env_name = 'CartPole-v1' # 环境名称 -class DQNConfig: - ''' 算法相关参数设置 - ''' - - def __init__(self): - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.train_eps = 300 # 训练的回合数 - self.test_eps = 20 # 测试的回合数 - # 超参数 - self.gamma = 0.99 # 强化学习中的折扣因子 - self.epsilon_start = 0.99 # e-greedy策略中初始epsilon - self.epsilon_end = 0.005 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 - self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # 经验回放的容量 - self.batch_size = 128 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 512 # 网络隐藏层 -class PlotConfig: - ''' 绘图相关参数设置 - ''' - - def __init__(self) -> None: - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPU - self.result_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/results/' # 保存结果的路径 - self.model_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - -class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=128): - """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态维度 - n_actions: 输出的动作维度 - """ - super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 - self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim, n_actions) # 输出层 - - def forward(self, x): - # 各层对应的激活函数 - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - return self.fc3(x) - -def env_agent_config(cfg, 
seed=1): - ''' 创建环境和智能体 - ''' - env = gym.make(cfg.env_name) # 创建环境 - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态维度 - n_actions = env.action_space.n # 动作维度 - model = MLP(n_states,n_actions) - agent = DQN(n_actions,model,cfg) # 创建智能体 - return env, agent - -def train(cfg, env, agent): - ''' 训练 - ''' - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - agent.memory.push(state, action, reward, next_state, done) # 保存transition - state = next_state # 更新下一个状态 - agent.update() # 更新智能体 - ep_reward += reward # 累加奖励 - if done: - break - if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - if (i_ep+1)%10 == 0: - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) - print('完成训练!') - return rewards, ma_rewards - -def test(cfg,env,agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 - cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon - cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.test_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - state = next_state # 更新下一个状态 - ep_reward += reward # 累加奖励 - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") - print('完成测试!') - return rewards,ma_rewards -if __name__ == "__main__": - cfg = DQNConfig() - plot_cfg = PlotConfig() - # 训练 - env, agent = env_agent_config(cfg, seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=plot_cfg.model_path) # 保存模型 - save_results(rewards, ma_rewards, tag='train', - path=plot_cfg.result_path) # 保存结果 - plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 - # 测试 - env, agent = env_agent_config(cfg, seed=10) - agent.load(path=plot_cfg.model_path) # 导入模型 - rewards, ma_rewards = test(cfg, env, agent) - save_results(rewards, ma_rewards, tag='test', - path=plot_cfg.result_path) # 保存结果 - plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task2.py b/codes/DQN/task2.py deleted file mode 100644 index 9e0f8c2..0000000 --- a/codes/DQN/task2.py +++ /dev/null @@ -1,150 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -''' -Author: JiangJi -Email: johnjim0816@gmail.com -Date: 2021-12-22 11:14:17 -LastEditor: JiangJi -LastEditTime: 2022-02-10 06:17:46 -Discription: 使用 DQN-cnn 训练 PongNoFrameskip-v4 -''' -import sys -import os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime -from common.utils import save_results, make_dir -from common.utils 
import plot_rewards, plot_rewards_cn -from common.atari_wrappers import make_atari, wrap_deepmind -from dqn import DQN - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 -algo_name = 'DQN-cnn' # 算法名称 -env_name = 'PongNoFrameskip-v4' # 环境名称 -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU -class DQNConfig: - ''' 算法相关参数设置 - ''' - - def __init__(self): - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = device # 检测GPU - self.train_eps = 500 # 训练的回合数 - self.test_eps = 30 # 测试的回合数 - # 超参数 - self.gamma = 0.95 # 强化学习中的折扣因子 - self.epsilon_start = 0.90 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500 # e-greedy策略中epsilon的衰减率 - self.lr = 0.0001 # 学习率 - self.memory_capacity = 100000 # 经验回放的容量 - self.batch_size = 64 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 256 # 网络隐藏层 -class PlotConfig: - ''' 绘图相关参数设置 - ''' - - def __init__(self) -> None: - self.algo_name = algo_name # 算法名称 - self.env_name = env_name # 环境名称 - self.device = device # 检测GPU - self.result_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/results/' # 保存结果的路径 - self.model_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - - -def env_agent_config(cfg, seed=1): - ''' 创建环境和智能体 - ''' - env = make_atari(cfg.env_name) # 创建环境 - # env = wrap_deepmind(env) - # env = wrap_pytorch(env) - env.seed(seed) # 设置随机种子 - n_states = env.observation_space.shape[0] # 状态维度 - n_actions = env.action_space.n # 动作维度 - agent = DQN(n_states, n_actions, cfg) # 创建智能体 - return env, agent - -def train(cfg, env, agent): - ''' 训练 - ''' - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - agent.memory.push(state, action, reward, next_state, done) # 保存transition - state = next_state # 更新下一个状态 - agent.update() # 更新智能体 - ep_reward += reward # 累加奖励 - if done: - break - if (i_ep+1) % cfg.target_update == 0: # 智能体目标网络更新 - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9*ma_rewards[-1]+0.1*ep_reward) - else: - ma_rewards.append(ep_reward) - if (i_ep+1)%10 == 0: - print('回合:{}/{}, 奖励:{}'.format(i_ep+1, cfg.train_eps, ep_reward)) - print('完成训练!') - return rewards, ma_rewards - -def test(cfg,env,agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - # 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 - cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon - cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - for i_ep in range(cfg.test_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - while True: - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - state = next_state # 更新下一个状态 - ep_reward += reward # 累加奖励 - if done: - break - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1) - else: - ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") - print('完成测试!') - return rewards,ma_rewards -if 
__name__ == "__main__": - cfg = DQNConfig() - plot_cfg = PlotConfig() - # 训练 - env, agent = env_agent_config(cfg, seed=1) - rewards, ma_rewards = train(cfg, env, agent) - make_dir(plot_cfg.result_path, plot_cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=plot_cfg.model_path) # 保存模型 - save_results(rewards, ma_rewards, tag='train', - path=plot_cfg.result_path) # 保存结果 - plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="train") # 画出结果 - # 测试 - env, agent = env_agent_config(cfg, seed=10) - agent.load(path=plot_cfg.model_path) # 导入模型 - rewards, ma_rewards = test(cfg, env, agent) - save_results(rewards, ma_rewards, tag='test', - path=plot_cfg.result_path) # 保存结果 - plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task4.py b/codes/DQN/task4.py deleted file mode 100644 index 436b36b..0000000 --- a/codes/DQN/task4.py +++ /dev/null @@ -1,180 +0,0 @@ -import sys -import os -import torch.nn as nn -import torch.nn.functional as F -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 -sys.path.append(parent_path) # 添加路径到系统路径 - -import gym -import torch -import datetime -import numpy as np -from common.utils import save_results_1, make_dir -from common.utils import plot_rewards -from dqn_1 import DQN - -curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间 - -class MLP(nn.Module): - def __init__(self, n_states,n_actions,hidden_dim=256): - """ 初始化q网络,为全连接网络 - n_states: 输入的特征数即环境的状态维度 - n_actions: 输出的动作维度 - """ - super(MLP, self).__init__() - self.fc1 = nn.Linear(n_states, hidden_dim) # 输入层 - self.fc2 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc3 = nn.Linear(hidden_dim,hidden_dim) # 隐藏层 - self.fc4 = nn.Linear(hidden_dim, n_actions) # 输出层 - - def forward(self, x): - # 各层对应的激活函数 - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = F.relu(self.fc3(x)) - return self.fc4(x) - -class Config: - '''超参数 - ''' - - def __init__(self): - ################################## 环境超参数 ################################### - self.algo_name = 'DQN' # 算法名称 - # self.env_name = 'Breakout-ram-v0' # 环境名称 - self.env_name = 'ALE/Pong-ram-v5' - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十 - self.seed = 10 # 随机种子,置0则不设置随机种子 - self.train_eps = 5 # 训练的回合数 - self.test_eps = 30 # 测试的回合数 - ################################################################################ - - ################################## 算法超参数 ################################### - self.gamma = 0.99 # 强化学习中的折扣因子 - self.epsilon_start = 0.95 # e-greedy策略中初始epsilon - self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon - self.epsilon_decay = 500000 # e-greedy策略中epsilon的衰减率 - self.lr = 0.00025 # 学习率 - self.memory_capacity = int(5e4) # 经验回放的容量 - self.batch_size = 32 # mini-batch SGD中的批量大小 - self.target_update = 4 # 目标网络的更新频率 - self.hidden_dim = 512 # 网络隐藏层 - ################################################################################ - - ################################# 保存结果相关参数 ################################ - self.result_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/results/' # 保存结果的路径 - self.model_path = curr_path + "/outputs/" + self.env_name + \ - '/' + curr_time + '/models/' # 保存模型的路径 - self.save = True # 是否保存图片 - ################################################################################ - - -def env_agent_config(cfg): - ''' 创建环境和智能体 - ''' - env = gym.make(cfg.env_name) # 创建环境 - n_states = env.observation_space.shape[0] # 状态维度 - n_actions = 
env.action_space.n # 动作维度 - print(f"n states: {n_states}, n actions: {n_actions}") - model = MLP(n_states,n_actions) - agent = DQN(n_states, n_actions, model, cfg) # 创建智能体 - if cfg.seed !=0: # 设置随机种子 - torch.manual_seed(cfg.seed) - env.seed(cfg.seed) - np.random.seed(cfg.seed) - return env, agent - - -def train(cfg, env, agent): - ''' 训练 - ''' - print('开始训练!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - steps = [] - for i_ep in range(cfg.train_eps): - ep_reward = 0 # 记录一回合内的奖励 - state = env.reset() # 重置环境,返回初始状态 - ep_step = 0 - while True: - ep_step+=1 - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - agent.memory.push(state, action, reward, - next_state, done) # 保存transition - state = next_state # 更新下一个状态 - agent.update() # 更新智能体 - ep_reward += reward # 累加奖励 - if done: - break - if (i_ep + 1) % cfg.target_update == 0: # 智能体目标网络更新 - agent.target_net.load_state_dict(agent.policy_net.state_dict()) - steps.append(ep_step) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward) - else: - ma_rewards.append(ep_reward) - if (i_ep + 1) % 1 == 0: - print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epislon:{agent.epsilon(agent.frame_idx):.3f}') - print('完成训练!') - env.close() - res_dic = {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps} - return res_dic - - -def test(cfg, env, agent): - print('开始测试!') - print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}') - ############# 由于测试不需要使用epsilon-greedy策略,所以相应的值设置为0 ############### - cfg.epsilon_start = 0.0 # e-greedy策略中初始epsilon - cfg.epsilon_end = 0.0 # e-greedy策略中的终止epsilon - ################################################################################ - rewards = [] # 记录所有回合的奖励 - ma_rewards = [] # 记录所有回合的滑动平均奖励 - steps = [] - for i_ep in range(cfg.test_eps): - ep_reward = 0 # 记录一回合内的奖励 - ep_step = 0 - state = env.reset() # 重置环境,返回初始状态 - while True: - ep_step+=1 - action = agent.choose_action(state) # 选择动作 - next_state, reward, done, _ = env.step(action) # 更新环境,返回transition - state = next_state # 更新下一个状态 - ep_reward += reward # 累加奖励 - if done: - break - steps.append(ep_step) - rewards.append(ep_reward) - if ma_rewards: - ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1) - else: - ma_rewards.append(ep_reward) - print(f"回合:{i_ep+1}/{cfg.test_eps},奖励:{ep_reward:.1f}") - print('完成测试!') - env.close() - return {'rewards':rewards,'ma_rewards':ma_rewards,'steps':steps} - - -if __name__ == "__main__": - cfg = Config() - # 训练 - env, agent = env_agent_config(cfg) - res_dic = train(cfg, env, agent) - make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹 - agent.save(path=cfg.model_path) # 保存模型 - save_results_1(res_dic, tag='train', - path=cfg.result_path) # 保存结果 - plot_rewards(res_dic['rewards'], res_dic['ma_rewards'], cfg, tag="train") # 画出结果 - # 测试 - env, agent = env_agent_config(cfg) - agent.load(path=cfg.model_path) # 导入模型 - res_dic = test(cfg, env, agent) - save_results_1(res_dic, tag='test', - path=cfg.result_path) # 保存结果 - plot_rewards(res_dic['rewards'], res_dic['ma_rewards'],cfg, tag="test") # 画出结果 diff --git a/codes/DQN/task5.py b/codes/DQN/task5.py deleted file mode 100644 index 519a8f6..0000000 --- a/codes/DQN/task5.py +++ /dev/null @@ -1,149 +0,0 @@ -import sys -import os -curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径 -parent_path = os.path.dirname(curr_path) # 父路径 
-sys.path.append(parent_path)  # add parent path to the system path
-
-import gym
-import torch
-import datetime
-import numpy as np
-from common.utils import save_results, make_dir
-from common.utils import plot_rewards
-from dqn import DQN
-
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # get current time
-
-
-class Config:
-    '''Hyperparameters
-    '''
-
-    def __init__(self):
-        ################################## environment hyperparameters ###################################
-        self.algo_name = 'DQN'  # algorithm name
-        self.env_name = 'SpaceInvaders-ram-v0'  # environment name
-        self.device = torch.device(
-            "cuda" if torch.cuda.is_available() else "cpu")  # check GPU
-        self.seed = 10  # random seed; set to 0 to disable seeding
-        self.train_eps = 200  # number of training episodes
-        self.test_eps = 30  # number of testing episodes
-        ################################################################################
-
-        ################################## algorithm hyperparameters ###################################
-        self.gamma = 0.99  # discount factor
-        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy
-        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy
-        self.epsilon_decay = 20000  # decay rate of epsilon in the e-greedy policy
-        self.lr = 2e-4  # learning rate
-        self.memory_capacity = int(1e5)  # capacity of the replay buffer
-        self.batch_size = 32  # batch size for mini-batch SGD
-        self.target_update = 4  # update frequency of the target network
-        self.hidden_dim = 512  # hidden layer dimension
-        ################################################################################
-
-        ################################# parameters for saving results ################################
-        self.result_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/results/'  # path to save results
-        self.model_path = curr_path + "/outputs/" + self.env_name + \
-            '/' + curr_time + '/models/'  # path to save models
-        self.save = True  # whether to save figures
-        ################################################################################
-
-
-def env_agent_config(cfg):
-    ''' Create the environment and the agent
-    '''
-    env = gym.make(cfg.env_name)  # create the environment
-    n_states = env.observation_space.shape[0]  # state space dimension
-    n_actions = env.action_space.n  # action space dimension
-    print(f"n states: {n_states}, n actions: {n_actions}")
-    agent = DQN(n_states, n_actions, cfg)  # create the agent
-    if cfg.seed != 0:  # set random seed
-        torch.manual_seed(cfg.seed)
-        env.seed(cfg.seed)
-        np.random.seed(cfg.seed)
-    return env, agent
-
-
-def train(cfg, env, agent):
-    ''' Training
-    '''
-    print('Start training!')
-    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
-    rewards = []  # record rewards for all episodes
-    ma_rewards = []  # record moving-average rewards for all episodes
-    for i_ep in range(cfg.train_eps):
-        ep_reward = 0  # reward accumulated within one episode
-        state = env.reset()  # reset the environment and return the initial state
-        while True:
-            action = agent.choose_action(state)  # choose an action
-            next_state, reward, done, _ = env.step(action)  # step the environment and return the transition
-            agent.memory.push(state, action, reward,
-                              next_state, done)  # store the transition
-            state = next_state  # move to the next state
-            agent.update()  # update the agent
-            ep_reward += reward  # accumulate reward
-            if done:
-                break
-        if (i_ep + 1) % cfg.target_update == 0:  # update the target network
-            agent.target_net.load_state_dict(agent.policy_net.state_dict())
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(0.9 * ma_rewards[-1] + 0.1 * ep_reward)
-        else:
-            ma_rewards.append(ep_reward)
-        if (i_ep + 1) % 1 == 0:
-            print(f'Episode:{i_ep+1}/{cfg.train_eps}, Reward:{ep_reward:.2f}, Epsilon:{agent.epsilon(agent.frame_idx):.3f}')
-    print('Finish training!')
-    env.close()
-    return rewards, ma_rewards
-
-
-def test(cfg, env, agent):
-    print('Start testing!')
-    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
-    ############# epsilon-greedy is not needed during testing, so set both values to 0 ###############
-    cfg.epsilon_start = 0.0  # initial epsilon of the e-greedy policy
-    cfg.epsilon_end = 0.0  # final epsilon of the e-greedy policy
-    ################################################################################
-    rewards = []  # record rewards for all episodes
-    ma_rewards = []  # record moving-average rewards for all episodes
-    for i_ep in range(cfg.test_eps):
-        ep_reward = 0  # reward accumulated within one episode
-        state = env.reset()  # reset the environment and return the initial state
-        while True:
-            action = agent.choose_action(state)  # choose an action
-            next_state, reward, done, _ = env.step(action)  # step the environment and return the transition
-            state = next_state  # move to the next state
-            ep_reward += reward  # accumulate reward
-            if done:
-                break
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(ma_rewards[-1] * 0.9 + ep_reward * 0.1)
-        else:
-            ma_rewards.append(ep_reward)
-        print(f"Episode:{i_ep+1}/{cfg.test_eps}, Reward:{ep_reward:.1f}")
-    print('Finish testing!')
-    env.close()
-    return rewards, ma_rewards
-
-
-if __name__ == "__main__":
-    cfg = Config()
-    # training
-    env, agent = env_agent_config(cfg)
-    rewards, ma_rewards = train(cfg, env, agent)
-    make_dir(cfg.result_path, cfg.model_path)  # create folders for saving results and models
-    agent.save(path=cfg.model_path)  # save the model
-    save_results(rewards, ma_rewards, tag='train',
-                 path=cfg.result_path)  # save results
-    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot results
-    # testing
-    env, agent = env_agent_config(cfg)
-    agent.load(path=cfg.model_path)  # load the model
-    rewards, ma_rewards = test(cfg, env, agent)
-    save_results(rewards, ma_rewards, tag='test',
-                 path=cfg.result_path)  # save results
-    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot results
diff --git a/codes/QLearning/task0.py b/codes/QLearning/task0.py
index 607cefa..98e620e 100644
--- a/codes/QLearning/task0.py
+++ b/codes/QLearning/task0.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2022-02-10 00:54:02
+LastEditTime: 2022-06-21 19:36:05
 Discription:
 Environment:
 '''
@@ -84,8 +84,6 @@ def train(cfg,env,agent):
 def test(cfg,env,agent):
     print('Start testing!')
     print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
-    for item in agent.Q_table.items():
-        print(item)
     rewards = []  # record rewards for all episodes
     ma_rewards = []  # moving-average rewards
     for i_ep in range(cfg.test_eps):
diff --git a/codes/common/utils.py b/codes/common/utils.py
index 612163f..b47ef72 100644
--- a/codes/common/utils.py
+++ b/codes/common/utils.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2021-03-12 16:02:24
 LastEditor: John
-LastEditTime: 2022-02-28 11:50:11
+LastEditTime: 2022-07-13 22:15:46
 Discription:
 Environment:
 '''
@@ -27,33 +27,33 @@ def chinese_font():
         font = None
     return font
 
-def plot_rewards_cn(rewards, ma_rewards, plot_cfg, tag='train'):
+def plot_rewards_cn(rewards, ma_rewards, cfg, tag='train'):
    ''' Plot the learning curve with Chinese labels
    '''
    sns.set()
    plt.figure()
-    plt.title(u"{}环境下{}算法的学习曲线".format(plot_cfg.env_name,
-              plot_cfg.algo_name), fontproperties=chinese_font())
+    plt.title(u"{}环境下{}算法的学习曲线".format(cfg.env_name,
+              cfg.algo_name), fontproperties=chinese_font())
    plt.xlabel(u'回合数', fontproperties=chinese_font())
    plt.plot(rewards)
    plt.plot(ma_rewards)
    plt.legend((u'奖励', u'滑动平均奖励',), loc="best", prop=chinese_font())
-    if plot_cfg.save:
-        plt.savefig(plot_cfg.result_path+f"{tag}_rewards_curve_cn")
+    if cfg.save:
+        plt.savefig(cfg.result_path+f"{tag}_rewards_curve_cn")
    # plt.show()
 
-def plot_rewards(rewards, ma_rewards, plot_cfg, tag='train'):
+def plot_rewards(rewards, ma_rewards, cfg, tag='train'):
    sns.set()
    plt.figure()  # create a figure instance so several curves can be drawn together
    plt.title("learning curve on {} of {} for {}".format(
-        plot_cfg.device, plot_cfg.algo_name, plot_cfg.env_name))
+        cfg.device, cfg.algo_name, cfg.env_name))
    plt.xlabel('episodes')
    plt.plot(rewards, label='rewards')
    plt.plot(ma_rewards, label='ma rewards')
    plt.legend()
-    if plot_cfg.save:
-        plt.savefig(plot_cfg.result_path+"{}_rewards_curve".format(tag))
+    if cfg.save_fig:
+        plt.savefig(cfg.result_path+"{}_rewards_curve".format(tag))
    plt.show()
@@ -80,7 +80,7 @@ def save_results(rewards, ma_rewards, tag='train', path='./results'):
     '''
     np.save(path+'{}_rewards.npy'.format(tag), rewards)
     np.save(path+'{}_ma_rewards.npy'.format(tag), ma_rewards)
-    print('结果保存完毕!')
+    print('Result saved!')
 
 
 def make_dir(*paths):
@@ -98,3 +98,14 @@ def del_empty_dir(*paths):
         for dir in dirs:
             if not os.listdir(os.path.join(path, dir)):
                 os.removedirs(os.path.join(path, dir))
+
+def save_args(args):
+    # save parameters
+    argsDict = args.__dict__
+    with open(args.result_path+'params.txt', 'w') as f:
+        f.writelines('------------------ start ------------------' + '\n')
+        for eachArg, value in argsDict.items():
+            f.writelines(eachArg + ' : ' + str(value) + '\n')
+        f.writelines('------------------- end -------------------')
+    print("Parameters saved!")
+    
\ No newline at end of file
diff --git a/notebooks/QLearning.ipynb b/notebooks/QLearning.ipynb
new file mode 100644
index 0000000..a610084
--- /dev/null
+++ b/notebooks/QLearning.ipynb
@@ -0,0 +1,19 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
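
Note on the reward bookkeeping used throughout this change: every train/test loop above records, alongside the raw episode rewards, an exponential moving average computed as 0.9 * previous + 0.1 * newest, and plot_rewards draws that curve as "ma rewards". The snippet below is a minimal standalone sketch of that smoothing rule only; the helper name smooth_rewards is illustrative and does not exist in the repo.

from typing import List

def smooth_rewards(rewards: List[float], alpha: float = 0.1) -> List[float]:
    """Return the moving-average curve for a list of episode rewards."""
    ma_rewards: List[float] = []
    for ep_reward in rewards:
        if ma_rewards:
            # same update as in the loops above: 0.9 * previous + 0.1 * newest (alpha = 0.1)
            ma_rewards.append((1 - alpha) * ma_rewards[-1] + alpha * ep_reward)
        else:
            # the first episode seeds the moving average
            ma_rewards.append(ep_reward)
    return ma_rewards

if __name__ == "__main__":
    print(smooth_rewards([10.0, 20.0, 30.0]))  # approximately [10.0, 11.0, 12.9]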
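
The new save_args helper added to common/utils.py simply dumps every attribute of an argparse namespace into params.txt under args.result_path, one "name : value" line between a start and an end marker. A hedged usage sketch follows; make_dir and save_args are the helpers from common.utils shown above, the script is assumed to run from inside the codes directory, and the namespace fields and output folder are made up for illustration.

import argparse
from common.utils import make_dir, save_args  # helpers from this repo's codes/common/utils.py

# Illustrative namespace; the real task scripts build it with argparse.
args = argparse.Namespace(algo_name='DQN', env_name='CartPole-v0',
                          result_path='./outputs/CartPole-v0/demo/results/')
make_dir(args.result_path)  # create the results folder before writing into it
save_args(args)             # writes result_path/params.txt with one line per argument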