diff --git a/codes/ddpg/.vscode/settings.json b/codes/ddpg/.vscode/settings.json
new file mode 100644
index 0000000..be0f1ab
--- /dev/null
+++ b/codes/ddpg/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
+}
\ No newline at end of file
diff --git a/codes/ddpg/README.md b/codes/ddpg/README.md
new file mode 100644
index 0000000..5a06c53
--- /dev/null
+++ b/codes/ddpg/README.md
@@ -0,0 +1,26 @@
+
+python 3.7.9
+
+pytorch 1.6.0
+
+tensorboard 2.3.0
+
+torchvision 0.7.0
+
+Train:
+
+```bash
+python main.py
+```
+
+Evaluate:
+
+```bash
+python main.py --train 0
+```
+
+Launch TensorBoard:
+
+```bash
+tensorboard --logdir logs
+```
\ No newline at end of file
diff --git a/codes/ddpg/ddpg.py b/codes/ddpg/agent.py
similarity index 100%
rename from codes/ddpg/ddpg.py
rename to codes/ddpg/agent.py
diff --git a/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3 b/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3
new file mode 100644
index 0000000..be5de57
Binary files /dev/null and b/codes/ddpg/logs/eval/20201015-192417/events.out.tfevents.1602761195.MacBook-Pro.local.156.3 differ
diff --git a/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5 b/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5
new file mode 100644
index 0000000..3fda6d2
Binary files /dev/null and b/codes/ddpg/logs/eval/20201015-192417/rewards_moving_average/events.out.tfevents.1602761195.MacBook-Pro.local.156.5 differ
diff --git a/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4 b/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4
new file mode 100644
index 0000000..485af31
Binary files /dev/null and b/codes/ddpg/logs/eval/20201015-192417/rewards_raw/events.out.tfevents.1602761195.MacBook-Pro.local.156.4 differ
diff --git a/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0 b/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0
new file mode 100644
index 0000000..5a3a1d1
Binary files /dev/null and b/codes/ddpg/logs/train/20201015-192417/events.out.tfevents.1602761057.MacBook-Pro.local.156.0 differ
diff --git a/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2 b/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2
new file mode 100644
index 0000000..70eb483
Binary files /dev/null and b/codes/ddpg/logs/train/20201015-192417/rewards_moving_average/events.out.tfevents.1602761057.MacBook-Pro.local.156.2 differ
diff --git a/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1 b/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1
new file mode 100644
index 0000000..b131138
Binary files /dev/null and b/codes/ddpg/logs/train/20201015-192417/rewards_raw/events.out.tfevents.1602761057.MacBook-Pro.local.156.1 differ
diff --git a/codes/ddpg/main.py b/codes/ddpg/main.py
index 5215ff5..2a2cc52 100644
--- a/codes/ddpg/main.py
+++ b/codes/ddpg/main.py
@@ -5,63 +5,76 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 20:58:21
 @LastEditor: John
-LastEditTime: 2020-09-02 01:24:50
+LastEditTime: 2020-10-15 21:23:39
 @Discription: 
 @Environment: python 3.7.7
 '''
+from token import NUMBER
+from typing import Sequence
 import torch
 import gym
-
-from ddpg import DDPG
+from agent import DDPG
 from env import NormalizedActions
 from noise import OUNoise
-from plot import plot
-
+import os
+import numpy as np
 import argparse
+from torch.utils.tensorboard import SummaryWriter
+import datetime
+
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
 
 def get_args():
     '''Hyperparameters can be tuned here once the model is set up
     '''
     parser = argparse.ArgumentParser()
-
-    parser.add_argument("--gamma", default=0.99, type=float) # discount factor gamma
-    parser.add_argument("--critic_lr", default=1e-3, type=float) # learning rate of the critic
+    parser.add_argument("--train", default=1, type=int)  # 1: train, 0: eval only
+    parser.add_argument("--gamma", default=0.99,
+                        type=float)  # discount factor gamma
+    parser.add_argument("--critic_lr", default=1e-3, type=float)  # learning rate of the critic
     parser.add_argument("--actor_lr", default=1e-4, type=float)
-
-    parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory")
-
-    parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling")
+    parser.add_argument("--memory_capacity", default=10000,
+                        type=int, help="capacity of Replay Memory")
+    parser.add_argument("--batch_size", default=128, type=int,
+                        help="batch size of memory sampling")
    parser.add_argument("--train_eps", default=200, type=int)
     parser.add_argument("--train_steps", default=200, type=int)
-    parser.add_argument("--eval_eps", default=200, type=int) # max number of eval episodes
-    parser.add_argument("--eval_steps", default=200, type=int) # max steps per eval episode
-    parser.add_argument("--target_update", default=4, type=int,help="how often (in episodes) to update the target net")
+    parser.add_argument("--eval_eps", default=200, type=int)  # max number of eval episodes
+    parser.add_argument("--eval_steps", default=200,
+                        type=int)  # max steps per eval episode
+    parser.add_argument("--target_update", default=4, type=int,
+                        help="how often (in episodes) to update the target net")
     config = parser.parse_args()
     return config
-def train():
-    cfg = get_args()
+
+def train(cfg):
+    print('Start to train!\n')
     env = NormalizedActions(gym.make("Pendulum-v0"))
-
+    # add action noise
     ou_noise = OUNoise(env.action_space)
-
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    agent=DDPG(n_states,n_actions,device="cpu", critic_lr=1e-3,
-                actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
-    rewards = []
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    agent = DDPG(n_states, n_actions, device="cpu", critic_lr=1e-3,
+                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
+    rewards = []
     moving_average_rewards = []
     ep_steps = []
-    for i_episode in range(1,cfg.train_eps+1):
-        state=env.reset()
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/train/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
+    for i_episode in range(1, cfg.train_eps+1):
+        state = env.reset()
         ou_noise.reset()
         ep_reward = 0
-        for i_step in range(1,cfg.train_steps+1):
-            action = agent.select_action(state)
-            action = ou_noise.get_action(action, i_step) # the random process from the paper
+        for i_step in range(1, cfg.train_steps+1):
+            action = agent.select_action(state)
+            action = ou_noise.get_action(
+                action, i_step)  # the random process from the paper
             next_state, reward, done, _ = env.step(action)
             ep_reward += reward
             agent.memory.push(state, action, reward, next_state, done)
@@ -69,7 +82,8 @@ def train():
             state = next_state
             if done:
                 break
-        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),'n_steps:', i_step)
+        print('Episode:', i_episode, ' Reward: %i' %
+              int(ep_reward), 'n_steps:', i_step)
         ep_steps.append(i_step)
         rewards.append(ep_reward)
         if i_episode == 1:
@@ -77,54 +91,43 @@ def train():
         else:
             moving_average_rewards.append(
                 0.9*moving_average_rewards[-1]+0.1*ep_reward)
-    print('Complete!')
-    # save the model
-    import os
-    import numpy as np
-    save_path = os.path.dirname(__file__)+"/saved_model/"
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
-    agent.save_model(save_path+'checkpoint.pth')
-    # save rewards and related results
-    output_path = os.path.dirname(__file__)+"/result/"
-    # check whether the folder exists
-    if not os.path.exists(output_path):
-        os.mkdir(output_path)
-    np.save(output_path+"rewards.npy", rewards)
-    np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
-    np.save(output_path+"steps.npy", ep_steps)
-    plot(rewards)
-    plot(moving_average_rewards,ylabel="moving_average_rewards")
-    plot(ep_steps, ylabel="steps_of_each_episode")
-
-def eval():
-    cfg = get_args()
-    env = NormalizedActions(gym.make("Pendulum-v0"))
-
-    # add action noise
-    ou_noise = OUNoise(env.action_space)
-
-    n_states = env.observation_space.shape[0]
-    n_actions = env.action_space.shape[0]
-    agent=DDPG(n_states,n_actions, critic_lr=1e-3,
-                actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    print('Complete training!')
+    '''save the model'''
+    if not os.path.exists(SAVED_MODEL_PATH):  # create the folder if it does not exist
+        os.mkdir(SAVED_MODEL_PATH)
+    agent.save_model(SAVED_MODEL_PATH+'checkpoint.pth')
+    '''save rewards and related results'''
+    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
+        os.mkdir(RESULT_PATH)
+    np.save(RESULT_PATH+'rewards_train.npy', rewards)
+    np.save(RESULT_PATH+'moving_average_rewards_train.npy',
+            moving_average_rewards)
+    np.save(RESULT_PATH+'steps_train.npy', ep_steps)
 
-    import os
-    save_path = os.path.dirname(__file__)+"/saved_model/"
-    if not os.path.exists(save_path):
-        os.mkdir(save_path)
-    agent.load_model(save_path+'checkpoint.pth')
+def eval(cfg, saved_model_path = SAVED_MODEL_PATH):
+    print('Start to eval!\n')
+    env = NormalizedActions(gym.make("Pendulum-v0"))
+    n_states = env.observation_space.shape[0]
+    n_actions = env.action_space.shape[0]
+    agent = DDPG(n_states, n_actions, critic_lr=1e-3,
+                 actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
+    agent.load_model(saved_model_path+'checkpoint.pth')
     rewards = []
     moving_average_rewards = []
     ep_steps = []
+    log_dir=os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
     for i_episode in range(1, cfg.eval_eps+1):
-        state = env.reset() # reset the environment state
+        state = env.reset()  # reset the environment state
         ep_reward = 0
         for i_step in range(1, cfg.eval_steps+1):
-            action = agent.select_action(state) # select an action for the current state
-            next_state, reward, done, _ = env.step(action) # step the environment
+            action = agent.select_action(state)  # select an action for the current state
+            next_state, reward, done, _ = env.step(action)  # step the environment
             ep_reward += reward
-            state = next_state # move to the next state
+            state = next_state  # move to the next state
             if done:
                 break
         print('Episode:', i_episode, ' Reward: %i' %
@@ -137,11 +140,22 @@ def eval():
         else:
             moving_average_rewards.append(
                 0.9*moving_average_rewards[-1]+0.1*ep_reward)
-    plot(rewards,save_fig=False)
-    plot(moving_average_rewards, ylabel="moving_average_rewards",save_fig=False)
-    plot(ep_steps, ylabel="steps_of_each_episode",save_fig=False)
-
+        writer.add_scalars('rewards',{'raw':rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+        writer.add_scalar('steps_of_each_episode',
+                          ep_steps[-1], i_episode)
+    writer.close()
+    '''save rewards and related results'''
+    if not os.path.exists(RESULT_PATH):  # create the folder if it does not exist
+        os.mkdir(RESULT_PATH)
+    np.save(RESULT_PATH+'rewards_eval.npy', rewards)
+    np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
+    np.save(RESULT_PATH+'steps_eval.npy', ep_steps)
 if __name__ == "__main__":
-    # train()
-    eval()
\ No newline at end of file
+    cfg = get_args()
+    if cfg.train:
+        train(cfg)
+        eval(cfg)
+    else:
+        model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
+        eval(cfg,saved_model_path=model_path)
diff --git a/codes/ddpg/plot.py b/codes/ddpg/plot.py
index 6e9e145..f25efb0 100644
--- a/codes/ddpg/plot.py
+++ b/codes/ddpg/plot.py
@@ -5,17 +5,16 @@
 @Email: johnjim0816@gmail.com
 @Date: 2020-06-11 16:30:09
 @LastEditor: John
-LastEditTime: 2020-09-02 01:20:03
+LastEditTime: 2020-10-15 21:32:05
 @Discription: 
 @Environment: python 3.7.7
 '''
 import matplotlib.pyplot as plt
-import pandas as pd
-import seaborn as sns;
+import seaborn as sns
 import numpy as np
 import os
 
-def plot(item,ylabel='rewards',save_fig = True):
+def plot_results(item,ylabel='rewards_train', save_fig = True):
     '''plot using seaborn
     '''
     sns.set()
@@ -24,25 +23,24 @@
     plt.title(ylabel+' of DDPG')
     plt.ylabel(ylabel)
     plt.xlabel('episodes')
-    plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
+    if save_fig:
+        plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
     plt.show()
-# def plot(item,ylabel='rewards'):
-#
-#     df = pd.DataFrame(dict(time=np.arange(len(item)),value=item))
-#     g = sns.relplot(x="time", y="value", kind="line", data=df)
-#     # g.fig.autofmt_xdate()
-#     # sns.lineplot(time=time, data=item, color="r", condition="behavior_cloning")
-#     # # sns.tsplot(time=time, data=x2, color="b", condition="dagger")
-#     # plt.ylabel("Reward")
-#     # plt.xlabel("Iteration Number")
-#     # plt.title("Imitation Learning")
-
-    # plt.show()
 if __name__ == "__main__":
-    output_path = os.path.dirname(__file__)+"/result/"
-    rewards=np.load(output_path+"rewards.npy", )
-    moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
-    plot(rewards)
-    plot(moving_average_rewards,ylabel='moving_average_rewards')
+    output_path = os.path.split(os.path.abspath(__file__))[0]+"/result/"
+    tag = 'train'
+    rewards=np.load(output_path+"rewards_"+tag+".npy", )
+    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
+    steps=np.load(output_path+"steps_"+tag+".npy")
+    plot_results(rewards)
+    plot_results(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
+    plot_results(steps,ylabel='steps_'+tag)
+    tag = 'eval'
+    rewards=np.load(output_path+"rewards_"+tag+".npy", )
+    moving_average_rewards=np.load(output_path+"moving_average_rewards_"+tag+".npy",)
+    steps=np.load(output_path+"steps_"+tag+".npy")
+    plot_results(rewards,ylabel='rewards_'+tag)
+    plot_results(moving_average_rewards,ylabel='moving_average_rewards_'+tag)
+    plot_results(steps,ylabel='steps_'+tag)
diff --git a/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy b/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy
new file mode 100644
index 0000000..892177b
Binary files /dev/null and b/codes/ddpg/result/20201015-193308/moving_average_rewards_eval.npy differ
diff --git a/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy b/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy
new file mode 100644
index 0000000..baae56c
Binary files /dev/null and b/codes/ddpg/result/20201015-193308/moving_average_rewards_train.npy differ
diff --git a/codes/ddpg/result/20201015-193308/rewards_eval copy.npy b/codes/ddpg/result/20201015-193308/rewards_eval copy.npy
new file mode 100644
index 0000000..22c1d74
Binary files /dev/null and b/codes/ddpg/result/20201015-193308/rewards_eval copy.npy differ
diff --git a/codes/ddpg/result/20201015-193308/rewards_eval.npy b/codes/ddpg/result/20201015-193308/rewards_eval.npy
new file mode 100644
index 0000000..22c1d74
Binary files /dev/null and b/codes/ddpg/result/20201015-193308/rewards_eval.npy differ
diff --git a/codes/ddpg/result/20201015-193308/rewards_train.npy b/codes/ddpg/result/20201015-193308/rewards_train.npy
new file mode 100644
index 0000000..2f86826
Binary files /dev/null and b/codes/ddpg/result/20201015-193308/rewards_train.npy differ
diff --git a/codes/ddpg/result/steps.npy b/codes/ddpg/result/20201015-193308/steps_train.npy
similarity index 100%
rename from codes/ddpg/result/steps.npy
rename to codes/ddpg/result/20201015-193308/steps_train.npy
diff --git a/codes/ddpg/result/moving_average_rewards.npy b/codes/ddpg/result/moving_average_rewards.npy
deleted file mode 100644
index 5055900..0000000
Binary files a/codes/ddpg/result/moving_average_rewards.npy and /dev/null differ
diff --git a/codes/ddpg/result/moving_average_rewards.png b/codes/ddpg/result/moving_average_rewards.png
deleted file mode 100644
index 9725858..0000000
Binary files a/codes/ddpg/result/moving_average_rewards.png and /dev/null differ
diff --git a/codes/ddpg/result/moving_average_rewards_eval.npy b/codes/ddpg/result/moving_average_rewards_eval.npy
new file mode 100644
index 0000000..892177b
Binary files /dev/null and b/codes/ddpg/result/moving_average_rewards_eval.npy differ
diff --git a/codes/ddpg/result/moving_average_rewards_eval.png b/codes/ddpg/result/moving_average_rewards_eval.png
new file mode 100644
index 0000000..3e9c92f
Binary files /dev/null and b/codes/ddpg/result/moving_average_rewards_eval.png differ
diff --git a/codes/ddpg/result/moving_average_rewards_train.npy b/codes/ddpg/result/moving_average_rewards_train.npy
new file mode 100644
index 0000000..baae56c
Binary files /dev/null and b/codes/ddpg/result/moving_average_rewards_train.npy differ
diff --git a/codes/ddpg/result/moving_average_rewards_train.png b/codes/ddpg/result/moving_average_rewards_train.png
new file mode 100644
index 0000000..666e14d
Binary files /dev/null and b/codes/ddpg/result/moving_average_rewards_train.png differ
diff --git a/codes/ddpg/result/rewards.npy b/codes/ddpg/result/rewards.npy
deleted file mode 100644
index fcb4f3e..0000000
Binary files a/codes/ddpg/result/rewards.npy and /dev/null differ
diff --git a/codes/ddpg/result/rewards.png b/codes/ddpg/result/rewards.png
deleted file mode 100644
index a39c983..0000000
Binary files a/codes/ddpg/result/rewards.png and /dev/null differ
diff --git a/codes/ddpg/result/rewards_eval.npy b/codes/ddpg/result/rewards_eval.npy
new file mode 100644
index 0000000..22c1d74
Binary files /dev/null and b/codes/ddpg/result/rewards_eval.npy differ
diff --git a/codes/ddpg/result/rewards_eval.png b/codes/ddpg/result/rewards_eval.png
new file mode 100644
index 0000000..f7b3c04
Binary files /dev/null and b/codes/ddpg/result/rewards_eval.png differ
diff --git a/codes/ddpg/result/rewards_train.npy b/codes/ddpg/result/rewards_train.npy
new file mode 100644
index 0000000..2f86826
Binary files /dev/null and b/codes/ddpg/result/rewards_train.npy differ
diff --git a/codes/ddpg/result/rewards_train.png b/codes/ddpg/result/rewards_train.png
new file mode 100644
index 0000000..ee4862f
Binary files /dev/null and b/codes/ddpg/result/rewards_train.png differ
diff --git a/codes/ddpg/result/steps_eval.npy b/codes/ddpg/result/steps_eval.npy
new file mode 100644
index 0000000..59825bb
Binary files /dev/null and b/codes/ddpg/result/steps_eval.npy differ
diff --git a/codes/ddpg/result/steps_eval.png b/codes/ddpg/result/steps_eval.png
new file mode 100644
index 0000000..d6d77d7
Binary files /dev/null and b/codes/ddpg/result/steps_eval.png differ
diff --git a/codes/ddpg/result/steps_of_each_episode.png b/codes/ddpg/result/steps_of_each_episode.png
deleted file mode 100644
index 9b9e58b..0000000
Binary files a/codes/ddpg/result/steps_of_each_episode.png and /dev/null differ
diff --git a/codes/ddpg/result/steps_train.npy b/codes/ddpg/result/steps_train.npy
new file mode 100644
index 0000000..59825bb
Binary files /dev/null and b/codes/ddpg/result/steps_train.npy differ
diff --git a/codes/ddpg/result/steps_train.png b/codes/ddpg/result/steps_train.png
new file mode 100644
index 0000000..c6a9675
Binary files /dev/null and b/codes/ddpg/result/steps_train.png differ
diff --git a/codes/ddpg/saved_model/20201015-193308/checkpoint.pth b/codes/ddpg/saved_model/20201015-193308/checkpoint.pth
new file mode 100644
index 0000000..e07405b
Binary files /dev/null and b/codes/ddpg/saved_model/20201015-193308/checkpoint.pth differ
diff --git a/codes/ddpg/saved_model/checkpoint.pth b/codes/ddpg/saved_model/checkpoint.pth
index d30efae..b39ee07 100644
Binary files a/codes/ddpg/saved_model/checkpoint.pth and b/codes/ddpg/saved_model/checkpoint.pth differ
diff --git a/codes/ddpg/utils.py b/codes/ddpg/utils.py
new file mode 100644
index 0000000..f4cde57
--- /dev/null
+++ b/codes/ddpg/utils.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-10-15 21:31:19
+LastEditor: John
+LastEditTime: 2020-10-15 21:31:25
+Discription: 
+Environment: 
+'''
+import os
+import datetime
+import numpy as np
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")  # run timestamp, same convention as main.py
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
+
+def save_results(rewards, moving_average_rewards, ep_steps, path=RESULT_PATH):
+    '''Save rewards, moving-average rewards and per-episode steps as .npy files under path.'''
+    if not os.path.exists(path):  # create the folder if it does not exist
+        os.mkdir(path)
+    np.save(path+'rewards_train.npy', rewards)
+    np.save(path+'moving_average_rewards_train.npy', moving_average_rewards)
+    np.save(path+'steps_train.npy', ep_steps)
\ No newline at end of file
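
Note on the new helper: `utils.save_results` mirrors the saving logic that is still written out inline in `train()` and `eval()` in main.py. A minimal sketch of how main.py could call it instead is shown below; this wiring is illustrative only and not part of the commit, and it assumes the corrected `save_results` signature above together with the `rewards`, `moving_average_rewards`, `ep_steps` lists and `RESULT_PATH` constant already defined in main.py.

```python
# Hypothetical usage inside main.py's train(), after the training loop
# has filled rewards, moving_average_rewards and ep_steps (not part of this commit):
from utils import save_results

save_results(rewards, moving_average_rewards, ep_steps, path=RESULT_PATH)  # RESULT_PATH as defined in main.py
```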