diff --git a/codes/A2C/.vscode/settings.json b/codes/A2C/.vscode/settings.json
new file mode 100644
index 0000000..be0f1ab
--- /dev/null
+++ b/codes/A2C/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/Users/jj/anaconda3/envs/py37/bin/python"
+}
\ No newline at end of file
diff --git a/codes/A2C/README.md b/codes/A2C/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/codes/A2C/agent.py b/codes/A2C/agent.py
new file mode 100644
index 0000000..a5a2fee
--- /dev/null
+++ b/codes/A2C/agent.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-03 20:47:09
+LastEditor: John
+LastEditTime: 2020-11-08 22:16:29
+Description:
+Environment:
+'''
+from model import ActorCritic
+import torch.optim as optim
+
+class A2C:
+    def __init__(self, n_states, n_actions, hidden_dim=256, device="cpu", lr=3e-4):
+        self.device = device
+        self.gamma = 0.99
+        self.model = ActorCritic(n_states, n_actions, hidden_dim=hidden_dim).to(device)
+        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
+    def choose_action(self, state):
+        dist, value = self.model(state)
+        action = dist.sample()
+        return action
+    def compute_returns(self, next_value, rewards, masks):
+        # discounted n-step returns, bootstrapped from the value of the last state
+        R = next_value
+        returns = []
+        for step in reversed(range(len(rewards))):
+            R = rewards[step] + self.gamma * R * masks[step]
+            returns.insert(0, R)
+        return returns
+    def update(self):
+        pass
\ No newline at end of file
diff --git a/codes/A2C/common/__init__.py b/codes/A2C/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/codes/A2C/common/multiprocessing_env.py b/codes/A2C/common/multiprocessing_env.py
new file mode 100644
index 0000000..46bbc08
--- /dev/null
+++ b/codes/A2C/common/multiprocessing_env.py
@@ -0,0 +1,153 @@
+# This code is from OpenAI Baselines:
+# https://github.com/openai/baselines/tree/master/baselines/common/vec_env
+
+import numpy as np
+from multiprocessing import Process, Pipe
+
+def worker(remote, parent_remote, env_fn_wrapper):
+    parent_remote.close()
+    env = env_fn_wrapper.x()
+    while True:
+        cmd, data = remote.recv()
+        if cmd == 'step':
+            ob, reward, done, info = env.step(data)
+            if done:
+                ob = env.reset()
+            remote.send((ob, reward, done, info))
+        elif cmd == 'reset':
+            ob = env.reset()
+            remote.send(ob)
+        elif cmd == 'reset_task':
+            ob = env.reset_task()
+            remote.send(ob)
+        elif cmd == 'close':
+            remote.close()
+            break
+        elif cmd == 'get_spaces':
+            remote.send((env.observation_space, env.action_space))
+        else:
+            raise NotImplementedError
+
+class VecEnv(object):
+    """
+    An abstract asynchronous, vectorized environment.
+    """
+    def __init__(self, num_envs, observation_space, action_space):
+        self.num_envs = num_envs
+        self.observation_space = observation_space
+        self.action_space = action_space
+
+    def reset(self):
+        """
+        Reset all the environments and return an array of
+        observations, or a tuple of observation arrays.
+        If step_async is still doing work, that work will
+        be cancelled and step_wait() should not be called
+        until step_async() is invoked again.
+        """
+        pass
+
+    def step_async(self, actions):
+        """
+        Tell all the environments to start taking a step
+        with the given actions.
+        Call step_wait() to get the results of the step.
+        You should not call this if a step_async run is
+        already pending.
+        """
+        pass
+
+    def step_wait(self):
+        """
+        Wait for the step taken with step_async().
+        Returns (obs, rews, dones, infos):
+         - obs: an array of observations, or a tuple of
+                arrays of observations.
+         - rews: an array of rewards
+         - dones: an array of "episode done" booleans
+         - infos: a sequence of info objects
+        """
+        pass
+
+    def close(self):
+        """
+        Clean up the environments' resources.
+        """
+        pass
+
+    def step(self, actions):
+        self.step_async(actions)
+        return self.step_wait()
+
+
+class CloudpickleWrapper(object):
+    """
+    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
+    """
+    def __init__(self, x):
+        self.x = x
+    def __getstate__(self):
+        import cloudpickle
+        return cloudpickle.dumps(self.x)
+    def __setstate__(self, ob):
+        import pickle
+        self.x = pickle.loads(ob)
+
+
+class SubprocVecEnv(VecEnv):
+    def __init__(self, env_fns, spaces=None):
+        """
+        envs: list of gym environments to run in subprocesses
+        """
+        self.waiting = False
+        self.closed = False
+        nenvs = len(env_fns)
+        self.nenvs = nenvs
+        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
+        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
+                   for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
+        for p in self.ps:
+            p.daemon = True  # if the main process crashes, we should not cause things to hang
+            p.start()
+        for remote in self.work_remotes:
+            remote.close()
+
+        self.remotes[0].send(('get_spaces', None))
+        observation_space, action_space = self.remotes[0].recv()
+        VecEnv.__init__(self, len(env_fns), observation_space, action_space)
+
+    def step_async(self, actions):
+        for remote, action in zip(self.remotes, actions):
+            remote.send(('step', action))
+        self.waiting = True
+
+    def step_wait(self):
+        results = [remote.recv() for remote in self.remotes]
+        self.waiting = False
+        obs, rews, dones, infos = zip(*results)
+        return np.stack(obs), np.stack(rews), np.stack(dones), infos
+
+    def reset(self):
+        for remote in self.remotes:
+            remote.send(('reset', None))
+        return np.stack([remote.recv() for remote in self.remotes])
+
+    def reset_task(self):
+        for remote in self.remotes:
+            remote.send(('reset_task', None))
+        return np.stack([remote.recv() for remote in self.remotes])
+
+    def close(self):
+        if self.closed:
+            return
+        if self.waiting:
+            for remote in self.remotes:
+                remote.recv()
+        for remote in self.remotes:
+            remote.send(('close', None))
+        for p in self.ps:
+            p.join()
+        self.closed = True
+
+    def __len__(self):
+        return self.nenvs
diff --git a/codes/A2C/env.py b/codes/A2C/env.py
new file mode 100644
index 0000000..fda92d8
--- /dev/null
+++ b/codes/A2C/env.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-10-30 15:39:37
+LastEditor: John
+LastEditTime: 2020-11-03 20:52:07
+Description:
+Environment:
+'''
+
+import gym
+from common.multiprocessing_env import SubprocVecEnv
+
+# num_envs = 16
+# env_name = "Pendulum-v0"
+
+def make_envs(num_envs=16, env_name="Pendulum-v0"):
+    ''' Create num_envs copies of env_name running in subprocesses, wrapped as a vectorized env.
+    '''
+    def make_env():
+        def _thunk():
+            env = gym.make(env_name)
+            return env
+        return _thunk
+
+    envs = [make_env() for i in range(num_envs)]
+    envs = SubprocVecEnv(envs)
+    return envs
+
+# if __name__ == "__main__":
+#     num_envs = 16
+#     env_name = "CartPole-v0"
+#     def make_env():
+#         def _thunk():
+#             env = gym.make(env_name)
+#             return env
+#         return _thunk
+#     envs = [make_env() for i in range(num_envs)]
+#     envs = SubprocVecEnv(envs)
+
+if __name__ == "__main__":
+    envs = make_envs(num_envs=16, env_name="CartPole-v0")
\ No newline at end of file
diff --git a/codes/A2C/main.py b/codes/A2C/main.py
new file mode 100644
index 0000000..65d8a32
--- /dev/null
+++ b/codes/A2C/main.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+@Author: John
+@Email: johnjim0816@gmail.com
+@Date: 2020-06-11 20:58:21
+@LastEditor: John
+LastEditTime: 2020-11-08 22:19:56
+@Description:
+@Environment: python 3.7.9
+'''
+import torch
+import gym
+import os
+import numpy as np
+import argparse
+from torch.utils.tensorboard import SummaryWriter
+
+from agent import A2C
+from env import make_envs
+from utils import SEQUENCE, SAVED_MODEL_PATH, RESULT_PATH
+from utils import save_model, save_results
+
+def get_args():
+    '''Once the model is set up, hyperparameters only need to be tuned here.
+    '''
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train", default=1, type=int)  # 1 means train, 0 means evaluate only
+    parser.add_argument("--gamma", default=0.99,
+                        type=float)  # reward discount factor
+    parser.add_argument("--lr", default=3e-4, type=float)  # learning rate
+    parser.add_argument("--actor_lr", default=1e-4, type=float)
+    parser.add_argument("--memory_capacity", default=10000,
+                        type=int, help="capacity of Replay Memory")
+    parser.add_argument("--batch_size", default=128, type=int,
+                        help="batch size of memory sampling")
+    parser.add_argument("--train_eps", default=4000, type=int)
+    parser.add_argument("--train_steps", default=5, type=int)
+    parser.add_argument("--eval_eps", default=200, type=int)  # maximum number of evaluation episodes
+    parser.add_argument("--eval_steps", default=200,
+                        type=int)  # maximum number of steps per evaluation episode
+    parser.add_argument("--target_update", default=4, type=int,
+                        help="how often (in episodes) to update the target net")
+    config = parser.parse_args()
+    return config
+
+def test_env(agent, device='cpu'):
+    env = gym.make("CartPole-v0")
+    state = env.reset()
+    ep_reward = 0
+    for _ in range(200):
+        state = torch.FloatTensor(state).unsqueeze(0).to(device)
+        dist, value = agent.model(state)
+        action = dist.sample()
+        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
+        state = next_state
+        ep_reward += reward
+        if done:
+            break
+    return ep_reward
+
+
+def train(cfg):
+    print('Start to train!\n')
+    envs = make_envs(num_envs=16, env_name="CartPole-v0")
+    n_states = envs.observation_space.shape[0]
+    n_actions = envs.action_space.n
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    agent = A2C(n_states, n_actions, hidden_dim=256, device=device, lr=cfg.lr)
+    # moving_average_rewards = []
+    # ep_steps = []
+    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
+    writer = SummaryWriter(log_dir)
+    state = envs.reset()
+    for i_episode in range(1, cfg.train_eps+1):
+        log_probs = []
+        values = []
+        rewards = []
+        masks = []
+        entropy = 0
+        # collect a rollout of cfg.train_steps transitions from all parallel envs
+        for i_step in range(1, cfg.train_steps+1):
+            state = torch.FloatTensor(state).to(device)
+            dist, value = agent.model(state)
+            action = dist.sample()
+            next_state, reward, done, _ = envs.step(action.cpu().numpy())
+            state = next_state
+            log_prob = dist.log_prob(action).unsqueeze(1)
+            entropy += dist.entropy().mean()
+            log_probs.append(log_prob)
+            values.append(value)
+            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
+            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
+        if i_episode % 20 == 0:
+            print("reward", test_env(agent, device=device))
+        # bootstrap from the value of the last state and compute n-step returns
+        next_state = torch.FloatTensor(next_state).to(device)
+        _, next_value = agent.model(next_state)
+        returns = agent.compute_returns(next_value, rewards, masks)
+
+        log_probs = torch.cat(log_probs)
+        returns = torch.cat(returns).detach()
+        values = torch.cat(values)
+        advantage = returns - values
+        actor_loss = -(log_probs * advantage.detach()).mean()
+        critic_loss = advantage.pow(2).mean()
+        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
+
+        agent.optimizer.zero_grad()
+        loss.backward()
+        agent.optimizer.step()
+    # evaluate the trained policy on single-env episodes
+    for _ in range(100):
+        print("test_reward", test_env(agent, device=device))
+
+    # print('Episode:', i_episode, ' Reward: %i' %
+    #       int(ep_reward[0]), 'n_steps:', i_step)
+    # ep_steps.append(i_step)
+    # rewards.append(ep_reward)
+    # if i_episode == 1:
+    #     moving_average_rewards.append(ep_reward[0])
+    # else:
+    #     moving_average_rewards.append(
+    #         0.9*moving_average_rewards[-1]+0.1*ep_reward[0])
+    # writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+    # writer.add_scalar('steps_of_each_episode',
+    #                   ep_steps[-1], i_episode)
+    writer.close()
+    print('Complete training!')
+    ''' save the model '''
+    # save_model(agent, model_path=SAVED_MODEL_PATH)
+    # ''' save rewards and related results '''
+    # save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path=RESULT_PATH)
+
+# def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
+#     print('start to eval!\n')
+#     env = NormalizedActions(gym.make("Pendulum-v0"))
+#     n_states = env.observation_space.shape[0]
+#     n_actions = env.action_space.shape[0]
+#     agent = DDPG(n_states, n_actions, critic_lr=1e-3,
+#                  actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
+#     agent.load_model(saved_model_path+'checkpoint.pth')
+#     rewards = []
+#     moving_average_rewards = []
+#     ep_steps = []
+#     log_dir = os.path.split(os.path.abspath(__file__))[0]+"/logs/eval/" + SEQUENCE
+#     writer = SummaryWriter(log_dir)
+#     for i_episode in range(1, cfg.eval_eps+1):
+#         state = env.reset()  # reset the environment
+#         ep_reward = 0
+#         for i_step in range(1, cfg.eval_steps+1):
+#             action = agent.choose_action(state)  # choose an action given the current state
+#             next_state, reward, done, _ = env.step(action)  # step the environment
+#             ep_reward += reward
+#             state = next_state  # move to the next state
+#             if done:
+#                 break
+#         print('Episode:', i_episode, ' Reward: %i' %
+#               int(ep_reward), 'n_steps:', i_step, 'done: ', done)
+#         ep_steps.append(i_step)
+#         rewards.append(ep_reward)
+#         # compute the moving-average reward
+#         if i_episode == 1:
+#             moving_average_rewards.append(ep_reward)
+#         else:
+#             moving_average_rewards.append(
+#                 0.9*moving_average_rewards[-1]+0.1*ep_reward)
+#         writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
+#         writer.add_scalar('steps_of_each_episode',
+#                           ep_steps[-1], i_episode)
+#     writer.close()
+#     ''' save rewards and related results '''
+#     if not os.path.exists(RESULT_PATH):  # create the result folder if it does not exist
+#         os.mkdir(RESULT_PATH)
+#     np.save(RESULT_PATH+'rewards_eval.npy', rewards)
+#     np.save(RESULT_PATH+'moving_average_rewards_eval.npy', moving_average_rewards)
+#     np.save(RESULT_PATH+'steps_eval.npy', ep_steps)
+
+if __name__ == "__main__":
+    cfg = get_args()
+    train(cfg)
+
+    # cfg = get_args()
+    # if cfg.train:
+    #     train(cfg)
+    #     eval(cfg)
+    # else:
+    #     model_path = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"
+    #     eval(cfg, saved_model_path=model_path)
diff --git a/codes/A2C/model.py b/codes/A2C/model.py
new file mode 100644
index 0000000..0d68901
--- /dev/null
+++ b/codes/A2C/model.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-11-03 20:45:25
+LastEditor: John
+LastEditTime: 2020-11-07 18:49:09
+Description:
+Environment:
+'''
+import torch.nn as nn
+from torch.distributions import Categorical
+
+class ActorCritic(nn.Module):
+    def __init__(self, n_states, n_actions, hidden_dim=256, std=0.0):
+        super(ActorCritic, self).__init__()
+        self.critic = nn.Sequential(
+            nn.Linear(n_states, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, 1)
+        )
+
+        self.actor = nn.Sequential(
+            nn.Linear(n_states, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, n_actions),
+            nn.Softmax(dim=1),
+        )
+
+    def forward(self, x):
+        value = self.critic(x)
+        probs = self.actor(x)
+        dist = Categorical(probs)
+        return dist, value
\ No newline at end of file
diff --git a/codes/A2C/utils.py b/codes/A2C/utils.py
new file mode 100644
index 0000000..ce89c7c
--- /dev/null
+++ b/codes/A2C/utils.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-10-15 21:31:19
+LastEditor: John
+LastEditTime: 2020-11-03 17:05:48
+Description:
+Environment:
+'''
+import os
+import numpy as np
+import datetime
+
+SEQUENCE = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+SAVED_MODEL_PATH = os.path.split(os.path.abspath(__file__))[0]+"/saved_model/"+SEQUENCE+'/'
+RESULT_PATH = os.path.split(os.path.abspath(__file__))[0]+"/result/"+SEQUENCE+'/'
+
+
+def save_results(rewards, moving_average_rewards, ep_steps, path=RESULT_PATH):
+    if not os.path.exists(path):  # create the result folder if it does not exist
+        os.makedirs(path)
+    np.save(path+'rewards_train.npy', rewards)
+    np.save(path+'moving_average_rewards_train.npy', moving_average_rewards)
+    np.save(path+'steps_train.npy', ep_steps)
+
+def save_model(agent, model_path='./saved_model/'):
+    if not os.path.exists(model_path):  # create the model folder if it does not exist
+        os.makedirs(model_path)
+    agent.save_model(model_path+'checkpoint.pth')
+    print('model saved!')
\ No newline at end of file
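
Editor's note: A2C.update() in agent.py is committed as a stub (pass); the actual actor-critic update lives inline in train() in main.py. As a minimal sketch (not part of the committed code), assuming the same tensors collected in the training loop (log_probs, values, returns, entropy), that inline logic could be folded into the agent like this:

    def update(self, log_probs, values, returns, entropy):
        # advantage = n-step return minus the critic's value estimate
        advantage = returns - values
        # policy gradient weighted by the (detached) advantage
        actor_loss = -(log_probs * advantage.detach()).mean()
        # squared-error loss for the critic
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

This mirrors the loss already used in main.py (actor loss plus 0.5 * critic loss minus 0.001 * entropy bonus); the method signature is an assumption made for illustration.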
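Editor's note: utils.save_model() calls agent.save_model(), but the A2C class in this diff does not define such a method, so the (currently commented-out) saving path would fail. A hedged sketch of matching save/load helpers on A2C, assuming the usual torch state_dict approach (names chosen only to match the existing calls in utils.py and the commented eval() code), is:

    import torch

    def save_model(self, path):
        # persist only the network weights; optimizer state could be saved the same way
        torch.save(self.model.state_dict(), path)

    def load_model(self, path):
        # restore weights onto whichever device the agent was created with
        self.model.load_state_dict(torch.load(path, map_location=self.device))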