Files
easy-rl/codes/PolicyGradient/task0.py
2022-05-31 01:20:58 +08:00

153 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-11-22 23:21:53
LastEditor: John
LastEditTime: 2022-02-10 06:13:21
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前文件所在绝对路径
parent_path = os.path.dirname(curr_path) # 父路径
sys.path.append(parent_path) # 添加路径到系统路径
import gym
import torch
import datetime
from itertools import count
from pg import PolicyGradient
from common.utils import save_results, make_dir
from common.utils import plot_rewards
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
class Config:
'''超参数
'''
def __init__(self):
################################## 环境超参数 ###################################
self.algo_name = "PolicyGradient" # 算法名称
self.env_name = 'CartPole-v0' # 环境名称
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu") # 检测GPUgjgjlkhfsf风刀霜的撒发十
self.seed = 10 # 随机种子置0则不设置随机种子
self.train_eps = 300 # 训练的回合数
self.test_eps = 30 # 测试的回合数
################################################################################
################################## 算法超参数 ###################################
self.batch_size = 8 # mini-batch SGD中的批量大小
self.lr = 0.01 # 学习率
self.gamma = 0.99 # 强化学习中的折扣因子
self.hidden_dim = 36 # 网络隐藏层
################################################################################
################################# 保存结果相关参数 ################################
self.result_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/results/' # 保存结果的路径
self.model_path = curr_path + "/outputs/" + self.env_name + \
'/' + curr_time + '/models/' # 保存模型的路径
self.save = True # 是否保存图片
################################################################################
def env_agent_config(cfg,seed=1):
env = gym.make(cfg.env_name)
env.seed(seed)
n_states = env.observation_space.shape[0]
agent = PolicyGradient(n_states,cfg)
return env,agent
def train(cfg,env,agent):
print('开始训练!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
state_pool = [] # 存放每batch_size个episode的state序列
action_pool = []
reward_pool = []
rewards = []
ma_rewards = []
for i_ep in range(cfg.train_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state_pool.append(state)
action_pool.append(float(action))
reward_pool.append(reward)
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
if i_ep > 0 and i_ep % cfg.batch_size == 0:
agent.update(reward_pool,state_pool,action_pool)
state_pool = [] # 每个episode的state
action_pool = []
reward_pool = []
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成训练!')
env.close()
return rewards, ma_rewards
def test(cfg,env,agent):
print('开始测试!')
print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
rewards = []
ma_rewards = []
for i_ep in range(cfg.test_eps):
state = env.reset()
ep_reward = 0
for _ in count():
action = agent.choose_action(state) # 根据当前环境state选择action
next_state, reward, done, _ = env.step(action)
ep_reward += reward
if done:
reward = 0
state = next_state
if done:
print('回合:{}/{}, 奖励:{}'.format(i_ep + 1, cfg.train_eps, ep_reward))
break
rewards.append(ep_reward)
if ma_rewards:
ma_rewards.append(
0.9*ma_rewards[-1]+0.1*ep_reward)
else:
ma_rewards.append(ep_reward)
print('完成测试!')
env.close()
return rewards, ma_rewards
if __name__ == "__main__":
cfg = Config()
# 训练
env, agent = env_agent_config(cfg)
rewards, ma_rewards = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path) # 创建保存结果和模型路径的文件夹
agent.save(path=cfg.model_path) # 保存模型
save_results(rewards, ma_rewards, tag='train',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="train") # 画出结果
# 测试
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # 导入模型
rewards, ma_rewards = test(cfg, env, agent)
save_results(rewards, ma_rewards, tag='test',
path=cfg.result_path) # 保存结果
plot_rewards(rewards, ma_rewards, cfg, tag="test") # 画出结果