update rainbowdqn

2022-05-31 01:20:58 +08:00
parent cfc0f6492e
commit c7c94468c9
149 changed files with 1866 additions and 1549 deletions
--- a/codes/QLearning/task0.py
+++ b/codes/QLearning/task0.py
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-12-22 11:13:23
+LastEditTime: 2022-02-10 00:54:02
 Discription: 
 Environment: 
 '''
@@ -19,42 +19,93 @@ import gym
 import torch
 import datetime

-from envs.gridworld_env import CliffWalkingWapper
-from QLearning.agent import QLearning
-from QLearning.train import train,test
-from common.utils import plot_rewards,plot_rewards_cn
+from env.gridworld_env import CliffWalkingWapper
+from qlearning import QLearning
+from common.utils import plot_rewards
 from common.utils import save_results,make_dir

 curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-algo_name = 'Q-learning'  # 算法名称
-env_name = 'CliffWalking-v0'  # 环境名称
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
-class QlearningConfig:
-    '''训练相关参数'''
-    def __init__(self):
-        self.algo_name = algo_name # 算法名称
-        self.env_name = env_name # 环境名称
-        self.device = device # 检测GPU
-        self.train_eps = 400 # 训练的回合数
-        self.test_eps = 30 # 测试的回合数
-        self.gamma = 0.9 # reward的衰减率
-        self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
-        self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
-        self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
-        self.lr = 0.1 # 学习率      
-class PlotConfig:
-    ''' 绘图相关参数设置
+class Config:
+    '''超参数
    '''

-    def __init__(self) -> None:
-        self.algo_name = algo_name  # 算法名称
-        self.env_name = env_name  # 环境名称
-        self.device = device # 检测GPU
+    def __init__(self):
+        ################################## 环境超参数 ###################################
+        self.algo_name = 'Q-learning'  # 算法名称
+        self.env_name = 'CliffWalking-v0'  # 环境名称
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when CUDA is available, otherwise the CPU
+        self.seed = 10 # 随机种子，置0则不设置随机种子
+        self.train_eps = 400  # 训练的回合数
+        self.test_eps = 30  # 测试的回合数
+        ################################################################################
+        
+        ################################## 算法超参数 ###################################
+        self.gamma = 0.90  # 强化学习中的折扣因子
+        self.epsilon_start = 0.95  # e-greedy策略中初始epsilon
+        self.epsilon_end = 0.01  # e-greedy策略中的终止epsilon
+        self.epsilon_decay = 300  # e-greedy策略中epsilon的衰减率
+        self.lr = 0.1  # 学习率
+        ################################################################################
+        
+        ################################# 保存结果相关参数 ################################
        self.result_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/results/'  # 保存结果的路径
        self.model_path = curr_path + "/outputs/" + self.env_name + \
            '/' + curr_time + '/models/'  # 保存模型的路径
-        self.save = True  # 是否保存图片
+        self.save = True # 是否保存图片
+        ################################################################################
+        
+def train(cfg,env,agent):  # run cfg.train_eps training episodes, updating the agent every step; returns (rewards, ma_rewards)
+    print('开始训练！')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    rewards = []  # total reward of each episode
+    ma_rewards = [] # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # reward accumulated in the current episode
+        state = env.reset()  # reset the environment, i.e. start a new episode
+        while True:
+            action = agent.choose_action(state)  # let the agent pick an action for the current state
+            next_state, reward, done, _ = env.step(action)  # apply the action to the environment
+            agent.update(state, action, reward, next_state, done)  # Q-learning update from this transition
+            state = next_state  # advance to the next state
+            ep_reward += reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)  # exponential smoothing: 0.9 * previous + 0.1 * current
+        else:
+            ma_rewards.append(ep_reward)  # first episode seeds the moving average
+        print("回合数：{}/{}，奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
+    print('完成训练！')
+    return rewards,ma_rewards
+    
+def test(cfg,env,agent):  # run cfg.test_eps evaluation episodes with agent.predict and no agent.update; returns (rewards, ma_rewards)
+    print('开始测试！')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    for item in agent.Q_table.items():  # print every Q-table entry before evaluating
+        print(item)
+    rewards = []  # total reward of each episode
+    ma_rewards = [] # moving average of the episode rewards
+    for i_ep in range(cfg.test_eps):
+        ep_reward = 0  # reward accumulated in the current episode
+        state = env.reset()  # reset the environment, i.e. start a new episode
+        while True:
+            action = agent.predict(state)  # let the agent pick an action for the current state
+            next_state, reward, done, _ = env.step(action)  # apply the action to the environment
+            state = next_state  # advance to the next state
+            ep_reward += reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)  # exponential smoothing: 0.9 * previous + 0.1 * current
+        else:
+            ma_rewards.append(ep_reward)  # first episode seeds the moving average
+        print(f"回合数：{i_ep+1}/{cfg.test_eps}, 奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards,ma_rewards
        
 def env_agent_config(cfg,seed=1):
    '''创建环境和智能体
@@ -68,26 +119,25 @@ def env_agent_config(cfg,seed=1):
    env = gym.make(cfg.env_name)  
    env = CliffWalkingWapper(env)
    env.seed(seed) # 设置随机种子
-    state_dim = env.observation_space.n # 状态维度
-    action_dim = env.action_space.n # 动作维度
-    agent = QLearning(state_dim,action_dim,cfg)
+    n_states = env.observation_space.n # number of discrete states
+    n_actions = env.action_space.n # number of discrete actions
+    agent = QLearning(n_states,n_actions,cfg)
    return env,agent
-
-cfg = QlearningConfig()
-plot_cfg = PlotConfig()
-# 训练
-env, agent = env_agent_config(cfg, seed=1)
-rewards, ma_rewards = train(cfg, env, agent)
-make_dir(plot_cfg.result_path, plot_cfg.model_path)  # 创建保存结果和模型路径的文件夹
-agent.save(path=plot_cfg.model_path)  # 保存模型
-save_results(rewards, ma_rewards, tag='train',
-            path=plot_cfg.result_path)  # 保存结果
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # 画出结果
-# 测试
-env, agent = env_agent_config(cfg, seed=10)
-agent.load(path=plot_cfg.model_path)  # 导入模型
-rewards, ma_rewards = test(cfg, env, agent)
-save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # 保存结果
-plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # 画出结果
+if __name__ == "__main__":  # script entry point: train, save, then reload and test
+    cfg = Config()
+    # training
+    env, agent = env_agent_config(cfg, seed=1)
+    rewards, ma_rewards = train(cfg, env, agent)
+    make_dir(cfg.result_path, cfg.model_path)  # create the result and model output directories
+    agent.save(path=cfg.model_path)  # save the model
+    save_results(rewards, ma_rewards, tag='train',
+                path=cfg.result_path)  # save the training results
+    plot_rewards(rewards, ma_rewards, cfg, tag="train")  # plot the training results
+    # testing
+    env, agent = env_agent_config(cfg, seed=10)  # fresh environment and agent, created with a different seed
+    agent.load(path=cfg.model_path)  # load the model saved above
+    rewards, ma_rewards = test(cfg, env, agent)
+    save_results(rewards, ma_rewards, tag='test', path=cfg.result_path)  # save the test results
+    plot_rewards(rewards, ma_rewards, cfg, tag="test")  # plot the test results