update

2021-12-22 11:19:13 +08:00
parent c257313d5b
commit 75df999258
55 changed files with 605 additions and 403 deletions
@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2021-09-19 23:05:45
+LastEditTime: 2021-12-22 10:54:57
 Discription: use defaultdict to define Q table
 Environment: 
 '''
@@ -15,17 +15,17 @@ import torch
 from collections import defaultdict

 class QLearning(object):
-    def __init__(self,state_dim,
-                 action_dim,cfg):
-        self.action_dim = action_dim  # dimension of acgtion
-        self.lr = cfg.lr  # learning rate
+    def __init__(self,n_states,
+                 n_actions,cfg):
+        self.n_actions = n_actions 
+        self.lr = cfg.lr  # 学习率
        self.gamma = cfg.gamma  
        self.epsilon = 0 
        self.sample_count = 0  
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
-        self.Q_table  = defaultdict(lambda: np.zeros(action_dim)) # A nested dictionary that maps state -> (action -> action-value)
+        self.Q_table  = defaultdict(lambda: np.zeros(n_actions)) # 用嵌套字典存放状态->动作->状态-动作值（Q值）的映射，即Q表
    def choose_action(self, state):
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
@@ -34,7 +34,7 @@ class QLearning(object):
        if np.random.uniform(0, 1) > self.epsilon:
            action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
        else:
-            action = np.random.choice(self.action_dim) # 随机选择动作
+            action = np.random.choice(self.n_actions) # 随机选择动作
        return action
    def predict(self,state):
        action = np.argmax(self.Q_table[str(state)])
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# coding=utf-8
+'''
+Author: John
+Email: johnjim0816@gmail.com
+Date: 2020-09-11 23:03:00
+LastEditor: John
+LastEditTime: 2021-12-22 11:13:23
+Description: 
+Environment: 
+'''
+import sys
+import os
+curr_path = os.path.dirname(os.path.abspath(__file__))  # absolute path of the directory containing this file
+parent_path = os.path.dirname(curr_path)  # parent directory of curr_path
+sys.path.append(parent_path)  # make the parent directory importable (envs, QLearning, common are imported below)
+
+import gym
+import torch
+import datetime
+
+from envs.gridworld_env import CliffWalkingWapper
+from QLearning.agent import QLearning
+from QLearning.train import train,test
+from common.utils import plot_rewards,plot_rewards_cn
+from common.utils import save_results,make_dir
+
+curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # timestamp taken at import time; used in the output paths
+algo_name = 'Q-learning'  # algorithm name
+env_name = 'CliffWalking-v0'  # environment name
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use the GPU when CUDA is available, otherwise the CPU
+class QlearningConfig:
+    '''Training-related parameters.'''
+    def __init__(self):
+        # Run identification, copied from the module-level settings.
+        self.algo_name = algo_name
+        self.env_name = env_name
+        self.device = device
+        # Number of episodes for each phase.
+        self.train_eps = 400  # training episodes
+        self.test_eps = 30  # test episodes
+        # Learning hyper-parameters.
+        self.lr = 0.1  # learning rate
+        self.gamma = 0.9  # reward discount factor
+        # Epsilon-greedy exploration schedule.
+        self.epsilon_start = 0.95  # initial epsilon
+        self.epsilon_end = 0.01  # final epsilon
+        self.epsilon_decay = 300  # epsilon decay rate
+class PlotConfig:
+    '''Plotting-related parameters and output locations.'''
+
+    def __init__(self) -> None:
+        self.algo_name = algo_name  # algorithm name
+        self.env_name = env_name  # environment name
+        self.device = device
+        # Both output folders share one per-run directory:
+        # <curr_path>/outputs/<env_name>/<curr_time>
+        run_dir = curr_path + "/outputs/" + self.env_name + '/' + curr_time
+        self.result_path = run_dir + '/results/'  # where results are saved
+        self.model_path = run_dir + '/models/'  # where the model is saved
+        self.save = True  # whether to save the figures
+        
+def env_agent_config(cfg,seed=1):
+    '''Create the wrapped environment and a Q-learning agent for it.
+    Args:
+        cfg: config object; ``cfg.env_name`` is passed to ``gym.make`` and
+            the whole object is handed to ``QLearning``.
+        seed (int, optional): seed passed to ``env.seed``. Defaults to 1.
+    Returns:
+        env: the gym environment wrapped in ``CliffWalkingWapper``
+        agent: a ``QLearning`` instance built from the sizes of the
+            environment's observation and action spaces
+    '''
+    env = CliffWalkingWapper(gym.make(cfg.env_name))
+    env.seed(seed)  # seed the environment
+    agent = QLearning(env.observation_space.n, env.action_space.n, cfg)
+    return env, agent
+
+cfg = QlearningConfig()
+plot_cfg = PlotConfig()
+# Training: runs at module level, i.e. whenever this file is executed or imported
+env, agent = env_agent_config(cfg, seed=1)
+rewards, ma_rewards = train(cfg, env, agent)
+make_dir(plot_cfg.result_path, plot_cfg.model_path)  # create the result and model output folders
+agent.save(path=plot_cfg.model_path)  # save the model
+save_results(rewards, ma_rewards, tag='train',
+            path=plot_cfg.result_path)  # save the results
+plot_rewards(rewards, ma_rewards, plot_cfg, tag="train")  # plot the results
+# Testing: a fresh env/agent pair (different seed) that loads the model saved above
+env, agent = env_agent_config(cfg, seed=10)
+agent.load(path=plot_cfg.model_path)  # load the model
+rewards, ma_rewards = test(cfg, env, agent)
+save_results(rewards, ma_rewards, tag='test', path=plot_cfg.result_path)  # save the results
+plot_rewards(rewards, ma_rewards, plot_cfg, tag="test")  # plot the results
+        
+    
@@ -1,126 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: John
-Email: johnjim0816@gmail.com
-Date: 2020-09-11 23:03:00
-LastEditor: John
-LastEditTime: 2021-09-23 12:22:58
-Discription: 
-Environment: 
-'''
-import sys,os
-curr_path = os.path.dirname(os.path.abspath(__file__)) # 当前路径
-parent_path=os.path.dirname(curr_path) # 父路径，这里就是我们的项目路径
-sys.path.append(parent_path) # 由于需要引用项目路径下的其他模块比如envs，所以需要添加路径到sys.path
-
-import gym
-import torch
-import datetime
-
-from envs.gridworld_env import CliffWalkingWapper
-from QLearning.agent import QLearning
-from common.plot import plot_rewards,plot_rewards_cn
-from common.utils import save_results,make_dir
-
-curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # 获取当前时间
-class QlearningConfig:
-    '''训练相关参数'''
-    def __init__(self):
-        self.algo = 'Q-learning' # 算法名称
-        self.env = 'CliffWalking-v0' # 环境名称
-        self.result_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/results/'  # 保存结果的路径
-        self.model_path = curr_path+"/outputs/" +self.env+'/'+curr_time+'/models/'  # 保存模型的路径
-        self.train_eps = 400 # 训练的回合数
-        self.eval_eps = 30 # 测试的回合数
-        self.gamma = 0.9 # reward的衰减率
-        self.epsilon_start = 0.95 # e-greedy策略中初始epsilon
-        self.epsilon_end = 0.01 # e-greedy策略中的终止epsilon
-        self.epsilon_decay = 300 # e-greedy策略中epsilon的衰减率
-        self.lr = 0.1 # 学习率
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # 检测GPU
-
-        
-def env_agent_config(cfg,seed=1):
-    env = gym.make(cfg.env)  
-    env = CliffWalkingWapper(env)
-    env.seed(seed) # 设置随机种子
-    state_dim = env.observation_space.n # 状态维度
-    action_dim = env.action_space.n # 动作维度
-    agent = QLearning(state_dim,action_dim,cfg)
-    return env,agent
-
-def train(cfg,env,agent):
-    print('开始训练！')
-    print(f'环境:{cfg.env}, 算法:{cfg.algo}, 设备:{cfg.device}')
-    rewards = []  # 记录奖励
-    ma_rewards = [] # 记录滑动平均奖励
-    for i_ep in range(cfg.train_eps):
-        ep_reward = 0  # 记录每个回合的奖励
-        state = env.reset()  # 重置环境,即开始新的回合
-        while True:
-            action = agent.choose_action(state)  # 根据算法选择一个动作
-            next_state, reward, done, _ = env.step(action)  # 与环境进行一次动作交互
-            print(reward)
-            agent.update(state, action, reward, next_state, done)  # Q学习算法更新
-            state = next_state  # 更新状态
-            ep_reward += reward
-            if done:
-                break
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
-        else:
-            ma_rewards.append(ep_reward)
-        print("回合数：{}/{}，奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
-    print('完成训练！')
-    return rewards,ma_rewards
-    
-def eval(cfg,env,agent):
-    print('开始测试！')
-    print(f'环境：{cfg.env}, 算法：{cfg.algo}, 设备：{cfg.device}')
-    for item in agent.Q_table.items():
-        print(item)
-    rewards = []  # 记录所有回合的奖励
-    ma_rewards = [] # 滑动平均的奖励
-    for i_ep in range(cfg.eval_eps):
-        ep_reward = 0  # 记录每个episode的reward
-        state = env.reset()  # 重置环境, 重新开一局（即开始新的一个回合）
-        while True:
-            action = agent.predict(state)  # 根据算法选择一个动作
-            next_state, reward, done, _ = env.step(action)  # 与环境进行一个交互
-            state = next_state  # 更新状态
-            ep_reward += reward
-            if done:
-                break
-        rewards.append(ep_reward)
-        if ma_rewards:
-            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
-        else:
-            ma_rewards.append(ep_reward)
-        print(f"回合数：{i_ep+1}/{cfg.eval_eps}, 奖励：{ep_reward:.1f}")
-    print('完成测试！')
-    return rewards,ma_rewards
-    
-if __name__ == "__main__":
-    cfg = QlearningConfig()
-
-    # 训练
-    env,agent = env_agent_config(cfg,seed=0)
-    rewards,ma_rewards = train(cfg,env,agent)
-    make_dir(cfg.result_path,cfg.model_path) # 创建文件夹
-    agent.save(path=cfg.model_path) # 保存模型
-    for item in agent.Q_table.items():
-        print(item)
-    save_results(rewards,ma_rewards,tag='train',path=cfg.result_path) # 保存结果
-    plot_rewards_cn(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
-
-    # # 测试
-    env,agent = env_agent_config(cfg,seed=10)
-    agent.load(path=cfg.model_path) # 加载模型
-    rewards,ma_rewards = eval(cfg,env,agent)
-    
-    save_results(rewards,ma_rewards,tag='eval',path=cfg.result_path)
-    plot_rewards_cn(rewards,ma_rewards,tag="eval",env=cfg.env,algo = cfg.algo,path=cfg.result_path)
-    
-    
@@ -0,0 +1,51 @@
+def train(cfg,env,agent):
+    '''Run the training loop for ``cfg.train_eps`` episodes.
+
+    Args:
+        cfg: config object; reads ``train_eps`` plus ``env_name``,
+            ``algo_name`` and ``device`` (the last three only for the banner).
+        env: environment exposing ``reset()`` and ``step(action)``, where
+            ``step`` returns ``(next_state, reward, done, info)``.
+        agent: object exposing ``choose_action(state)`` and
+            ``update(state, action, reward, next_state, done)``.
+
+    Returns:
+        tuple: ``(rewards, ma_rewards)`` -- the total reward of every episode
+        and its exponential moving average (weight 0.9 on the previous
+        average, 0.1 on the new episode).
+    '''
+    print('开始训练！')
+    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')
+    rewards = []  # total reward of each episode
+    ma_rewards = []  # moving average of the episode rewards
+    for i_ep in range(cfg.train_eps):
+        ep_reward = 0  # reward accumulated in the current episode
+        state = env.reset()  # start a new episode
+        while True:
+            action = agent.choose_action(state)  # pick an action for the current state
+            next_state, reward, done, _ = env.step(action)  # take one step in the environment
+            agent.update(state, action, reward, next_state, done)  # Q-learning update
+            state = next_state
+            ep_reward += reward
+            if done:
+                break
+        rewards.append(ep_reward)
+        if ma_rewards:
+            ma_rewards.append(ma_rewards[-1]*0.9+ep_reward*0.1)
+        else:
+            ma_rewards.append(ep_reward)  # first episode seeds the moving average
+        # An unfinished `if ()` stood here and was a syntax error; progress is
+        # reported after every episode.
+        print("回合数：{}/{}，奖励{:.1f}".format(i_ep+1, cfg.train_eps,ep_reward))
+    print('完成训练！')
+    return rewards,ma_rewards
+    
+def test(cfg,env,agent):
+    '''Run ``cfg.test_eps`` evaluation episodes using ``agent.predict``.
+
+    The agent is not updated here. Before the episodes start, every entry
+    of ``agent.Q_table`` is printed.
+
+    Args:
+        cfg: config object; reads ``test_eps`` plus ``env_name``,
+            ``algo_name`` and ``device`` (the last three only for the banner).
+        env: environment exposing ``reset()`` and ``step(action)``, where
+            ``step`` returns ``(next_state, reward, done, info)``.
+        agent: object exposing ``predict(state)`` and a ``Q_table`` mapping.
+
+    Returns:
+        tuple: ``(rewards, ma_rewards)`` -- the total reward of every episode
+        and its exponential moving average (0.9 previous / 0.1 new).
+    '''
+    print('开始测试！')
+    print(f'环境：{cfg.env_name}, 算法：{cfg.algo_name}, 设备：{cfg.device}')
+    for entry in agent.Q_table.items():
+        print(entry)
+    rewards, ma_rewards = [], []  # per-episode rewards and their moving average
+    for i_ep in range(cfg.test_eps):
+        state = env.reset()  # start a new episode
+        ep_reward = 0
+        done = False
+        while not done:
+            action = agent.predict(state)  # action chosen by the agent's predict()
+            state, reward, done, _ = env.step(action)  # step and move to the next state
+            ep_reward += reward
+        rewards.append(ep_reward)
+        # The first episode seeds the moving average.
+        smoothed = ma_rewards[-1]*0.9+ep_reward*0.1 if ma_rewards else ep_reward
+        ma_rewards.append(smoothed)
+        print(f"回合数：{i_ep+1}/{cfg.test_eps}, 奖励：{ep_reward:.1f}")
+    print('完成测试！')
+    return rewards,ma_rewards