{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.11 64-bit ('py37': conda)"
  },
  "interpreter": {
   "hash": "fbea1422c2cf61ed9c0cfc03f38f71cc9083cc288606edc4170b5309b352ce27"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# Q-learning on CliffWalking-v0\n",
    "\n",
    "Trains the `QLearning` agent on the `CliffWalking-v0` gym environment, saves the model and the reward curves under `outputs/`, then reloads the saved model and evaluates it.\n",
    "\n",
    "Run the cells top to bottom on a fresh kernel. The `envs`, `QLearning` and `common` packages are expected in the parent of the kernel's working directory."
   ],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Imports"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import datetime\n",
    "import random\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import gym\n",
    "import numpy as np\n",
    "\n",
    "curr_path = str(Path().absolute())\n",
    "parent_path = str(Path().absolute().parent)\n",
    "# The project packages imported below are resolved from the parent of the\n",
    "# working directory, so that directory has to be on sys.path first.\n",
    "if parent_path not in sys.path:  # keep the cell idempotent when it is re-run\n",
    "    sys.path.append(parent_path)\n",
    "\n",
    "from envs.gridworld_env import CliffWalkingWapper\n",
    "from QLearning.agent import QLearning\n",
    "from common.plot import plot_rewards\n",
    "from common.utils import save_results, make_dir\n",
    "\n",
    "# Timestamp of this run; it becomes part of the output directory names.\n",
    "curr_time = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Configuration"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "class QlearningConfig:\n",
    "    '''Hyperparameters and output locations for the Q-learning run.'''\n",
    "\n",
    "    def __init__(self):\n",
    "        self.algo = 'Qlearning'  # algorithm name, passed to plot_rewards\n",
    "        self.env = 'CliffWalking-v0'  # gym environment id; actions: 0 up, 1 right, 2 down, 3 left\n",
    "        # Both paths are plain strings that end with '/'.\n",
    "        run_dir = curr_path + '/outputs/' + self.env + '/' + curr_time\n",
    "        self.result_path = run_dir + '/results/'  # path to save results\n",
    "        self.model_path = run_dir + '/models/'  # path to save models\n",
    "        self.train_eps = 200  # number of training episodes\n",
    "        self.eval_eps = 30  # number of evaluation episodes\n",
    "        self.gamma = 0.9  # reward discount factor\n",
    "        self.epsilon_start = 0.95  # initial epsilon of the e-greedy policy\n",
    "        self.epsilon_end = 0.01  # final epsilon of the e-greedy policy\n",
    "        self.epsilon_decay = 200  # decay rate of epsilon in the e-greedy policy\n",
    "        self.lr = 0.1  # learning rate"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Environment and agent"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "def env_agent_config(cfg, seed=1):\n",
    "    '''Create the wrapped environment and a new QLearning agent.\n",
    "\n",
    "    Args:\n",
    "        cfg: configuration object. cfg.env is the gym environment id and cfg\n",
    "            itself is handed to the agent.\n",
    "        seed: seed applied to the environment and to the global Python and\n",
    "            NumPy random number generators.\n",
    "\n",
    "    Returns:\n",
    "        The tuple (env, agent).\n",
    "    '''\n",
    "    # env.seed only covers the environment. The global generators are seeded\n",
    "    # as well so that exploration is repeatable, assuming the agent draws\n",
    "    # from one of them (TODO: confirm against QLearning.agent).\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    env = gym.make(cfg.env)\n",
    "    env = CliffWalkingWapper(env)\n",
    "    env.seed(seed)\n",
    "    state_dim = env.observation_space.n\n",
    "    action_dim = env.action_space.n\n",
    "    agent = QLearning(state_dim, action_dim, cfg)\n",
    "    return env, agent"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Training and evaluation loops"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "LOG_INTERVAL = 10  # print progress every LOG_INTERVAL episodes\n",
    "\n",
    "\n",
    "def _run_episode(env, select_action, on_step=None):\n",
    "    '''Play one episode and return the sum of its rewards.\n",
    "\n",
    "    Args:\n",
    "        env: environment exposing reset() and step(action).\n",
    "        select_action: callable that maps a state to an action.\n",
    "        on_step: optional callable invoked after every environment step as\n",
    "            on_step(state, action, reward, next_state, done).\n",
    "    '''\n",
    "    ep_reward = 0\n",
    "    state = env.reset()  # start a new episode\n",
    "    done = False\n",
    "    while not done:\n",
    "        action = select_action(state)\n",
    "        next_state, reward, done, _ = env.step(action)\n",
    "        if on_step is not None:\n",
    "            on_step(state, action, reward, next_state, done)\n",
    "        state = next_state  # continue from the state the environment moved to\n",
    "        ep_reward += reward\n",
    "    return ep_reward\n",
    "\n",
    "\n",
    "def _append_moving_average(ma_rewards, ep_reward):\n",
    "    '''Append 0.9 * previous average + 0.1 * ep_reward to ma_rewards.\n",
    "\n",
    "    The first episode has no previous average, so its reward is stored as is.\n",
    "    '''\n",
    "    if ma_rewards:\n",
    "        ma_rewards.append(ma_rewards[-1]*0.9 + ep_reward*0.1)\n",
    "    else:\n",
    "        ma_rewards.append(ep_reward)\n",
    "\n",
    "\n",
    "def train(cfg, env, agent):\n",
    "    '''Train the agent for cfg.train_eps episodes.\n",
    "\n",
    "    Actions come from agent.choose_action and agent.update is called after\n",
    "    every environment step.\n",
    "\n",
    "    Returns:\n",
    "        (rewards, ma_rewards): the total reward of every episode and the\n",
    "        moving average of those totals.\n",
    "    '''\n",
    "    rewards = []\n",
    "    ma_rewards = []  # moving average reward\n",
    "    for i_ep in range(cfg.train_eps):\n",
    "        ep_reward = _run_episode(env, agent.choose_action, on_step=agent.update)\n",
    "        rewards.append(ep_reward)\n",
    "        _append_moving_average(ma_rewards, ep_reward)\n",
    "        if (i_ep+1) % LOG_INTERVAL == 0:\n",
    "            print('Episode:{}/{}: reward:{:.1f}'.format(i_ep+1, cfg.train_eps, ep_reward))\n",
    "    return rewards, ma_rewards"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "def evaluate(cfg, env, agent):\n",
    "    '''Run the agent for cfg.eval_eps episodes without updating it.\n",
    "\n",
    "    Actions come from agent.predict. The function is not called eval so that\n",
    "    the Python builtin of that name stays usable in the kernel.\n",
    "\n",
    "    Returns:\n",
    "        (rewards, ma_rewards): the total reward of every episode and the\n",
    "        moving average of those totals.\n",
    "    '''\n",
    "    rewards = []  # total reward of every episode\n",
    "    ma_rewards = []  # moving average reward\n",
    "    for i_ep in range(cfg.eval_eps):\n",
    "        ep_reward = _run_episode(env, agent.predict)\n",
    "        rewards.append(ep_reward)\n",
    "        _append_moving_average(ma_rewards, ep_reward)\n",
    "        if (i_ep+1) % LOG_INTERVAL == 0:\n",
    "            print(f'Episode:{i_ep+1}/{cfg.eval_eps}, reward:{ep_reward:.1f}')\n",
    "    return rewards, ma_rewards"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Run: train, save, reload, evaluate"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "cfg = QlearningConfig()\n",
    "\n",
    "# Training: learn, then write the model, the rewards and the reward plot.\n",
    "train_env, train_agent = env_agent_config(cfg, seed=1)\n",
    "train_rewards, train_ma_rewards = train(cfg, train_env, train_agent)\n",
    "make_dir(cfg.result_path, cfg.model_path)\n",
    "train_agent.save(path=cfg.model_path)\n",
    "save_results(train_rewards, train_ma_rewards, tag='train', path=cfg.result_path)\n",
    "plot_rewards(train_rewards, train_ma_rewards, tag='train', env=cfg.env, algo=cfg.algo, path=cfg.result_path)\n",
    "\n",
    "# Evaluation: a new environment and agent, with the saved model loaded into the agent.\n",
    "eval_env, eval_agent = env_agent_config(cfg, seed=10)\n",
    "eval_agent.load(path=cfg.model_path)\n",
    "eval_rewards, eval_ma_rewards = evaluate(cfg, eval_env, eval_agent)\n",
    "save_results(eval_rewards, eval_ma_rewards, tag='eval', path=cfg.result_path)\n",
    "plot_rewards(eval_rewards, eval_ma_rewards, tag='eval', env=cfg.env, algo=cfg.algo, path=cfg.result_path)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Episode:10/200: reward:-287.0\n",
      "Episode:20/200: reward:-142.0\n",
      "Episode:30/200: reward:-67.0\n",
      "Episode:40/200: reward:-61.0\n",
      "Episode:50/200: reward:-74.0\n",
      "Episode:60/200: reward:-41.0\n",
      "Episode:70/200: reward:-55.0\n",
      "Episode:80/200: reward:-66.0\n",
      "Episode:90/200: reward:-31.0\n",
      "Episode:100/200: reward:-31.0\n",
      "Episode:110/200: reward:-58.0\n",
      "Episode:120/200: reward:-25.0\n",
      "Episode:130/200: reward:-18.0\n",
      "Episode:140/200: reward:-27.0\n",
      "Episode:150/200: reward:-28.0\n",
      "Episode:160/200: reward:-25.0\n",
      "Episode:170/200: reward:-35.0\n",
      "Episode:180/200: reward:-13.0\n",
      "Episode:190/200: reward:-22.0\n",
      "Episode:200/200: reward:-26.0\n",
      "保存模型成功!\n",
      "结果保存完毕!\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "image/png": "",
      "text/plain": [
       "\n"
      ]
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "加载模型成功!\n"
     ]
    }
   ],
   "metadata": {}
  }
 ]
}