Files
easy-rl/projects/notebooks/1.QLearning.ipynb
2022-08-15 22:31:37 +08:00

923 lines
92 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1、定义算法\n",
"强化学习算法的模式都比较固定，一般包括sample（即训练时采样动作）、predict（测试时预测动作）、update（算法更新）以及保存模型和加载模型等几个方法，其中对于每种算法sample和update的方式是不相同的，而其他方法就大同小异。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import math\n",
"import torch\n",
"from collections import defaultdict\n",
"\n",
"class QLearning(object):\n",
"    '''Tabular Q-learning agent with an exponentially decaying epsilon-greedy policy.\n",
"\n",
"    The Q-table maps str(state) to a NumPy array holding one value per action;\n",
"    an all-zero row is created the first time a state key is accessed.\n",
"    '''\n",
"\n",
"    def __init__(self, n_states, n_actions, cfg):\n",
"        '''Set up hyperparameters and an empty Q-table.\n",
"\n",
"        Args:\n",
"            n_states: Number of states; accepted but not used by this class.\n",
"            n_actions: Number of discrete actions (length of each Q-table row).\n",
"            cfg: Object exposing lr, gamma, epsilon_start, epsilon_end and epsilon_decay.\n",
"        '''\n",
"        self.n_actions = n_actions\n",
"        self.lr = cfg.lr  # learning rate\n",
"        self.gamma = cfg.gamma  # weight of the bootstrapped next-state value\n",
"        self.epsilon = cfg.epsilon_start\n",
"        self.sample_count = 0  # number of sample() calls so far; drives the epsilon decay\n",
"        self.epsilon_start = cfg.epsilon_start\n",
"        self.epsilon_end = cfg.epsilon_end\n",
"        self.epsilon_decay = cfg.epsilon_decay\n",
"        # Q-table: str(state) -> array of per-action Q-values (a defaultdict of arrays, not a nested dict).\n",
"        self.Q_table = defaultdict(lambda: np.zeros(n_actions))\n",
"\n",
"    def sample(self, state):\n",
"        '''Choose an action with the epsilon-greedy policy; used during training.\n",
"\n",
"        Every call increments sample_count and recomputes epsilon before the draw.\n",
"        '''\n",
"        self.sample_count += 1\n",
"        # Exponential decay from epsilon_start towards epsilon_end.\n",
"        decay = math.exp(-1. * self.sample_count / self.epsilon_decay)\n",
"        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * decay\n",
"        if np.random.uniform(0, 1) > self.epsilon:\n",
"            action = np.argmax(self.Q_table[str(state)])  # exploit: action with the largest Q(s, a)\n",
"        else:\n",
"            action = np.random.choice(self.n_actions)  # explore: uniformly random action\n",
"        return action\n",
"\n",
"    def predict(self, state):\n",
"        '''Return the greedy action for state; used during testing.'''\n",
"        action = np.argmax(self.Q_table[str(state)])\n",
"        return action\n",
"\n",
"    def update(self, state, action, reward, next_state, done):\n",
"        '''Apply one Q-learning update for the transition (state, action, reward, next_state).\n",
"\n",
"        When done is true the target is the reward alone; otherwise it also\n",
"        includes gamma times the largest Q-value of next_state.\n",
"        '''\n",
"        Q_predict = self.Q_table[str(state)][action]\n",
"        if done:  # terminal transition: nothing to bootstrap from\n",
"            Q_target = reward\n",
"        else:\n",
"            Q_target = reward + self.gamma * np.max(self.Q_table[str(next_state)])\n",
"        self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)\n",
"\n",
"    def save(self, path):\n",
"        '''Write the Q-table to path + \"Qleaning_model.pkl\" with torch.save and dill.\n",
"\n",
"        path is used as a plain string prefix. If the resulting file name has a\n",
"        directory part that does not exist yet, that directory is created first.\n",
"        '''\n",
"        import os\n",
"        import dill\n",
"        model_file = path + \"Qleaning_model.pkl\"\n",
"        parent_dir = os.path.dirname(model_file)\n",
"        if parent_dir:\n",
"            os.makedirs(parent_dir, exist_ok=True)\n",
"        torch.save(\n",
"            obj=self.Q_table,\n",
"            f=model_file,\n",
"            pickle_module=dill\n",
"        )\n",
"        print(\"保存模型成功!\")\n",
"\n",
"    def load(self, path):\n",
"        '''Replace the Q-table with the one stored at path + 'Qleaning_model.pkl'.'''\n",
"        import dill\n",
"        # NOTE: this unpickles the file, which can execute arbitrary code; only load files you trust.\n",
"        self.Q_table = torch.load(f=path + 'Qleaning_model.pkl', pickle_module=dill)\n",
"        print(\"加载模型成功!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2、定义训练\n",
"强化学习算法的训练方式也比较固定,如下:\n",
"```python\n",
"for i_ep in range(train_eps): # 遍历每个回合\n",
" state = env.reset() # 重置环境,即开始新的回合\n",
" while True: # 对于一些比较复杂的游戏可以设置每回合最大的步长例如while ep_step<100就是每回合最大步长为100。\n",
" action = agent.sample(state) # 根据算法采样一个动作\n",
" next_state, reward, done, _ = env.step(action) # 与环境进行一次动作交互\n",
" agent.memory.push(state, action, reward, next_state, done) # 记录memory\n",
" agent.update(state, action, reward, next_state, done) # 算法更新\n",
" state = next_state # 更新状态\n",
" if done:\n",
" break\n",
"```\n",
"首先，对于每个回合，回合开始时环境需要重置，好比我们每次开一把游戏需要从头再来一样。我们可以设置智能体在每回合的最大步长，尤其是对于比较复杂的游戏，这样做的好处之一就是帮助智能体在训练中快速收敛：比如我们先验地知道最优解的大概步数，那么理论上智能体收敛时也应该是这个步数附近，设置最大步数可以方便智能体接近这个最优解。在每个回合中，智能体首先需要采样（sample），或者说采用探索策略，例如常见的$\\varepsilon$-greedy策略或者UCB探索策略等等。采样的过程是将当前的状态state作为输入，智能体采样输出动作action。然后环境根据采样出来的动作反馈出下一个状态以及相应的reward等信息。接下来，对于具有memory的智能体，例如包含replay memory的DQN来说，需要将相应的transition（这个词中文不好翻译，通常是状态、动作、奖励等信息）记住。紧接着就是智能体更新，对于深度强化学习，此时一般从memory中随机采样一些transition进行更新，对于Q learning一般是采样上一次的transition。更新公式是比较关键的部分，但是也很通用，一般基于值的算法更新公式都是一个套路，如下：\n",
"$$\n",
"y_{j}= \\begin{cases}r_{j} & \\text { for terminal } s_{j+1} \\\\ r_{j}+\\gamma \\max _{a^{\\prime}} Q\\left(s_{j+1}, a^{\\prime} ; \\theta\\right) & \\text { for non-terminal } s_{j+1}\\end{cases}\n",
"$$\n",
"智能体更新完之后，通常需要更新状态，即```state = next_state```，然后会检查是否完成了这一回合的游戏，即```done==True```。注意完成并不代表这回合成功，也有可能是失败得太离谱，等同学们有了自定义强化学习环境的经验就知道了（等你长大就知道了XD）。\n",
"如果需要记录奖励、损失等等的话，可以再加上如下方代码。实际项目中更多地使用tensorboard来记录相应的数据，甚至于笔者就在这些教学代码中使用过，但是看起来有些繁琐，容易给大家增加不必要的学习难度，因此学有余力以及需要在项目研究中做强化学习的同学可以去看看，也很简单。\n",
"此外，稍微复杂一些的强化学习不是一次性写完代码就能收敛的，这时需要我们做一个调参侠。为了检查我们参数调得好不好，可以在终端print出奖励、损失以及epsilon等随着回合数的变化。这点说明一下：强化学习的训练过程一般都是先探索然后收敛的，官方的话就是权衡exploration and exploitation。e-greedy策略的做法就是前期探索，然后逐渐减小探索率（也就是这个epsilon）直至慢慢收敛。这个值越大，比如0.9，就说明智能体90%的概率在随机探索。通常情况下会设置三个值：epsilon_start、epsilon_end以及epsilon_decay，即初始值、终止值和衰减率，其中初始值一般是0.95不变，终止值是0.01，也就是说即使在收敛阶段也让智能体保持很小概率的探索。这样做的原因就是智能体已经学出了一个不错的策略，但是保不齐还有更好的策略，好比我们知道要出人头地学历高比较重要，但是“人还是要有梦想的，万一实现了呢”，总是存在意外的可能，对吧。回归正题，比较关键的是epsilon_decay这个衰减率：这个epsilon衰减太快了，学来的策略往往过拟合，好比一条只能选择一朵花的花道上，你早早选择了一朵看起来还可以的花，却错过了后面更多的好花。但是衰减得太慢会影响收敛的速度，好比你走过了花道的尽头也还没选出一朵花来，结果比前者更糟。当然，强化学习的调参相比于深度学习只能说是有过之而无不及，比较复杂，不止epsilon这一个，这就需要同学们耐心学习了。\n",
"强化学习测试的代码跟训练基本上是一样的，因此我放到同一个代码段里。相比于训练代码，测试代码主要有以下几点不同：1、测试模型的过程是不需要更新的，这个是不言而喻的；2、测试代码不需要采样（sample）动作，取而代之的是预测（predict）动作，其区别就是采样动作时可能会使用各种策略，例如$\\varepsilon$-greedy策略，而预测动作不需要，只需要根据训练时学习好的Q表或者网络模型代入状态得到动作即可；3、测试过程终端一般只需要看奖励，不需要看epsilon等，反正它在测试中也是无意义的。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def train(cfg,env,agent):\n",
"    '''Train the agent for cfg.train_eps episodes.\n",
"\n",
"    Args:\n",
"        cfg: Object exposing env_name, algo_name, device and train_eps.\n",
"        env: Environment whose reset() returns a state and whose step(action)\n",
"            returns (next_state, reward, done, info).\n",
"        agent: Object exposing sample(state), update(...) and an epsilon attribute.\n",
"\n",
"    Returns:\n",
"        dict: {\"rewards\": list with the summed reward of each episode}.\n",
"    '''\n",
"    print('开始训练!')\n",
"    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n",
"    rewards = []  # summed reward of every episode\n",
"    for i_ep in range(cfg.train_eps):\n",
"        ep_reward = 0  # reward accumulated in the current episode\n",
"        state = env.reset()  # start a new episode\n",
"        while True:\n",
"            action = agent.sample(state)  # exploratory action\n",
"            next_state, reward, done, _ = env.step(action)  # one interaction with the environment\n",
"            agent.update(state, action, reward, next_state, done)  # Q-learning update on this transition\n",
"            state = next_state\n",
"            ep_reward += reward\n",
"            if done:\n",
"                break\n",
"        rewards.append(ep_reward)\n",
"        # Separator and label before epsilon were missing; epsilon is also rounded to keep the log readable.\n",
"        print(f\"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f},Epsilon:{agent.epsilon:.3f}\")\n",
"    print('完成训练!')\n",
"    return {\"rewards\":rewards}\n",
"def test(cfg,env,agent):\n",
"    '''Evaluate the agent greedily for cfg.test_eps episodes, without any update.\n",
"\n",
"    Returns:\n",
"        dict: {\"rewards\": list with the summed reward of each episode}.\n",
"    '''\n",
"    print('开始测试!')\n",
"    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n",
"    all_rewards = []  # summed reward of every episode\n",
"    for i_ep in range(cfg.test_eps):\n",
"        ep_reward = 0  # reward accumulated in the current episode\n",
"        state = env.reset()  # start a new episode\n",
"        finished = False\n",
"        while not finished:\n",
"            # Greedy action from the learned Q-values, then one environment step.\n",
"            state, reward, finished, _ = env.step(agent.predict(state))\n",
"            ep_reward += reward\n",
"        all_rewards.append(ep_reward)\n",
"        print(f\"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}\")\n",
"    print('完成测试!')\n",
"    return {\"rewards\":all_rewards}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3、定义环境\n",
"\n",
"OpenAI Gym中其实集成了很多强化学习环境，足够大家学习了，但是在做强化学习的应用中免不了要自己创建环境。比如在本项目中其实不太好找到Qlearning能学出来的环境（Qlearning实在是太弱了，需要足够简单的环境才行），因此本项目写了一个环境，大家感兴趣的话可以看一下，一般环境接口最关键的部分即是reset和step。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import turtle\n",
"import numpy as np\n",
"\n",
"# turtle tutorial : https://docs.python.org/3.3/library/turtle.html\n",
"\n",
"def GridWorld(gridmap=None, is_slippery=False):\n",
"    '''Create a FrozenLake-v0 environment wrapped in FrozenLakeWapper.\n",
"\n",
"    Args:\n",
"        gridmap: List of row strings passed to gym as desc; defaults to a 4x4 map.\n",
"        is_slippery: Forwarded to gym.make (it was previously ignored and always False).\n",
"\n",
"    Returns:\n",
"        The wrapped environment.\n",
"    '''\n",
"    if gridmap is None:\n",
"        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']\n",
"    env = gym.make(\"FrozenLake-v0\", desc=gridmap, is_slippery=is_slippery)\n",
"    env = FrozenLakeWapper(env)\n",
"    return env\n",
"\n",
"\n",
"class FrozenLakeWapper(gym.Wrapper):\n",
"    '''gym.Wrapper that draws a FrozenLake grid and the agent position with turtle graphics.'''\n",
"\n",
"    def __init__(self, env):\n",
"        gym.Wrapper.__init__(self, env)\n",
"        n_rows, n_cols = env.desc.shape[0], env.desc.shape[1]\n",
"        self.max_y = n_rows\n",
"        self.max_x = n_cols\n",
"        self.t = None  # turtle pen; created on the first render() call\n",
"        self.unit = 50  # edge length of one grid cell in world coordinates\n",
"\n",
"    def draw_box(self, x, y, fillcolor='', line_color='gray'):\n",
"        '''Draw one filled square of side self.unit starting at (x * unit, y * unit).'''\n",
"        pen = self.t\n",
"        pen.up()\n",
"        pen.goto(x * self.unit, y * self.unit)\n",
"        pen.color(line_color)\n",
"        pen.fillcolor(fillcolor)\n",
"        pen.setheading(90)\n",
"        pen.down()\n",
"        pen.begin_fill()\n",
"        for _ in range(4):\n",
"            pen.forward(self.unit)\n",
"            pen.right(90)\n",
"        pen.end_fill()\n",
"\n",
"    def move_player(self, x, y):\n",
"        '''Move the pen, with the pen up, to the centre of grid cell (x, y).'''\n",
"        pen = self.t\n",
"        pen.up()\n",
"        pen.setheading(90)\n",
"        pen.fillcolor('red')\n",
"        pen.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)\n",
"\n",
"    def render(self):\n",
"        '''Draw the grid on the first call, then place the player on the current state.'''\n",
"        if self.t is None:\n",
"            self.t = turtle.Turtle()\n",
"            self.wn = turtle.Screen()\n",
"            width = self.unit * self.max_x\n",
"            height = self.unit * self.max_y\n",
"            self.wn.setup(width + 100, height + 100)\n",
"            self.wn.setworldcoordinates(0, 0, width, height)\n",
"            self.t.shape('circle')\n",
"            self.t.width(2)\n",
"            self.t.speed(0)\n",
"            self.t.color('gray')\n",
"            for i in range(self.desc.shape[0]):\n",
"                for j in range(self.desc.shape[1]):\n",
"                    tile = self.desc[i][j]\n",
"                    if tile == b'G':  # goal\n",
"                        fill = 'yellow'\n",
"                    elif tile == b'H':  # hole\n",
"                        fill = 'black'\n",
"                    else:  # start, frozen ice and any other tile\n",
"                        fill = 'white'\n",
"                    # Map row 0 is drawn at the top, so the row index is flipped.\n",
"                    self.draw_box(j, self.max_y - 1 - i, fill)\n",
"            self.t.shape('turtle')\n",
"\n",
"        # self.s is the flat state index; convert it to column / flipped row.\n",
"        x_pos = self.s % self.max_x\n",
"        y_pos = self.max_y - 1 - self.s // self.max_x\n",
"        self.move_player(x_pos, y_pos)\n",
"\n",
"\n",
"class CliffWalkingWapper(gym.Wrapper):\n",
"    '''gym.Wrapper that draws a 12 x 4 grid and the agent position with turtle graphics.'''\n",
"\n",
"    def __init__(self, env):\n",
"        gym.Wrapper.__init__(self, env)\n",
"        self.t = None  # turtle pen; created on the first render() call\n",
"        self.unit = 50  # edge length of one grid cell in world coordinates\n",
"        self.max_x = 12  # number of columns\n",
"        self.max_y = 4  # number of rows\n",
"\n",
"    def draw_x_line(self, y, x0, x1, color='gray'):\n",
"        '''Draw a horizontal line at height y from x0 to x1 (x1 must exceed x0).'''\n",
"        assert x1 > x0\n",
"        pen = self.t\n",
"        pen.color(color)\n",
"        pen.setheading(0)\n",
"        pen.up()\n",
"        pen.goto(x0, y)\n",
"        pen.down()\n",
"        pen.forward(x1 - x0)\n",
"\n",
"    def draw_y_line(self, x, y0, y1, color='gray'):\n",
"        '''Draw a vertical line at position x from y0 to y1 (y1 must exceed y0).'''\n",
"        assert y1 > y0\n",
"        pen = self.t\n",
"        pen.color(color)\n",
"        pen.setheading(90)\n",
"        pen.up()\n",
"        pen.goto(x, y0)\n",
"        pen.down()\n",
"        pen.forward(y1 - y0)\n",
"\n",
"    def draw_box(self, x, y, fillcolor='', line_color='gray'):\n",
"        '''Draw one filled square of side self.unit starting at (x * unit, y * unit).'''\n",
"        pen = self.t\n",
"        pen.up()\n",
"        pen.goto(x * self.unit, y * self.unit)\n",
"        pen.color(line_color)\n",
"        pen.fillcolor(fillcolor)\n",
"        pen.setheading(90)\n",
"        pen.down()\n",
"        pen.begin_fill()\n",
"        for _ in range(4):\n",
"            pen.forward(self.unit)\n",
"            pen.right(90)\n",
"        pen.end_fill()\n",
"\n",
"    def move_player(self, x, y):\n",
"        '''Move the pen, with the pen up, to the centre of grid cell (x, y).'''\n",
"        pen = self.t\n",
"        pen.up()\n",
"        pen.setheading(90)\n",
"        pen.fillcolor('red')\n",
"        pen.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)\n",
"\n",
"    def render(self):\n",
"        '''Draw the grid on the first call, then place the player on the current state.'''\n",
"        if self.t is None:\n",
"            self.t = turtle.Turtle()\n",
"            self.wn = turtle.Screen()\n",
"            width = self.unit * self.max_x\n",
"            height = self.unit * self.max_y\n",
"            self.wn.setup(width + 100, height + 100)\n",
"            self.wn.setworldcoordinates(0, 0, width, height)\n",
"            self.t.shape('circle')\n",
"            self.t.width(2)\n",
"            self.t.speed(0)\n",
"            self.t.color('gray')\n",
"            # Outer border: two passes of (bottom/top edge, then side edge).\n",
"            for _ in range(2):\n",
"                self.t.forward(self.max_x * self.unit)\n",
"                self.t.left(90)\n",
"                self.t.forward(self.max_y * self.unit)\n",
"                self.t.left(90)\n",
"            # Inner grid lines.\n",
"            for row in range(1, self.max_y):\n",
"                self.draw_x_line(y=row * self.unit, x0=0, x1=width)\n",
"            for col in range(1, self.max_x):\n",
"                self.draw_y_line(x=col * self.unit, y0=0, y1=height)\n",
"\n",
"            # Bottom row: black cells between the two corners, yellow cell in the right corner.\n",
"            for col in range(1, self.max_x - 1):\n",
"                self.draw_box(col, 0, 'black')\n",
"            self.draw_box(self.max_x - 1, 0, 'yellow')\n",
"            self.t.shape('turtle')\n",
"\n",
"        # self.s is the flat state index; convert it to column / flipped row.\n",
"        x_pos = self.s % self.max_x\n",
"        y_pos = self.max_y - 1 - self.s // self.max_x\n",
"        self.move_player(x_pos, y_pos)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"def env_agent_config(cfg,seed=1):\n",
"    '''Create the wrapped environment and the Q-learning agent.\n",
"\n",
"    Args:\n",
"        cfg: Configuration object; cfg.env_name selects the gym environment and\n",
"            the whole object is forwarded to QLearning.\n",
"        seed (int, optional): Random seed. Defaults to 1.\n",
"\n",
"    Returns:\n",
"        env: The gym environment wrapped in CliffWalkingWapper.\n",
"        agent: A QLearning agent sized from the environment's spaces.\n",
"    '''\n",
"    env = gym.make(cfg.env_name)\n",
"    env = CliffWalkingWapper(env)\n",
"    env.seed(seed)  # seed the environment\n",
"    # QLearning explores through NumPy's global RNG (np.random.uniform / np.random.choice),\n",
"    # so seed it too; otherwise `seed` does not make training runs repeatable.\n",
"    np.random.seed(seed)\n",
"    n_states = env.observation_space.n  # number of discrete states\n",
"    n_actions = env.action_space.n  # number of discrete actions\n",
"    agent = QLearning(n_states,n_actions,cfg)\n",
"    return env,agent"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4、设置参数\n",
"\n",
"到这里所有qlearning模块就算完成了，下面需要设置一些参数，方便大家“炼丹”，其中默认的是笔者已经调好的。另外还定义了一个画图函数，用来描述奖励的变化。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"import argparse\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"def get_args():\n",
"    '''Build the hyperparameter namespace used by the notebook.\n",
"\n",
"    Returns:\n",
"        argparse.Namespace: every option set to its default value.\n",
"    '''\n",
"    parser = argparse.ArgumentParser(description=\"hyperparameters\")\n",
"    parser.add_argument('--algo_name',default='Q-learning',type=str,help=\"name of algorithm\")\n",
"    parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help=\"name of environment\")\n",
"    parser.add_argument('--train_eps',default=400,type=int,help=\"episodes of training\")  # number of training episodes\n",
"    parser.add_argument('--test_eps',default=20,type=int,help=\"episodes of testing\")  # number of test episodes\n",
"    parser.add_argument('--gamma',default=0.90,type=float,help=\"discounted factor\")  # discount factor\n",
"    parser.add_argument('--epsilon_start',default=0.95,type=float,help=\"initial value of epsilon\")  # initial epsilon of the e-greedy policy\n",
"    parser.add_argument('--epsilon_end',default=0.01,type=float,help=\"final value of epsilon\")  # final epsilon of the e-greedy policy\n",
"    parser.add_argument('--epsilon_decay',default=300,type=int,help=\"decay rate of epsilon\")  # decay constant of epsilon\n",
"    parser.add_argument('--lr',default=0.1,type=float,help=\"learning rate\")\n",
"    parser.add_argument('--device',default='cpu',type=str,help=\"cpu or cuda\")\n",
"    # An empty argument list is parsed, so sys.argv is never consulted and every option takes its default.\n",
"    args = parser.parse_args([])\n",
"    return args\n",
"curr_time = datetime.datetime.now().strftime(\"%Y%m%d-%H%M%S\") # 获取当前时间\n",
"\n",
"def smooth(data, weight=0.9):\n",
"    '''Exponentially smooth a curve, similar to the smoothing slider in Tensorboard.\n",
"\n",
"    Args:\n",
"        data (List): Input values.\n",
"        weight (Float): Smoothing weight between 0 and 1; larger means smoother. Usually 0.9.\n",
"\n",
"    Returns:\n",
"        smoothed (List): Smoothed values, same length as data (empty for empty input).\n",
"    '''\n",
"    if len(data) == 0:  # nothing to smooth; data[0] below would raise IndexError\n",
"        return []\n",
"    last = data[0]  # the running average starts at the first data point\n",
"    smoothed = []\n",
"    for point in data:\n",
"        last = last * weight + (1 - weight) * point  # blend the previous average with the new point\n",
"        smoothed.append(last)\n",
"    return smoothed\n",
"\n",
"def plot_rewards(rewards,cfg, tag='train'):\n",
"    '''Plot the raw and the smoothed per-episode rewards in a new figure.\n",
"\n",
"    Args:\n",
"        rewards: Sequence with one reward per episode.\n",
"        cfg: Object exposing device, algo_name and env_name (used in the title).\n",
"        tag: Phase label such as 'train' or 'test'; shown at the start of the title\n",
"            so the figures of different phases can be told apart.\n",
"    '''\n",
"    sns.set()\n",
"    plt.figure()  # new figure so that several plots can be drawn side by side\n",
"    plt.title(\"{} learning curve on {} of {} for {}\".format(\n",
"        tag, cfg.device, cfg.algo_name, cfg.env_name))\n",
"    plt.xlabel('episodes')\n",
"    plt.plot(rewards, label='rewards')\n",
"    plt.plot(smooth(rewards), label='smoothed')\n",
"    plt.legend()\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5、我准备好了\n",
"\n",
"到现在我们真的可以像海绵宝宝那样大声说出来“我准备好了！”，跟着注释来看下效果吧~"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始训练!\n",
"环境:CliffWalking-v0, 算法:Q-learning, 设备:cpu\n",
"回合1/400奖励-1668.0Epsilon0.3771901652370099\n",
"回合2/400奖励-2328.0Epsilon0.03210668110464856\n",
"回合3/400奖励-152.0Epsilon0.02331928797825333\n",
"回合4/400奖励-296.0Epsilon0.014965661602689185\n",
"回合5/400奖励-168.0Epsilon0.012836430915462094\n",
"回合6/400奖励-149.0Epsilon0.011726126490407173\n",
"回合7/400奖励-274.0Epsilon0.010963239247691907\n",
"回合8/400奖励-127.0Epsilon0.010630787152305933\n",
"回合9/400奖励-356.0Epsilon0.010267816440118822\n",
"回合10/400奖励-105.0Epsilon0.0101887270555826\n",
"回合11/400奖励-162.0Epsilon0.01010998036181645\n",
"回合12/400奖励-124.0Epsilon0.010072745604688937\n",
"回合13/400奖励-125.0Epsilon0.010047956858279448\n",
"回合14/400奖励-69.0Epsilon0.010038103335373512\n",
"回合15/400奖励-146.0Epsilon0.010023421049147612\n",
"回合16/400奖励-99.0Epsilon0.010016837948094095\n",
"回合17/400奖励-102.0Epsilon0.010011984751749595\n",
"回合18/400奖励-114.0Epsilon0.010008195909220538\n",
"回合19/400奖励-95.0Epsilon0.010005971322860786\n",
"回合20/400奖励-50.0Epsilon0.010005054615675078\n",
"回合21/400奖励-179.0Epsilon0.010002783294099886\n",
"回合22/400奖励-51.0Epsilon0.010002348167306314\n",
"回合23/400奖励-53.0Epsilon0.010001967902958245\n",
"回合24/400奖励-126.0Epsilon0.01000129300438042\n",
"回合25/400奖励-105.0Epsilon0.010000911164786836\n",
"回合26/400奖励-55.0Epsilon0.010000758536131584\n",
"回合27/400奖励-112.0Epsilon0.010000522203364875\n",
"回合28/400奖励-81.0Epsilon0.01000039863934062\n",
"回合29/400奖励-187.0Epsilon0.010000297294659517\n",
"回合30/400奖励-176.0Epsilon0.01000022999489198\n",
"回合31/400奖励-71.0Epsilon0.010000181524464132\n",
"回合32/400奖励-77.0Epsilon0.010000140432053464\n",
"回合33/400奖励-82.0Epsilon0.010000106846201706\n",
"回合34/400奖励-95.0Epsilon0.010000077845318887\n",
"回合35/400奖励-53.0Epsilon0.010000065238977184\n",
"回合36/400奖励-30.0Epsilon0.010000059030667672\n",
"回合37/400奖励-122.0Epsilon0.010000039306520976\n",
"回合38/400奖励-37.0Epsilon0.010000034745744355\n",
"回合39/400奖励-100.0Epsilon0.01000002489641374\n",
"回合40/400奖励-201.0Epsilon0.010000017720528442\n",
"回合41/400奖励-62.0Epsilon0.010000014411941012\n",
"回合42/400奖励-61.0Epsilon0.010000011760233133\n",
"回合43/400奖励-57.0Epsilon0.010000009725232207\n",
"回合44/400奖励-73.0Epsilon0.0100000076246806\n",
"回合45/400奖励-39.0Epsilon0.010000006695197199\n",
"回合46/400奖励-71.0Epsilon0.010000005284213373\n",
"回合47/400奖励-77.0Epsilon0.010000004088005098\n",
"回合48/400奖励-53.0Epsilon0.010000003425989836\n",
"回合49/400奖励-88.0Epsilon0.010000002555012459\n",
"回合50/400奖励-65.0Epsilon0.01000000205729175\n",
"回合51/400奖励-41.0Epsilon0.010000001794495218\n",
"回合52/400奖励-67.0Epsilon0.010000001435323749\n",
"回合53/400奖励-38.0Epsilon0.010000001264559407\n",
"回合54/400奖励-50.0Epsilon0.010000001070426428\n",
"回合55/400奖励-35.0Epsilon0.010000000952552966\n",
"回合56/400奖励-74.0Epsilon0.010000000744325952\n",
"回合57/400奖励-75.0Epsilon0.010000000579681634\n",
"回合58/400奖励-31.0Epsilon0.010000000522772152\n",
"回合59/400奖励-38.0Epsilon0.010000000460576537\n",
"回合60/400奖励-51.0Epsilon0.01000000038857222\n",
"回合61/400奖励-64.0Epsilon0.010000000313922366\n",
"回合62/400奖励-78.0Epsilon0.010000000242050338\n",
"回合63/400奖励-41.0Epsilon0.010000000211131054\n",
"回合64/400奖励-62.0Epsilon0.010000000171710922\n",
"回合65/400奖励-58.0Epsilon0.010000000141525377\n",
"回合66/400奖励-34.0Epsilon0.010000000126361357\n",
"回合67/400奖励-52.0Epsilon0.010000000106251867\n",
"回合68/400奖励-28.0Epsilon0.010000000096783744\n",
"回合69/400奖励-57.0Epsilon0.010000000080036202\n",
"回合70/400奖励-39.0Epsilon0.010000000070279423\n",
"回合71/400奖励-55.0Epsilon0.01000000005850696\n",
"回合72/400奖励-33.0Epsilon0.010000000052412531\n",
"回合73/400奖励-62.0Epsilon0.010000000042626625\n",
"回合74/400奖励-56.0Epsilon0.010000000035368174\n",
"回合75/400奖励-34.0Epsilon0.01000000003157858\n",
"回合76/400奖励-37.0Epsilon0.010000000027914485\n",
"回合77/400奖励-149.0Epsilon0.0100000000236291\n",
"回合78/400奖励-46.0Epsilon0.010000000020270076\n",
"回合79/400奖励-28.0Epsilon0.010000000018463805\n",
"回合80/400奖励-37.0Epsilon0.010000000016321432\n",
"回合81/400奖励-64.0Epsilon0.01000000001318587\n",
"回合82/400奖励-52.0Epsilon0.010000000011087433\n",
"回合83/400奖励-22.0Epsilon0.010000000010303453\n",
"回合84/400奖励-32.0Epsilon0.010000000009261004\n",
"回合85/400奖励-74.0Epsilon0.010000000007236559\n",
"回合86/400奖励-33.0Epsilon0.010000000006482756\n",
"回合87/400奖励-39.0Epsilon0.010000000005692478\n",
"回合88/400奖励-40.0Epsilon0.010000000004981906\n",
"回合89/400奖励-33.0Epsilon0.010000000004462961\n",
"回合90/400奖励-47.0Epsilon0.010000000003815783\n",
"回合91/400奖励-45.0Epsilon0.010000000003284274\n",
"回合92/400奖励-28.0Epsilon0.010000000002991612\n",
"回合93/400奖励-45.0Epsilon0.010000000002574904\n",
"回合94/400奖励-56.0Epsilon0.010000000002136451\n",
"回合95/400奖励-31.0Epsilon0.010000000001926707\n",
"回合96/400奖励-38.0Epsilon0.010000000001697481\n",
"回合97/400奖励-50.0Epsilon0.010000000001436887\n",
"回合98/400奖励-41.0Epsilon0.010000000001253341\n",
"回合99/400奖励-41.0Epsilon0.01000000000109324\n",
"回合100/400奖励-13.0Epsilon0.010000000001046878\n",
"回合101/400奖励-45.0Epsilon0.010000000000901057\n",
"回合102/400奖励-19.0Epsilon0.01000000000084576\n",
"回合103/400奖励-44.0Epsilon0.010000000000730383\n",
"回合104/400奖励-23.0Epsilon0.010000000000676478\n",
"回合105/400奖励-40.0Epsilon0.010000000000592037\n",
"回合106/400奖励-52.0Epsilon0.010000000000497817\n",
"回合107/400奖励-38.0Epsilon0.010000000000438592\n",
"回合108/400奖励-24.0Epsilon0.01000000000040487\n",
"回合109/400奖励-32.0Epsilon0.010000000000363909\n",
"回合110/400奖励-38.0Epsilon0.010000000000320614\n",
"回合111/400奖励-52.0Epsilon0.01000000000026959\n",
"回合112/400奖励-22.0Epsilon0.010000000000250527\n",
"回合113/400奖励-38.0Epsilon0.010000000000220721\n",
"回合114/400奖励-33.0Epsilon0.01000000000019773\n",
"回合115/400奖励-29.0Epsilon0.010000000000179511\n",
"回合116/400奖励-56.0Epsilon0.010000000000148944\n",
"回合117/400奖励-20.0Epsilon0.010000000000139338\n",
"回合118/400奖励-31.0Epsilon0.010000000000125658\n",
"回合119/400奖励-33.0Epsilon0.01000000000011257\n",
"回合120/400奖励-39.0Epsilon0.010000000000098846\n",
"回合121/400奖励-26.0Epsilon0.010000000000090641\n",
"回合122/400奖励-31.0Epsilon0.010000000000081742\n",
"回合123/400奖励-40.0Epsilon0.010000000000071538\n",
"回合124/400奖励-33.0Epsilon0.010000000000064086\n",
"回合125/400奖励-46.0Epsilon0.010000000000054977\n",
"回合126/400奖励-28.0Epsilon0.010000000000050078\n",
"回合127/400奖励-23.0Epsilon0.010000000000046382\n",
"回合128/400奖励-30.0Epsilon0.010000000000041968\n",
"回合129/400奖励-24.0Epsilon0.010000000000038742\n",
"回合130/400奖励-36.0Epsilon0.01000000000003436\n",
"回合131/400奖励-28.0Epsilon0.010000000000031298\n",
"回合132/400奖励-28.0Epsilon0.01000000000002851\n",
"回合133/400奖励-35.0Epsilon0.01000000000002537\n",
"回合134/400奖励-27.0Epsilon0.010000000000023187\n",
"回合135/400奖励-30.0Epsilon0.01000000000002098\n",
"回合136/400奖励-35.0Epsilon0.01000000000001867\n",
"回合137/400奖励-31.0Epsilon0.010000000000016837\n",
"回合138/400奖励-27.0Epsilon0.010000000000015387\n",
"回合139/400奖励-48.0Epsilon0.010000000000013113\n",
"回合140/400奖励-23.0Epsilon0.010000000000012145\n",
"回合141/400奖励-29.0Epsilon0.010000000000011026\n",
"回合142/400奖励-21.0Epsilon0.01000000000001028\n",
"回合143/400奖励-22.0Epsilon0.010000000000009553\n",
"回合144/400奖励-42.0Epsilon0.010000000000008306\n",
"回合145/400奖励-21.0Epsilon0.010000000000007744\n",
"回合146/400奖励-141.0Epsilon0.010000000000006733\n",
"回合147/400奖励-43.0Epsilon0.010000000000005834\n",
"回合148/400奖励-44.0Epsilon0.010000000000005038\n",
"回合149/400奖励-18.0Epsilon0.010000000000004745\n",
"回合150/400奖励-23.0Epsilon0.010000000000004394\n",
"回合151/400奖励-24.0Epsilon0.010000000000004056\n",
"回合152/400奖励-30.0Epsilon0.010000000000003671\n",
"回合153/400奖励-27.0Epsilon0.010000000000003355\n",
"回合154/400奖励-15.0Epsilon0.01000000000000319\n",
"回合155/400奖励-19.0Epsilon0.010000000000002994\n",
"回合156/400奖励-50.0Epsilon0.010000000000002535\n",
"回合157/400奖励-22.0Epsilon0.010000000000002356\n",
"回合158/400奖励-28.0Epsilon0.010000000000002146\n",
"回合159/400奖励-27.0Epsilon0.010000000000001962\n",
"回合160/400奖励-13.0Epsilon0.010000000000001879\n",
"回合161/400奖励-33.0Epsilon0.010000000000001683\n",
"回合162/400奖励-24.0Epsilon0.010000000000001553\n",
"回合163/400奖励-30.0Epsilon0.010000000000001405\n",
"回合164/400奖励-19.0Epsilon0.010000000000001319\n",
"回合165/400奖励-22.0Epsilon0.010000000000001227\n",
"回合166/400奖励-32.0Epsilon0.010000000000001102\n",
"回合167/400奖励-35.0Epsilon0.01000000000000098\n",
"回合168/400奖励-32.0Epsilon0.010000000000000881\n",
"回合169/400奖励-21.0Epsilon0.010000000000000822\n",
"回合170/400奖励-27.0Epsilon0.010000000000000751\n",
"回合171/400奖励-22.0Epsilon0.010000000000000698\n",
"回合172/400奖励-22.0Epsilon0.010000000000000649\n",
"回合173/400奖励-34.0Epsilon0.01000000000000058\n",
"回合174/400奖励-22.0Epsilon0.010000000000000538\n",
"回合175/400奖励-27.0Epsilon0.010000000000000491\n",
"回合176/400奖励-13.0Epsilon0.01000000000000047\n",
"回合177/400奖励-29.0Epsilon0.010000000000000427\n",
"回合178/400奖励-20.0Epsilon0.010000000000000401\n",
"回合179/400奖励-22.0Epsilon0.010000000000000371\n",
"回合180/400奖励-33.0Epsilon0.010000000000000333\n",
"回合181/400奖励-20.0Epsilon0.010000000000000312\n",
"回合182/400奖励-26.0Epsilon0.010000000000000286\n",
"回合183/400奖励-22.0Epsilon0.010000000000000266\n",
"回合184/400奖励-29.0Epsilon0.010000000000000241\n",
"回合185/400奖励-25.0Epsilon0.010000000000000222\n",
"回合186/400奖励-16.0Epsilon0.01000000000000021\n",
"回合187/400奖励-28.0Epsilon0.010000000000000191\n",
"回合188/400奖励-23.0Epsilon0.010000000000000177\n",
"回合189/400奖励-31.0Epsilon0.01000000000000016\n",
"回合190/400奖励-17.0Epsilon0.010000000000000151\n",
"回合191/400奖励-22.0Epsilon0.01000000000000014\n",
"回合192/400奖励-18.0Epsilon0.010000000000000132\n",
"回合193/400奖励-34.0Epsilon0.010000000000000118\n",
"回合194/400奖励-32.0Epsilon0.010000000000000106\n",
"回合195/400奖励-14.0Epsilon0.0100000000000001\n",
"回合196/400奖励-23.0Epsilon0.010000000000000094\n",
"回合197/400奖励-23.0Epsilon0.010000000000000087\n",
"回合198/400奖励-28.0Epsilon0.01000000000000008\n",
"回合199/400奖励-24.0Epsilon0.010000000000000073\n",
"回合200/400奖励-21.0Epsilon0.010000000000000068\n",
"回合201/400奖励-15.0Epsilon0.010000000000000064\n",
"回合202/400奖励-16.0Epsilon0.010000000000000061\n",
"回合203/400奖励-22.0Epsilon0.010000000000000057\n",
"回合204/400奖励-28.0Epsilon0.010000000000000052\n",
"回合205/400奖励-25.0Epsilon0.010000000000000049\n",
"回合206/400奖励-16.0Epsilon0.010000000000000045\n",
"回合207/400奖励-13.0Epsilon0.010000000000000044\n",
"回合208/400奖励-31.0Epsilon0.01000000000000004\n",
"回合209/400奖励-25.0Epsilon0.010000000000000037\n",
"回合210/400奖励-21.0Epsilon0.010000000000000033\n",
"回合211/400奖励-26.0Epsilon0.010000000000000031\n",
"回合212/400奖励-13.0Epsilon0.01000000000000003\n",
"回合213/400奖励-15.0Epsilon0.010000000000000028\n",
"回合214/400奖励-23.0Epsilon0.010000000000000026\n",
"回合215/400奖励-23.0Epsilon0.010000000000000024\n",
"回合216/400奖励-13.0Epsilon0.010000000000000023\n",
"回合217/400奖励-21.0Epsilon0.010000000000000021\n",
"回合218/400奖励-28.0Epsilon0.01000000000000002\n",
"回合219/400奖励-24.0Epsilon0.010000000000000018\n",
"回合220/400奖励-20.0Epsilon0.010000000000000018\n",
"回合221/400奖励-13.0Epsilon0.010000000000000016\n",
"回合222/400奖励-15.0Epsilon0.010000000000000016\n",
"回合223/400奖励-27.0Epsilon0.010000000000000014\n",
"回合224/400奖励-18.0Epsilon0.010000000000000014\n",
"回合225/400奖励-20.0Epsilon0.010000000000000012\n",
"回合226/400奖励-27.0Epsilon0.010000000000000012\n",
"回合227/400奖励-18.0Epsilon0.01000000000000001\n",
"回合228/400奖励-15.0Epsilon0.01000000000000001\n",
"回合229/400奖励-19.0Epsilon0.010000000000000009\n",
"回合230/400奖励-20.0Epsilon0.010000000000000009\n",
"回合231/400奖励-13.0Epsilon0.010000000000000009\n",
"回合232/400奖励-28.0Epsilon0.010000000000000007\n",
"回合233/400奖励-38.0Epsilon0.010000000000000007\n",
"回合234/400奖励-17.0Epsilon0.010000000000000007\n",
"回合235/400奖励-22.0Epsilon0.010000000000000005\n",
"回合236/400奖励-13.0Epsilon0.010000000000000005\n",
"回合237/400奖励-20.0Epsilon0.010000000000000005\n",
"回合238/400奖励-18.0Epsilon0.010000000000000005\n",
"回合239/400奖励-14.0Epsilon0.010000000000000005\n",
"回合240/400奖励-13.0Epsilon0.010000000000000005\n",
"回合241/400奖励-28.0Epsilon0.010000000000000004\n",
"回合242/400奖励-13.0Epsilon0.010000000000000004\n",
"回合243/400奖励-23.0Epsilon0.010000000000000004\n",
"回合244/400奖励-17.0Epsilon0.010000000000000004\n",
"回合245/400奖励-14.0Epsilon0.010000000000000004\n",
"回合246/400奖励-22.0Epsilon0.010000000000000004\n",
"回合247/400奖励-15.0Epsilon0.010000000000000004\n",
"回合248/400奖励-19.0Epsilon0.010000000000000004\n",
"回合249/400奖励-17.0Epsilon0.010000000000000004\n",
"回合250/400奖励-27.0Epsilon0.010000000000000002\n",
"回合251/400奖励-21.0Epsilon0.010000000000000002\n",
"回合252/400奖励-23.0Epsilon0.010000000000000002\n",
"回合253/400奖励-15.0Epsilon0.010000000000000002\n",
"回合254/400奖励-15.0Epsilon0.010000000000000002\n",
"回合255/400奖励-13.0Epsilon0.010000000000000002\n",
"回合256/400奖励-15.0Epsilon0.010000000000000002\n",
"回合257/400奖励-13.0Epsilon0.010000000000000002\n",
"回合258/400奖励-28.0Epsilon0.010000000000000002\n",
"回合259/400奖励-13.0Epsilon0.010000000000000002\n",
"回合260/400奖励-13.0Epsilon0.010000000000000002\n",
"回合261/400奖励-23.0Epsilon0.010000000000000002\n",
"回合262/400奖励-13.0Epsilon0.010000000000000002\n",
"回合263/400奖励-24.0Epsilon0.010000000000000002\n",
"回合264/400奖励-17.0Epsilon0.010000000000000002\n",
"回合265/400奖励-19.0Epsilon0.010000000000000002\n",
"回合266/400奖励-13.0Epsilon0.010000000000000002\n",
"回合267/400奖励-25.0Epsilon0.010000000000000002\n",
"回合268/400奖励-15.0Epsilon0.01\n",
"回合269/400奖励-15.0Epsilon0.01\n",
"回合270/400奖励-21.0Epsilon0.01\n",
"回合271/400奖励-13.0Epsilon0.01\n",
"回合272/400奖励-13.0Epsilon0.01\n",
"回合273/400奖励-22.0Epsilon0.01\n",
"回合274/400奖励-15.0Epsilon0.01\n",
"回合275/400奖励-13.0Epsilon0.01\n",
"回合276/400奖励-19.0Epsilon0.01\n",
"回合277/400奖励-13.0Epsilon0.01\n",
"回合278/400奖励-18.0Epsilon0.01\n",
"回合279/400奖励-14.0Epsilon0.01\n",
"回合280/400奖励-126.0Epsilon0.01\n",
"回合281/400奖励-15.0Epsilon0.01\n",
"回合282/400奖励-19.0Epsilon0.01\n",
"回合283/400奖励-13.0Epsilon0.01\n",
"回合284/400奖励-25.0Epsilon0.01\n",
"回合285/400奖励-13.0Epsilon0.01\n",
"回合286/400奖励-119.0Epsilon0.01\n",
"回合287/400奖励-15.0Epsilon0.01\n",
"回合288/400奖励-15.0Epsilon0.01\n",
"回合289/400奖励-14.0Epsilon0.01\n",
"回合290/400奖励-13.0Epsilon0.01\n",
"回合291/400奖励-13.0Epsilon0.01\n",
"回合292/400奖励-15.0Epsilon0.01\n",
"回合293/400奖励-33.0Epsilon0.01\n",
"回合294/400奖励-19.0Epsilon0.01\n",
"回合295/400奖励-13.0Epsilon0.01\n",
"回合296/400奖励-15.0Epsilon0.01\n",
"回合297/400奖励-13.0Epsilon0.01\n",
"回合298/400奖励-132.0Epsilon0.01\n",
"回合299/400奖励-13.0Epsilon0.01\n",
"回合300/400奖励-13.0Epsilon0.01\n",
"回合301/400奖励-13.0Epsilon0.01\n",
"回合302/400奖励-14.0Epsilon0.01\n",
"回合303/400奖励-15.0Epsilon0.01\n",
"回合304/400奖励-13.0Epsilon0.01\n",
"回合305/400奖励-13.0Epsilon0.01\n",
"回合306/400奖励-13.0Epsilon0.01\n",
"回合307/400奖励-13.0Epsilon0.01\n",
"回合308/400奖励-13.0Epsilon0.01\n",
"回合309/400奖励-13.0Epsilon0.01\n",
"回合310/400奖励-13.0Epsilon0.01\n",
"回合311/400奖励-15.0Epsilon0.01\n",
"回合312/400奖励-13.0Epsilon0.01\n",
"回合313/400奖励-13.0Epsilon0.01\n",
"回合314/400奖励-13.0Epsilon0.01\n",
"回合315/400奖励-15.0Epsilon0.01\n",
"回合316/400奖励-14.0Epsilon0.01\n",
"回合317/400奖励-13.0Epsilon0.01\n",
"回合318/400奖励-13.0Epsilon0.01\n",
"回合319/400奖励-13.0Epsilon0.01\n",
"回合320/400奖励-21.0Epsilon0.01\n",
"回合321/400奖励-19.0Epsilon0.01\n",
"回合322/400奖励-13.0Epsilon0.01\n",
"回合323/400奖励-13.0Epsilon0.01\n",
"回合324/400奖励-13.0Epsilon0.01\n",
"回合325/400奖励-13.0Epsilon0.01\n",
"回合326/400奖励-14.0Epsilon0.01\n",
"回合327/400奖励-15.0Epsilon0.01\n",
"回合328/400奖励-13.0Epsilon0.01\n",
"回合329/400奖励-13.0Epsilon0.01\n",
"回合330/400奖励-13.0Epsilon0.01\n",
"回合331/400奖励-13.0Epsilon0.01\n",
"回合332/400奖励-13.0Epsilon0.01\n",
"回合333/400奖励-14.0Epsilon0.01\n",
"回合334/400奖励-13.0Epsilon0.01\n",
"回合335/400奖励-113.0Epsilon0.01\n",
"回合336/400奖励-13.0Epsilon0.01\n",
"回合337/400奖励-13.0Epsilon0.01\n",
"回合338/400奖励-13.0Epsilon0.01\n",
"回合339/400奖励-13.0Epsilon0.01\n",
"回合340/400奖励-13.0Epsilon0.01\n",
"回合341/400奖励-15.0Epsilon0.01\n",
"回合342/400奖励-23.0Epsilon0.01\n",
"回合343/400奖励-13.0Epsilon0.01\n",
"回合344/400奖励-13.0Epsilon0.01\n",
"回合345/400奖励-13.0Epsilon0.01\n",
"回合346/400奖励-13.0Epsilon0.01\n",
"回合347/400奖励-13.0Epsilon0.01\n",
"回合348/400奖励-13.0Epsilon0.01\n",
"回合349/400奖励-13.0Epsilon0.01\n",
"回合350/400奖励-13.0Epsilon0.01\n",
"回合351/400奖励-13.0Epsilon0.01\n",
"回合352/400奖励-13.0Epsilon0.01\n",
"回合353/400奖励-13.0Epsilon0.01\n",
"回合354/400奖励-13.0Epsilon0.01\n",
"回合355/400奖励-13.0Epsilon0.01\n",
"回合356/400奖励-13.0Epsilon0.01\n",
"回合357/400奖励-13.0Epsilon0.01\n",
"回合358/400奖励-13.0Epsilon0.01\n",
"回合359/400奖励-13.0Epsilon0.01\n",
"回合360/400奖励-13.0Epsilon0.01\n",
"回合361/400奖励-13.0Epsilon0.01\n",
"回合362/400奖励-13.0Epsilon0.01\n",
"回合363/400奖励-13.0Epsilon0.01\n",
"回合364/400奖励-13.0Epsilon0.01\n",
"回合365/400奖励-13.0Epsilon0.01\n",
"回合366/400奖励-13.0Epsilon0.01\n",
"回合367/400奖励-13.0Epsilon0.01\n",
"回合368/400奖励-13.0Epsilon0.01\n",
"回合369/400奖励-13.0Epsilon0.01\n",
"回合370/400奖励-13.0Epsilon0.01\n",
"回合371/400奖励-13.0Epsilon0.01\n",
"回合372/400奖励-14.0Epsilon0.01\n",
"回合373/400奖励-13.0Epsilon0.01\n",
"回合374/400奖励-15.0Epsilon0.01\n",
"回合375/400奖励-13.0Epsilon0.01\n",
"回合376/400奖励-13.0Epsilon0.01\n",
"回合377/400奖励-13.0Epsilon0.01\n",
"回合378/400奖励-13.0Epsilon0.01\n",
"回合379/400奖励-13.0Epsilon0.01\n",
"回合380/400奖励-117.0Epsilon0.01\n",
"回合381/400奖励-13.0Epsilon0.01\n",
"回合382/400奖励-13.0Epsilon0.01\n",
"回合383/400奖励-13.0Epsilon0.01\n",
"回合384/400奖励-13.0Epsilon0.01\n",
"回合385/400奖励-13.0Epsilon0.01\n",
"回合386/400奖励-13.0Epsilon0.01\n",
"回合387/400奖励-13.0Epsilon0.01\n",
"回合388/400奖励-13.0Epsilon0.01\n",
"回合389/400奖励-13.0Epsilon0.01\n",
"回合390/400奖励-13.0Epsilon0.01\n",
"回合391/400奖励-13.0Epsilon0.01\n",
"回合392/400奖励-13.0Epsilon0.01\n",
"回合393/400奖励-13.0Epsilon0.01\n",
"回合394/400奖励-13.0Epsilon0.01\n",
"回合395/400奖励-13.0Epsilon0.01\n",
"回合396/400奖励-13.0Epsilon0.01\n",
"回合397/400奖励-13.0Epsilon0.01\n",
"回合398/400奖励-15.0Epsilon0.01\n",
"回合399/400奖励-13.0Epsilon0.01\n",
"回合400/400奖励-13.0Epsilon0.01\n",
"完成训练!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始测试!\n",
"环境CliffWalking-v0, 算法Q-learning, 设备cpu\n",
"回合数1/20, 奖励:-13.0\n",
"回合数2/20, 奖励:-13.0\n",
"回合数3/20, 奖励:-13.0\n",
"回合数4/20, 奖励:-13.0\n",
"回合数5/20, 奖励:-13.0\n",
"回合数6/20, 奖励:-13.0\n",
"回合数7/20, 奖励:-13.0\n",
"回合数8/20, 奖励:-13.0\n",
"回合数9/20, 奖励:-13.0\n",
"回合数10/20, 奖励:-13.0\n",
"回合数11/20, 奖励:-13.0\n",
"回合数12/20, 奖励:-13.0\n",
"回合数13/20, 奖励:-13.0\n",
"回合数14/20, 奖励:-13.0\n",
"回合数15/20, 奖励:-13.0\n",
"回合数16/20, 奖励:-13.0\n",
"回合数17/20, 奖励:-13.0\n",
"回合数18/20, 奖励:-13.0\n",
"回合数19/20, 奖励:-13.0\n",
"回合数20/20, 奖励:-13.0\n",
"完成测试!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build the hyperparameter namespace\n",
"cfg = get_args()\n",
"# Train: create the environment and the agent, run training, then plot the rewards\n",
"env, agent = env_agent_config(cfg)\n",
"res_dic = train(cfg, env, agent)\n",
"plot_rewards(res_dic['rewards'], cfg, tag=\"train\")\n",
"# Test: evaluate the trained agent on the same environment and plot the rewards\n",
"res_dic = test(cfg, env, agent)\n",
"plot_rewards(res_dic['rewards'], cfg, tag=\"test\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.13 ('easyrl')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "8994a120d39b6e6a2ecc94b4007f5314b68aa69fc88a7f00edf21be39b41f49c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}