Files
easy-rl/projects/notebooks/MonteCarlo.ipynb
2022-11-14 21:35:28 +08:00

481 lines
107 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1、定义算法"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"import numpy as np\n",
"class FisrtVisitMC:\n",
"    ''' On-policy first-visit Monte Carlo control.\n",
"\n",
"    Actions are chosen epsilon-greedily while training (`sample_action`) and\n",
"    greedily while evaluating (`predict_action`). Q(s,a) is the running mean\n",
"    of the first-visit returns observed for the pair (s,a).\n",
"\n",
"    Args:\n",
"        cfg: object providing `n_actions`, `epsilon` (exploration probability)\n",
"            and `gamma` (discount factor).\n",
"    '''\n",
"    def __init__(self,cfg):\n",
"        self.n_actions = cfg.n_actions\n",
"        self.epsilon = cfg.epsilon\n",
"        self.gamma = cfg.gamma\n",
"        # Q-values keyed by str(state); unseen states start as all zeros\n",
"        self.Q_table = defaultdict(lambda: np.zeros(cfg.n_actions))\n",
"        # Accumulated return and number of samples per (str(state), action) pair\n",
"        self.returns_sum = defaultdict(float)\n",
"        self.returns_count = defaultdict(float)\n",
"\n",
"    def sample_action(self,state):\n",
"        ''' Epsilon-greedy action used during training.\n",
"\n",
"        With probability `epsilon` a uniformly random action is returned,\n",
"        otherwise the action with the largest Q(s,a).\n",
"        '''\n",
"        if np.random.uniform(0, 1) > self.epsilon:\n",
"            return np.argmax(self.Q_table[str(state)])  # exploit\n",
"        return np.random.choice(self.n_actions)  # explore\n",
"\n",
"    def predict_action(self,state):\n",
"        ''' Greedy action used during evaluation (no exploration).\n",
"        '''\n",
"        return np.argmax(self.Q_table[str(state)])\n",
"\n",
"    def update(self,one_ep_transition):\n",
"        ''' Update Q from one episode using first-visit returns.\n",
"\n",
"        Args:\n",
"            one_ep_transition: sequence of (state, action, reward) steps in\n",
"                time order.\n",
"        '''\n",
"        # Walk the episode backwards so the discounted return is built\n",
"        # incrementally: G_t = r_t + gamma * G_{t+1}. Earlier steps overwrite\n",
"        # later ones, so each pair ends up with the return of its FIRST visit.\n",
"        first_visit_return = {}\n",
"        G = 0.0\n",
"        for step in reversed(one_ep_transition):\n",
"            G = step[2] + self.gamma * G\n",
"            first_visit_return[(str(step[0]), step[1])] = G\n",
"        # Fold this episode's returns into the running averages\n",
"        for sa_pair, G in first_visit_return.items():\n",
"            state, action = sa_pair\n",
"            self.returns_sum[sa_pair] += G\n",
"            self.returns_count[sa_pair] += 1.0\n",
"            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2、定义训练"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def train(cfg,env,agent):\n",
"    ''' Run `cfg.train_eps` training episodes and return the per-episode rewards.\n",
"\n",
"    Args:\n",
"        cfg: configuration providing env_name, algo_name, device, train_eps,\n",
"            max_steps and seed.\n",
"        env: environment whose `reset(seed=...)` returns a state and whose\n",
"            `step(action)` returns (next_state, reward, terminated, info).\n",
"        agent: object exposing `sample_action(state)` and `update(episode)`.\n",
"    Returns:\n",
"        dict with key 'rewards' holding the list of episode rewards.\n",
"    '''\n",
"    print('开始训练!')\n",
"    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n",
"    rewards = []  # reward of every episode\n",
"    for i_ep in range(cfg.train_eps):\n",
"        ep_reward = 0  # reward accumulated in this episode\n",
"        one_ep_transition = []  # (state, action, reward) of every step\n",
"        state = env.reset(seed=cfg.seed)  # start a new episode\n",
"        for _ in range(cfg.max_steps):\n",
"            action = agent.sample_action(state)\n",
"            next_state, reward, terminated, info = env.step(action)\n",
"            one_ep_transition.append((state, action, reward))\n",
"            state = next_state\n",
"            ep_reward += reward\n",
"            if terminated:\n",
"                break\n",
"        # Monte Carlo learns from complete returns, so update once per episode.\n",
"        # Updating inside the step loop would repeatedly add truncated returns.\n",
"        agent.update(one_ep_transition)\n",
"        rewards.append(ep_reward)\n",
"        if (i_ep+1)%10==0:\n",
"            print(f\"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}\")\n",
"    print('完成训练!')\n",
"    return {\"rewards\":rewards}\n",
"def test(cfg,env,agent):\n",
"    ''' Run `cfg.test_eps` evaluation episodes and return the per-episode rewards.\n",
"\n",
"    The agent is only queried through `predict_action`; it is never updated here.\n",
"\n",
"    Returns:\n",
"        dict with key 'rewards' holding the list of episode rewards.\n",
"    '''\n",
"    print('开始测试!')\n",
"    print(f'环境:{cfg.env_name}, 算法:{cfg.algo_name}, 设备:{cfg.device}')\n",
"    rewards = []\n",
"    for i_ep in range(cfg.test_eps):\n",
"        # Every episode starts from a freshly reset environment\n",
"        state = env.reset(seed=cfg.seed)\n",
"        ep_reward = 0\n",
"        for _ in range(cfg.max_steps):\n",
"            # Act, then carry the observed next state into the following step\n",
"            state, reward, terminated, info = env.step(agent.predict_action(state))\n",
"            ep_reward += reward\n",
"            if terminated:\n",
"                break\n",
"        rewards.append(ep_reward)\n",
"        print(f\"回合数:{i_ep+1}/{cfg.test_eps}, 奖励:{ep_reward:.1f}\")\n",
"    print('完成测试!')\n",
"    return {\"rewards\":rewards}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3、定义环境"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import turtle\n",
"import numpy as np\n",
"\n",
"# turtle tutorial : https://docs.python.org/3.3/library/turtle.html\n",
"\n",
"class CliffWalkingWapper(gym.Wrapper):\n",
"    ''' gym wrapper that draws the grid world with the `turtle` module.\n",
"\n",
"    The board is `max_x` x `max_y` cells (12 x 4), each `unit` (50) wide in\n",
"    turtle world coordinates. Nothing is drawn until `render` is first called.\n",
"    '''\n",
"    def __init__(self, env):\n",
"        gym.Wrapper.__init__(self, env)\n",
"        self.t = None  # turtle pen, created lazily by render()\n",
"        self.unit = 50\n",
"        self.max_x = 12\n",
"        self.max_y = 4\n",
"\n",
"    def draw_x_line(self, y, x0, x1, color='gray'):\n",
"        ''' Draw a horizontal line at height `y` from `x0` to `x1` (x1 > x0). '''\n",
"        assert x1 > x0\n",
"        self.t.color(color)\n",
"        self.t.setheading(0)\n",
"        self.t.up()\n",
"        self.t.goto(x0, y)\n",
"        self.t.down()\n",
"        self.t.forward(x1 - x0)\n",
"\n",
"    def draw_y_line(self, x, y0, y1, color='gray'):\n",
"        ''' Draw a vertical line at `x` from `y0` to `y1` (y1 > y0). '''\n",
"        assert y1 > y0\n",
"        self.t.color(color)\n",
"        self.t.setheading(90)\n",
"        self.t.up()\n",
"        self.t.goto(x, y0)\n",
"        self.t.down()\n",
"        self.t.forward(y1 - y0)\n",
"\n",
"    def draw_box(self, x, y, fillcolor='', line_color='gray'):\n",
"        ''' Draw and fill the cell whose lower-left corner is grid position (x, y). '''\n",
"        self.t.up()\n",
"        self.t.goto(x * self.unit, y * self.unit)\n",
"        self.t.color(line_color)\n",
"        self.t.fillcolor(fillcolor)\n",
"        self.t.setheading(90)\n",
"        self.t.down()\n",
"        self.t.begin_fill()\n",
"        for i in range(4):\n",
"            self.t.forward(self.unit)\n",
"            self.t.right(90)\n",
"        self.t.end_fill()\n",
"\n",
"    def move_player(self, x, y):\n",
"        ''' Move the pen (pen up, no trace) to the centre of cell (x, y). '''\n",
"        self.t.up()\n",
"        self.t.setheading(90)\n",
"        self.t.fillcolor('red')\n",
"        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)\n",
"\n",
"    def render(self):\n",
"        ''' Draw the board on the first call, then place the player marker. '''\n",
"        if self.t is None:\n",
"            self.t = turtle.Turtle()\n",
"            self.wn = turtle.Screen()\n",
"            self.wn.setup(self.unit * self.max_x + 100,\n",
"                          self.unit * self.max_y + 100)\n",
"            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,\n",
"                                        self.unit * self.max_y)\n",
"            self.t.shape('circle')\n",
"            self.t.width(2)\n",
"            self.t.speed(0)\n",
"            self.t.color('gray')\n",
"            # Outer border\n",
"            for _ in range(2):\n",
"                self.t.forward(self.max_x * self.unit)\n",
"                self.t.left(90)\n",
"                self.t.forward(self.max_y * self.unit)\n",
"                self.t.left(90)\n",
"            # Inner grid lines\n",
"            for i in range(1, self.max_y):\n",
"                self.draw_x_line(\n",
"                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)\n",
"            for i in range(1, self.max_x):\n",
"                self.draw_y_line(\n",
"                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)\n",
"\n",
"            # Bottom row: black cells between the two corners (presumably the\n",
"            # cliff) and a yellow cell in the right corner (presumably the goal)\n",
"            for i in range(1, self.max_x - 1):\n",
"                self.draw_box(i, 0, 'black')\n",
"            self.draw_box(self.max_x - 1, 0, 'yellow')\n",
"            self.t.shape('turtle')\n",
"\n",
"        # self.s is not set in this class; presumably it is the wrapped env's\n",
"        # current state index (row-major, row 0 at the top) - verify against env\n",
"        row, x_pos = divmod(self.s, self.max_x)\n",
"        y_pos = self.max_y - 1 - row\n",
"        self.move_player(x_pos, y_pos)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import gym\n",
"import os\n",
"def all_seed(env,seed = 1):\n",
"    ''' Seed the environment, NumPy, `random` and PyTorch with the same value.\n",
"\n",
"    Call it right after the environment has been created.\n",
"\n",
"    Args:\n",
"        env: environment exposing a `seed(seed)` method.\n",
"        seed (int, optional): seed value. Defaults to 1.\n",
"    '''\n",
"    import random\n",
"    import numpy as np\n",
"    import torch\n",
"    env.seed(seed)  # environment\n",
"    # NumPy, Python's random, torch on CPU, torch on GPU - in that order\n",
"    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):\n",
"        seed_fn(seed)\n",
"    os.environ['PYTHONHASHSEED'] = str(seed)  # hash seed for Python scripts\n",
"    # cuDNN settings\n",
"    cudnn = torch.backends.cudnn\n",
"    cudnn.deterministic = True\n",
"    cudnn.benchmark = False\n",
"    cudnn.enabled = False\n",
" \n",
"def env_agent_config(cfg):\n",
"    ''' Create the wrapped environment and the agent.\n",
"\n",
"    Also stores `n_states` and `n_actions` on `cfg`.\n",
"\n",
"    Returns:\n",
"        (env, agent) tuple.\n",
"    '''\n",
"    env = CliffWalkingWapper(gym.make(cfg.env_name,new_step_api=True))\n",
"    if cfg.seed != 0:  # a seed of 0 means: do not seed\n",
"        all_seed(env,seed=cfg.seed)\n",
"    # Discrete spaces expose `n`; otherwise fall back to the first shape entry\n",
"    obs_space = env.observation_space\n",
"    cfg.n_states = obs_space.n if hasattr(obs_space, 'n') else obs_space.shape[0]\n",
"    cfg.n_actions = env.action_space.n\n",
"    return env, FisrtVisitMC(cfg)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4、设置参数"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"class Config:\n",
"    ''' Hyper-parameters and run settings for the experiment.\n",
"    '''\n",
"    def __init__(self):\n",
"        # What is run\n",
"        self.env_name, self.algo_name = 'CliffWalking-v0', 'FirstVisitMC'\n",
"        # How long it runs: training episodes, test episodes, step cap per episode\n",
"        self.train_eps, self.test_eps, self.max_steps = 400, 20, 200\n",
"        # Learning settings: exploration probability, discount factor, learning rate\n",
"        self.epsilon, self.gamma, self.lr = 0.1, 0.9, 0.5\n",
"        self.seed = 1  # random seed\n",
"        # The CPU is always used; no GPU detection is performed\n",
"        self.device = torch.device('cpu')\n",
"def smooth(data, weight=0.9):\n",
"    ''' Exponentially smooth a sequence of values for plotting.\n",
"\n",
"    Args:\n",
"        data: indexable sequence of numbers.\n",
"        weight: share of the previous smoothed value kept at every step.\n",
"    Returns:\n",
"        list of smoothed values with the same length as `data`; an empty list\n",
"        when `data` is empty.\n",
"    '''\n",
"    if len(data) == 0:  # nothing to smooth; data[0] below would raise IndexError\n",
"        return []\n",
"    last = data[0]  # the first point seeds the running average\n",
"    smoothed = []\n",
"    for point in data:\n",
"        last = last * weight + (1 - weight) * point\n",
"        smoothed.append(last)\n",
"    return smoothed\n",
"\n",
"def plot_rewards(rewards,title=\"learning curve\"):\n",
"    ''' Plot the raw and the smoothed reward of every episode in a new figure.\n",
"\n",
"    Args:\n",
"        rewards: sequence with one reward per episode.\n",
"        title: figure title.\n",
"    '''\n",
"    sns.set()\n",
"    plt.figure()  # new figure, so several curves can be drawn in one run\n",
"    plt.title(f\"{title}\")\n",
"    # xlim takes only (left, right); a third positional value is not a tick step\n",
"    plt.xlim(0, len(rewards))\n",
"    plt.xlabel('episodes')\n",
"    plt.plot(rewards, label='rewards')\n",
"    plt.plot(smooth(rewards), label='smoothed')\n",
"    plt.legend()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5、我准备好了"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"开始训练!\n",
"环境:CliffWalking-v0, 算法:FirstVisitMC, 设备:cpu\n",
"回合10/400奖励-200.0\n",
"回合20/400奖励-200.0\n",
"回合30/400奖励-200.0\n",
"回合40/400奖励-200.0\n",
"回合50/400奖励-200.0\n",
"回合60/400奖励-200.0\n",
"回合70/400奖励-200.0\n",
"回合80/400奖励-200.0\n",
"回合90/400奖励-200.0\n",
"回合100/400奖励-200.0\n",
"回合110/400奖励-200.0\n",
"回合120/400奖励-200.0\n",
"回合130/400奖励-200.0\n",
"回合140/400奖励-200.0\n",
"回合150/400奖励-200.0\n",
"回合160/400奖励-200.0\n",
"回合170/400奖励-200.0\n",
"回合180/400奖励-200.0\n",
"回合190/400奖励-200.0\n",
"回合200/400奖励-200.0\n",
"回合210/400奖励-200.0\n",
"回合220/400奖励-200.0\n",
"回合230/400奖励-200.0\n",
"回合240/400奖励-200.0\n",
"回合250/400奖励-200.0\n",
"回合260/400奖励-200.0\n",
"回合270/400奖励-299.0\n",
"回合280/400奖励-200.0\n",
"回合290/400奖励-200.0\n",
"回合300/400奖励-200.0\n",
"回合310/400奖励-200.0\n",
"回合320/400奖励-200.0\n",
"回合330/400奖励-200.0\n",
"回合340/400奖励-200.0\n",
"回合350/400奖励-200.0\n",
"回合360/400奖励-200.0\n",
"回合370/400奖励-200.0\n",
"回合380/400奖励-200.0\n",
"回合390/400奖励-200.0\n",
"回合400/400奖励-200.0\n",
"完成训练!\n",
"开始测试!\n",
"环境CliffWalking-v0, 算法FirstVisitMC, 设备cpu\n",
"回合数1/20, 奖励:-200.0\n",
"回合数2/20, 奖励:-200.0\n",
"回合数3/20, 奖励:-200.0\n",
"回合数4/20, 奖励:-200.0\n",
"回合数5/20, 奖励:-200.0\n",
"回合数6/20, 奖励:-200.0\n",
"回合数7/20, 奖励:-200.0\n",
"回合数8/20, 奖励:-200.0\n",
"回合数9/20, 奖励:-200.0\n",
"回合数10/20, 奖励:-299.0\n",
"回合数11/20, 奖励:-200.0\n",
"回合数12/20, 奖励:-200.0\n",
"回合数13/20, 奖励:-200.0\n",
"回合数14/20, 奖励:-200.0\n",
"回合数15/20, 奖励:-200.0\n",
"回合数16/20, 奖励:-200.0\n",
"回合数17/20, 奖励:-200.0\n",
"回合数18/20, 奖励:-200.0\n",
"回合数19/20, 奖励:-200.0\n",
"回合数20/20, 奖励:-200.0\n",
"完成测试!\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 获取参数\n",
"cfg = Config() \n",
"# 训练\n",
"env, agent = env_agent_config(cfg)\n",
"res_dic = train(cfg, env, agent)\n",
" \n",
"plot_rewards(res_dic['rewards'], title=f\"training curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}\") \n",
"# 测试\n",
"res_dic = test(cfg, env, agent)\n",
"plot_rewards(res_dic['rewards'], title=f\"testing curve on {cfg.device} of {cfg.algo_name} for {cfg.env_name}\") # 画出结果"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.12 ('easyrl')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "f5a9629e9f3b9957bf68a43815f911e93447d47b3d065b6a8a04975e44c504d9"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}