\documentclass[11pt]{ctexart}
\usepackage{ctex}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{hyperref}
% \usepackage[hidelinks]{hyperref} % remove the red boxes around hyperlinks
\usepackage{setspace}
\usepackage{titlesec}
\usepackage{float} % this package enables the [H] placement specifier
% \pagestyle{plain} % removes the header but keeps the footer page number; use empty instead of plain to remove both
\begin{document}
\tableofcontents % table of contents; compile twice (or save twice in VS Code) for it to show up
% \singlespacing
\clearpage
\section{Template (for later use)}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1] % [1] shows the step numbers
\STATE Test
\end{algorithmic}
\end{algorithm}
\clearpage
\section{Q-learning Algorithm}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{Q-learning Algorithm}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1] % [1] shows the step numbers
\STATE Initialize the Q-table $Q(s,a)$ arbitrarily, except that $Q(s_{\text{terminal}},\cdot)=0$, i.e., the Q-values of terminal states are 0
\FOR {episode $= 1,M$}
\STATE Reset the environment and obtain the initial state $s_1$
\FOR {timestep $= 1,T$}
\STATE Sample an action $a_t$ with the $\varepsilon$-greedy policy
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ in response to $a_t$
\STATE {\bfseries Update:}
\STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma\max _{a}Q(s_{t+1},a)-Q(s_t,a_t)]$
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
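As a complement to the pseudocode above, the following is a minimal tabular Q-learning sketch in Python. The Gym-style environment interface (\texttt{env.reset()} and \texttt{env.step()} returning the next state, the reward and a done flag) and the hyperparameter values are assumptions for illustration, not part of the original pseudocode.
\begin{verbatim}
import numpy as np

def q_learning(env, n_states, n_actions, episodes=500,
               alpha=0.1, gamma=0.99, epsilon=0.1):
    """Minimal tabular Q-learning sketch (assumed Gym-style env API)."""
    Q = np.zeros((n_states, n_actions))      # Q(s, a); terminal rows stay 0
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection
            if np.random.rand() < epsilon:
                action = np.random.randint(n_actions)
            else:
                action = int(np.argmax(Q[state]))
            next_state, reward, done = env.step(action)
            # TD target uses the greedy (max) action at the next state
            target = reward + gamma * np.max(Q[next_state]) * (not done)
            Q[state, action] += alpha * (target - Q[state, action])
            state = next_state               # s_t <- s_{t+1}
    return Q
\end{verbatim}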
\clearpage
\section{Sarsa Algorithm}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{Sarsa Algorithm}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1] % [1] shows the step numbers
\STATE Initialize the Q-table $Q(s,a)$ arbitrarily, except that $Q(s_{\text{terminal}},\cdot)=0$, i.e., the Q-values of terminal states are 0
\FOR {episode $= 1,M$}
\STATE Reset the environment and obtain the initial state $s_1$
\STATE Sample the initial action $a_1$ with the $\varepsilon$-greedy policy
\FOR {timestep $= 1,T$}
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ in response to $a_t$
\STATE Sample the action $a_{t+1}$ at $s_{t+1}$ with the $\varepsilon$-greedy policy
\STATE {\bfseries Update:}
\STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma Q(s_{t+1},a_{t+1})-Q(s_t,a_t)]$
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\STATE Update the action: $a_t \leftarrow a_{t+1}$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
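For comparison with Q-learning, here is a minimal tabular Sarsa sketch in Python under the same assumed Gym-style environment interface; the hyperparameter values are illustrative. The only difference is that the TD target uses the action actually sampled at the next state (on-policy) rather than the greedy action.
\begin{verbatim}
import numpy as np

def sarsa(env, n_states, n_actions, episodes=500,
          alpha=0.1, gamma=0.99, epsilon=0.1):
    """Minimal tabular Sarsa sketch (assumed Gym-style env API)."""
    def choose(Q, state):
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            return np.random.randint(n_actions)
        return int(np.argmax(Q[state]))

    Q = np.zeros((n_states, n_actions))      # Q(s, a); terminal rows stay 0
    for _ in range(episodes):
        state = env.reset()
        action = choose(Q, state)            # sample a_1 before the loop
        done = False
        while not done:
            next_state, reward, done = env.step(action)
            next_action = choose(Q, next_state)   # on-policy: sample a_{t+1}
            target = reward + gamma * Q[next_state, next_action] * (not done)
            Q[state, action] += alpha * (target - Q[state, action])
            state, action = next_state, next_action
    return Q
\end{verbatim}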
\clearpage
\section{Policy Gradient Algorithm}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{REINFORCE (Monte-Carlo Policy Gradient)}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1] % [1] shows the step numbers
\STATE Initialize the policy parameters $\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}$ (e.g., to $\mathbf{0}$)
\FOR {episode $= 1,M$}
\STATE Sample the transitions of one (or several) episodes following the policy $\pi(\cdot \mid \cdot, \boldsymbol{\theta})$
\FOR {each step of the episode, $t = 0,1,\ldots,T-1$}
\STATE Compute the return $G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
\STATE Update the policy: $\boldsymbol{\theta} \leftarrow \boldsymbol{\theta}+\alpha \gamma^{t} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
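Below is a minimal PyTorch sketch of one REINFORCE update computed from a single sampled episode. The network architecture, the episode data layout, and dropping the $\gamma^{t}$ factor from the update (a common implementation simplification) are assumptions for illustration.
\begin{verbatim}
import torch
import torch.nn as nn

class PolicyNet(nn.Module):
    """Small categorical policy pi(a | s, theta); architecture is an assumption."""
    def __init__(self, state_dim, n_actions, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden), nn.Tanh(),
                                 nn.Linear(hidden, n_actions))

    def forward(self, s):
        return torch.distributions.Categorical(logits=self.net(s))

def reinforce_update(policy, optimizer, states, actions, rewards, gamma=0.99):
    """One Monte-Carlo policy-gradient update from one sampled episode."""
    # Returns G_t = sum_{k=t+1}^{T} gamma^{k-t-1} R_k, computed backwards
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns, dtype=torch.float32)
    states = torch.as_tensor(states, dtype=torch.float32)
    actions = torch.as_tensor(actions)
    log_probs = policy(states).log_prob(actions)
    # Gradient ascent on sum_t G_t * grad log pi(a_t | s_t)
    loss = -(returns * log_probs).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
\end{verbatim}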
\clearpage
\section{DQN Algorithm}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{DQN Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\begin{algorithmic}[1]
% \REQUIRE $n \geq 0 \vee x \neq 0$ % input
% \ENSURE $y = x^n$ % output
\STATE Initialize the Q-network (policy network) parameters $\theta$ % initialization
\STATE Copy the parameters to the target network: $\hat{Q} \leftarrow Q$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1,M$}
\STATE Reset the environment and obtain the initial state $s_1$
\FOR {timestep $= 1,T$}
\STATE Sample an action $a_t$ with the $\varepsilon$-greedy policy
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ in response to $a_t$
\STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\STATE {\bfseries Update:}
\STATE Sample a batch of transitions from $D$
\STATE Compute the target $Q$ value: $y_{j}= \begin{cases}r_{j} & \text {if } s_{j+1} \text { is terminal} \\ r_{j}+\gamma \max _{a^{\prime}} \hat{Q}\left(s_{j+1}, a^{\prime}\right) & \text {otherwise}\end{cases}$
\STATE Perform a stochastic gradient descent step on the loss $\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$ with respect to the parameters $\theta$
\ENDFOR
\STATE Every $C$ episodes, copy the parameters: $\hat{Q}\leftarrow Q$ (this can also be moved into the inner loop and done every $C$ steps, as in the original paper, but that is less stable than doing it every $C$ episodes)
\ENDFOR
\end{algorithmic}
\end{algorithm}
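The following is a minimal PyTorch sketch of the update step above: sample a batch from the replay buffer, build the target with the target network, and take a gradient step on the squared error. The network architecture, the replay-buffer layout $(s, a, r, s', \text{done})$, and the hyperparameters are assumptions for illustration.
\begin{verbatim}
import random
import torch
import torch.nn as nn

class QNet(nn.Module):
    """Small MLP Q-network Q(s, .; theta); architecture is an assumption."""
    def __init__(self, state_dim, n_actions, hidden=128):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, n_actions))

    def forward(self, s):
        return self.net(s)

def dqn_update(q_net, target_net, optimizer, replay, batch_size=64, gamma=0.99):
    """One DQN update: sample a batch from the replay buffer and do SGD."""
    if len(replay) < batch_size:
        return
    batch = random.sample(replay, batch_size)   # replay holds (s, a, r, s', done)
    s      = torch.tensor([b[0] for b in batch], dtype=torch.float32)
    a      = torch.tensor([b[1] for b in batch], dtype=torch.int64)
    r      = torch.tensor([b[2] for b in batch], dtype=torch.float32)
    s_next = torch.tensor([b[3] for b in batch], dtype=torch.float32)
    done   = torch.tensor([b[4] for b in batch], dtype=torch.float32)
    # y_j = r_j for terminal s_{j+1}, else r_j + gamma * max_a' Q_target(s_{j+1}, a')
    with torch.no_grad():
        y = r + gamma * (1.0 - done) * target_net(s_next).max(dim=1).values
    q = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
    loss = nn.functional.mse_loss(q, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
\end{verbatim}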
\clearpage
\section{Soft Q-learning Algorithm}
\begin{algorithm}[H]
\floatname{algorithm}{{Soft Q-learning Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1]
\STATE Initialize the parameters $\theta$, $\phi$ % initialization
\STATE Copy the parameters to the targets: $\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1,M$}
\FOR {timestep $= 1,T$}
\STATE Sample an action via $\mathbf{a}_{t} \leftarrow f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)$, where $\xi \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
\STATE The environment returns the reward $r_t$ and the next state $s_{t+1}$ in response to $a_t$
\STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\STATE {\bfseries Update the soft Q-function parameters:}
\STATE For each $s^{(i)}_{t+1}$, sample $\left\{\mathbf{a}^{(i, j)}\right\}_{j=0}^{M} \sim q_{\mathbf{a}^{\prime}}$
\STATE Compute the empirical soft values $V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)$\footnotemark[1]
\STATE Compute the empirical gradient of $J_{Q}(\theta)$\footnotemark[2]
\STATE Update $\theta$ with ADAM using the gradient of $J_{Q}(\theta)$
\STATE {\bfseries Update the policy:}
\STATE For each $s^{(i)}_{t}$, sample $\left\{\xi^{(i, j)}\right\}_{j=0}^{M} \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
\STATE Compute $\mathbf{a}_{t}^{(i, j)}=f^{\phi}\left(\xi^{(i, j)}, \mathbf{s}_{t}^{(i)}\right)$
\STATE Compute $\Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)$ using the empirical estimate\footnotemark[3]
\STATE Compute the empirical estimate of $\frac{\partial J_{\pi}\left(\phi ; \mathbf{s}_{t}\right)}{\partial \phi} \propto \mathbb{E}_{\xi}\left[\Delta f^{\phi}\left(\xi ; \mathbf{s}_{t}\right) \frac{\partial f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)}{\partial \phi}\right]$, i.e., $\hat{\nabla}_{\phi} J_{\pi}$
\STATE Update $\phi$ with ADAM using $\hat{\nabla}_{\phi} J_{\pi}$
\ENDFOR
\STATE Every $C$ episodes, copy the parameters: $\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{$V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)=\alpha \log \mathbb{E}_{q_{\mathbf{a}^{\prime}}}\left[\frac{\exp \left(\frac{1}{\alpha} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right)}{q_{\mathbf{a}^{\prime}}\left(\mathbf{a}^{\prime}\right)}\right]$}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\mathbf{s}_{t} \sim q_{\mathbf{s}_{t}}, \mathbf{a}_{t} \sim q_{\mathbf{a}_{t}}}\left[\frac{1}{2}\left(\hat{Q}_{\mathrm{soft}}^{\bar{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right)^{2}\right]$}
\footnotetext[3]{$\begin{aligned} \Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)=& \mathbb{E}_{\mathbf{a}_{t} \sim \pi^{\phi}}\left[\left.\kappa\left(\mathbf{a}_{t}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right) \nabla_{\mathbf{a}^{\prime}} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right.\\ &\left.+\left.\alpha \nabla_{\mathbf{a}^{\prime}} \kappa\left(\mathbf{a}^{\prime}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right] \end{aligned}$}
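As a small illustration of the sampling step in the soft Q-function update, the sketch below estimates the empirical soft value of footnote 1 with a numerically stable log-sum-exp. The Q-function callable \texttt{q\_soft} and the proposal distribution \texttt{proposal} (assumed to return one log-probability per sampled action, e.g.\ an \texttt{Independent} Gaussian from \texttt{torch.distributions}) are assumptions for illustration.
\begin{verbatim}
import math
import torch

def empirical_soft_value(q_soft, state, proposal, n_samples=32, alpha=1.0):
    """Monte-Carlo estimate of
    V_soft(s) = alpha * log E_{a' ~ q}[ exp(Q_soft(s, a') / alpha) / q(a') ].
    `proposal` is assumed to return a scalar log-probability per sample."""
    actions = proposal.sample((n_samples,))            # a'^{(j)} ~ q_{a'}
    log_q = proposal.log_prob(actions)                 # log q(a'^{(j)})
    states = state.unsqueeze(0).expand(n_samples, -1)  # repeat s for the batch
    q_vals = q_soft(states, actions)                   # Q_soft(s, a'^{(j)})
    # Importance-weighted log-mean-exp, scaled back by alpha
    log_w = q_vals / alpha - log_q
    return alpha * (torch.logsumexp(log_w, dim=0) - math.log(n_samples))
\end{verbatim}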
\clearpage
\section{SAC Algorithm}
\begin{algorithm}[H] % [H] fixes the float in place
\floatname{algorithm}{{Soft Actor-Critic Algorithm}}
\renewcommand{\thealgorithm}{} % remove the algorithm number
\caption{}
\begin{algorithmic}[1]
\STATE Initialize the two Q-network (critic) parameters $\theta_1,\theta_2$ and the policy (actor) network parameters $\phi$ % initialization
\STATE Copy the parameters to the target networks: $\bar{\theta}_1 \leftarrow \theta_1, \bar{\theta}_2 \leftarrow \theta_2$
\STATE Initialize the replay buffer $D$
\FOR {episode $= 1,M$}
\STATE Reset the environment and obtain the initial state $s_1$
\FOR {timestep $= 1,T$}
\STATE Sample an action $\mathbf{a}_{t} \sim \pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)$
\STATE The environment returns the reward and the next state, $\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
\STATE Store the transition in the replay buffer: $\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
\STATE Update the state: $s_t \leftarrow s_{t+1}$
\STATE {\bfseries Update:}
\STATE Update the $Q$-functions: $\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$\footnotemark[1]\footnotemark[2]
\STATE Update the policy weights: $\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$ \footnotemark[3]
\STATE Adjust the temperature: $\alpha \leftarrow \alpha-\lambda \hat{\nabla}_{\alpha} J(\alpha)$ \footnotemark[4]
\STATE Update the target network weights: $\bar{\theta}_{i} \leftarrow \tau \theta_{i}+(1-\tau) \bar{\theta}_{i}$ for $i \in\{1,2\}$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{$J_{Q}(\theta)=\mathbb{E}_{\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right) \sim \mathcal{D}}\left[\frac{1}{2}\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma \mathbb{E}_{\mathbf{s}_{t+1} \sim p}\left[V_{\bar{\theta}}\left(\mathbf{s}_{t+1}\right)\right]\right)\right)^{2}\right]$}
\footnotetext[2]{$\hat{\nabla}_{\theta} J_{Q}(\theta)=\nabla_{\theta} Q_{\theta}\left(\mathbf{a}_{t}, \mathbf{s}_{t}\right)\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma\left(Q_{\bar{\theta}}\left(\mathbf{s}_{t+1}, \mathbf{a}_{t+1}\right)-\alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1}\right)\right)\right)\right)\right)$}
\footnotetext[3]{$\hat{\nabla}_{\phi} J_{\pi}(\phi)=\nabla_{\phi} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)+\left(\nabla_{\mathbf{a}_{t}} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)-\nabla_{\mathbf{a}_{t}} Q\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right) \nabla_{\phi} f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$, where $\mathbf{a}_{t}=f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$}
\footnotetext[4]{$J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$}
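A compact PyTorch sketch of one SAC update step is given below, combining the Q-function, policy, temperature, and target-network updates from the pseudocode and footnotes. The network classes, the optimizer dictionary, the batch layout, and taking the minimum of the two critics (clipped double Q) are assumptions for illustration; \texttt{policy(s)} is assumed to return a reparameterized action together with its log-probability.
\begin{verbatim}
import torch
import torch.nn.functional as F

def sac_update(policy, q1, q2, q1_targ, q2_targ, log_alpha, optimizers,
               batch, gamma=0.99, tau=0.005, target_entropy=-1.0):
    """One SAC update step; `batch` holds tensors (s, a, r, s_next, done)."""
    s, a, r, s_next, done = batch
    alpha = log_alpha.exp()

    # Q-function update (footnotes 1-2), using the min of the two target critics
    with torch.no_grad():
        a_next, logp_next = policy(s_next)
        q_next = torch.min(q1_targ(s_next, a_next), q2_targ(s_next, a_next))
        y = r + gamma * (1.0 - done) * (q_next - alpha * logp_next)
    q_loss = F.mse_loss(q1(s, a), y) + F.mse_loss(q2(s, a), y)
    optimizers["q"].zero_grad(); q_loss.backward(); optimizers["q"].step()

    # Policy update (footnote 3): reparameterized action pushed through the critics
    a_new, logp_new = policy(s)
    pi_loss = (alpha.detach() * logp_new
               - torch.min(q1(s, a_new), q2(s, a_new))).mean()
    optimizers["pi"].zero_grad(); pi_loss.backward(); optimizers["pi"].step()

    # Temperature update (footnote 4); target_entropy is typically -action_dim
    alpha_loss = -(log_alpha * (logp_new.detach() + target_entropy)).mean()
    optimizers["alpha"].zero_grad(); alpha_loss.backward(); optimizers["alpha"].step()

    # Soft target update: theta_bar <- tau * theta + (1 - tau) * theta_bar
    for targ, src in ((q1_targ, q1), (q2_targ, q2)):
        for p_targ, p in zip(targ.parameters(), src.parameters()):
            p_targ.data.mul_(1.0 - tau).add_(tau * p.data)
\end{verbatim}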
\clearpage
\end{document}