\documentclass[11pt]{article}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{hyperref} % use \usepackage[hidelinks]{hyperref} to remove the red boxes around links
\usepackage{setspace}
\usepackage{titlesec}
\usepackage{float} % provides the [H] placement specifier
% \pagestyle{plain} % removes the header but keeps the page number; use empty to remove both

% Use circled numbers for footnotes
\usepackage{pifont}
\makeatletter
\newcommand*{\circnum}[1]{%
  \expandafter\@circnum\csname c@#1\endcsname
}
\newcommand*{\@circnum}[1]{%
  \ifnum#1<1 %
    \@ctrerr
  \else
    \ifnum#1>20 %
      \@ctrerr
    \else
      \ding{\the\numexpr 171+(#1)\relax}%
    \fi
  \fi
}
\makeatother
\renewcommand*{\thefootnote}{\circnum{footnote}}

\begin{document}

\tableofcontents % table of contents; compile twice (or save twice in VS Code) for it to show
% \singlespacing
\clearpage

\section{Spare template}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{Algorithm}}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Test
  \end{algorithmic}
\end{algorithm}
\clearpage

\section{Q-learning}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{Q-learning}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Initialize the Q-table $Q(s,a)$ arbitrarily, except $Q(s_{\text{terminal}},\cdot)=0$, i.e., the Q-values of terminal states are zero
    \FOR {episode $= 1,\ldots,M$}
      \STATE Reset the environment and obtain the initial state $s_1$
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE Sample action $a_t$ with the $\varepsilon$-greedy policy
        \STATE The environment returns reward $r_t$ and next state $s_{t+1}$ given $a_t$
        \STATE {\bfseries Update:}
        \STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma\max_{a}Q(s_{t+1},a)-Q(s_t,a_t)]$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
      \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
\clearpage

\section{Sarsa}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{Sarsa}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Initialize the Q-table $Q(s,a)$ arbitrarily, except $Q(s_{\text{terminal}},\cdot)=0$, i.e., the Q-values of terminal states are zero
    \FOR {episode $= 1,\ldots,M$}
      \STATE Reset the environment and obtain the initial state $s_1$
      \STATE Sample the initial action $a_1$ with the $\varepsilon$-greedy policy
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE The environment returns reward $r_t$ and next state $s_{t+1}$ given $a_t$
        \STATE Sample action $a_{t+1}$ at state $s_{t+1}$ with the $\varepsilon$-greedy policy
        \STATE {\bfseries Update:}
        \STATE $Q(s_t,a_t) \leftarrow Q(s_t,a_t)+\alpha[r_t+\gamma Q(s_{t+1},a_{t+1})-Q(s_t,a_t)]$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
        \STATE Update the action: $a_t \leftarrow a_{t+1}$
      \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
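To make the two tabular updates above concrete, here is a minimal NumPy sketch that runs both; Q-learning and Sarsa differ only in the TD target, as marked in the comments. The five-state chain environment and the hyperparameters are assumptions made purely so the snippet runs, not part of either algorithm.

\begin{verbatim}
import numpy as np

n_states, n_actions = 5, 2
alpha, gamma, epsilon = 0.1, 0.9, 0.1
rng = np.random.default_rng(0)

def step(s, a):
    """Toy chain dynamics: action 0 moves left, action 1 moves right."""
    s_next = max(0, min(n_states - 1, s + (1 if a == 1 else -1)))
    reward = 1.0 if s_next == n_states - 1 else 0.0
    return s_next, reward, s_next == n_states - 1

def epsilon_greedy(Q, s):
    if rng.random() < epsilon:
        return int(rng.integers(n_actions))
    # greedy action with random tie-breaking
    return int(rng.choice(np.flatnonzero(Q[s] == Q[s].max())))

Q = np.zeros((n_states, n_actions))   # Q(s_terminal, .) stays 0
for episode in range(200):
    s, done = 0, False
    a = epsilon_greedy(Q, s)          # Sarsa samples the first action here
    while not done:
        s_next, r, done = step(s, a)
        a_next = epsilon_greedy(Q, s_next)
        # Q-learning target uses max_a Q(s', a) (off-policy):
        # td_target = r + gamma * np.max(Q[s_next]) * (not done)
        # Sarsa target uses the actually sampled a' (on-policy):
        td_target = r + gamma * Q[s_next, a_next] * (not done)
        Q[s, a] += alpha * (td_target - Q[s, a])
        s, a = s_next, a_next         # s_t <- s_{t+1}, a_t <- a_{t+1}

print(Q)
\end{verbatim}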
\clearpage

\section{DQN}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{DQN}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \renewcommand{\algorithmicrequire}{\textbf{Input:}}
  \renewcommand{\algorithmicensure}{\textbf{Output:}}
  \begin{algorithmic}[1]
    % \REQUIRE $n \geq 0 \vee x \neq 0$ % input
    % \ENSURE $y = x^n$ % output
    \STATE Initialize the policy network parameters $\theta$ % initialization
    \STATE Copy the parameters to the target network: $\hat{Q} \leftarrow Q$
    \STATE Initialize the replay buffer $D$
    \FOR {episode $= 1,\ldots,M$}
      \STATE Reset the environment and obtain the initial state $s_1$
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE Sample action $a_t$ with the $\varepsilon$-greedy policy
        \STATE The environment returns reward $r_t$ and next state $s_{t+1}$ given $a_t$
        \STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
        \STATE {\bfseries Update:}
        \STATE Sample a batch of transitions from $D$
        \STATE Compute the targets $y_{i}$\footnotemark[2]
        \STATE Perform a stochastic gradient descent step on the loss $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$ with respect to $\theta$\footnotemark[3]
      \ENDFOR
      \STATE Every $C$ episodes, copy the parameters: $\hat{Q}\leftarrow Q$\footnotemark[4]
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Playing Atari with Deep Reinforcement Learning}
\footnotetext[2]{$y_{i}= \begin{cases}r_{i} & \text {for terminal } s_{i+1} \\ r_{i}+\gamma \max _{a^{\prime}} \hat{Q}\left(s_{i+1}, a^{\prime}\right) & \text {for non-terminal } s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{As in the original paper, this copy can instead go inside the inner loop and happen every $C$ steps, but every $C$ episodes is more stable.}
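The following sketch shows one DQN update step, assuming PyTorch is available: the targets $y_i$ computed from the target network $\hat{Q}$ and one SGD step on the squared loss. The network sizes and the random transitions that stand in for the replay buffer $D$ are assumptions for illustration only.

\begin{verbatim}
import random
import torch
import torch.nn as nn

obs_dim, n_actions, gamma = 4, 2, 0.99

def make_q_net():
    return nn.Sequential(nn.Linear(obs_dim, 64), nn.ReLU(),
                         nn.Linear(64, n_actions))

q_net = make_q_net()                               # Q(.; theta)
target_net = make_q_net()                          # \hat{Q}
target_net.load_state_dict(q_net.state_dict())     # \hat{Q} <- Q
optimizer = torch.optim.SGD(q_net.parameters(), lr=1e-3)

# Stand-in replay buffer D: (s, a, r, s', done) tuples.
D = [(torch.randn(obs_dim), random.randrange(n_actions),
      random.random(), torch.randn(obs_dim), random.random() < 0.1)
     for _ in range(1000)]

def update(batch_size=32):
    batch = random.sample(D, batch_size)           # sample a batch from D
    s, a, r, s2, done = zip(*batch)
    s, s2 = torch.stack(s), torch.stack(s2)
    a, r, done = torch.tensor(a), torch.tensor(r), torch.tensor(done)
    with torch.no_grad():
        # y_i = r_i for terminal s_{i+1};
        # y_i = r_i + gamma * max_a' Q_hat(s_{i+1}, a') otherwise
        y = r + gamma * target_net(s2).max(dim=1).values * ~done
    q_sa = q_net(s).gather(1, a.unsqueeze(1)).squeeze(1)   # Q(s_i, a_i; theta)
    loss = ((y - q_sa) ** 2).mean()                # L(theta)
    optimizer.zero_grad()
    loss.backward()                                # SGD step on L(theta)
    optimizer.step()

for _ in range(10):                                # a few illustrative steps
    update()
target_net.load_state_dict(q_net.state_dict())     # the every-C-episodes copy
\end{verbatim}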
\clearpage

\section{Policy Gradient}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{REINFORCE: Monte-Carlo Policy Gradient}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Initialize the policy parameters $\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}$ (e.g., to $\mathbf{0}$)
    \FOR {episode $= 1,\ldots,M$}
      \STATE Sample one (or several) episodes of transitions with the policy $\pi(\cdot \mid \cdot, \boldsymbol{\theta})$
      \FOR {timestep $t = 0,1,\ldots,T-1$}
        \STATE Compute the return $G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
        \STATE Update the policy: $\boldsymbol{\theta} \leftarrow \boldsymbol{\theta}+\alpha \gamma^{t} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
      \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Reinforcement Learning: An Introduction}
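A minimal sketch of the REINFORCE update, again assuming PyTorch. The linear softmax policy and the random stand-in episode are assumptions for illustration; in the code, \texttt{rewards[t]} plays the role of $R_{t+1}$ in the pseudocode above, so the returns-to-go are computed with 0-indexed sums.

\begin{verbatim}
import torch
import torch.nn as nn

obs_dim, n_actions, gamma, alpha = 4, 2, 0.99, 1e-2
policy = nn.Linear(obs_dim, n_actions)        # logits of pi(a | s, theta)
optimizer = torch.optim.SGD(policy.parameters(), lr=alpha)

def run_episode(T=20):
    """Stand-in rollout: random states, actions from pi, random rewards."""
    states = [torch.randn(obs_dim) for _ in range(T)]
    actions, rewards = [], []
    for s in states:
        dist = torch.distributions.Categorical(logits=policy(s))
        actions.append(dist.sample())
        rewards.append(float(torch.randn(())))
    return states, actions, rewards

states, actions, rewards = run_episode()
T = len(rewards)
loss = torch.zeros(())
for t in range(T):
    # G <- sum_{k=t+1}^{T} gamma^{k-t-1} R_k  (return to go, 0-indexed here)
    G = sum(gamma ** (k - t) * rewards[k] for k in range(t, T))
    dist = torch.distributions.Categorical(logits=policy(states[t]))
    # gradient ascent on gamma^t * G * ln pi(A_t | S_t, theta),
    # written as a negated loss for the SGD optimizer
    loss = loss - gamma ** t * G * dist.log_prob(actions[t])
optimizer.zero_grad()
loss.backward()
optimizer.step()
\end{verbatim}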
\clearpage

\section{Advantage Actor-Critic}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{Q Actor-Critic}}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Initialize the actor parameters $\theta$ and the critic parameters $w$
    \FOR {episode $= 1,\ldots,M$}
      \STATE Sample one (or several) episodes of transitions with the policy $\pi_{\theta}(a|s)$
      \STATE {\bfseries Update the critic parameters\footnotemark[1]}
      \FOR {timestep $t = T-1,\ldots,1$}
        \STATE Compute the advantage $\delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1})-Q_w(s_t,a_t)$
        \STATE $w \leftarrow w+\alpha_{w} \delta_{t} \nabla_{w} Q_w(s_t,a_t)$
        \STATE $a_t \leftarrow a_{t+1}$, $s_t \leftarrow s_{t+1}$
      \ENDFOR
      \STATE Update the actor parameters: $\theta \leftarrow \theta+\alpha_{\theta} Q_{w}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)$
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Given the structure of the TD error, it is more convenient to compute the advantages backwards in time, from the last timestep down to the first.}
\clearpage

\section{Soft Q-learning}
\begin{algorithm}[H]
  \floatname{algorithm}{{Soft Q-learning}}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1]
    \STATE Initialize the parameters $\theta$ and $\phi$ % initialization
    \STATE Copy the parameters: $\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
    \STATE Initialize the replay buffer $D$
    \FOR {episode $= 1,\ldots,M$}
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE Sample an action with $\mathbf{a}_{t} \leftarrow f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)$, where $\xi \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
        \STATE The environment returns reward $r_t$ and next state $s_{t+1}$ given $a_t$
        \STATE Store the transition $(s_t,a_t,r_t,s_{t+1})$ in the replay buffer $D$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
        \STATE {\bfseries Update the soft Q-function parameters:}
        \STATE For each $s^{(i)}_{t+1}$, sample $\left\{\mathbf{a}^{(i, j)}\right\}_{j=0}^{M} \sim q_{\mathbf{a}^{\prime}}$
        \STATE Compute the empirical soft values $V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)$\footnotemark[1]
        \STATE Compute the empirical gradient of $J_{Q}(\theta)$\footnotemark[2]
        \STATE Update $\theta$ with ADAM using this gradient
        \STATE {\bfseries Update the policy:}
        \STATE For each $s^{(i)}_{t}$, sample $\left\{\xi^{(i, j)}\right\}_{j=0}^{M} \sim \mathcal{N}(\mathbf{0}, \boldsymbol{I})$
        \STATE Compute $\mathbf{a}_{t}^{(i, j)}=f^{\phi}\left(\xi^{(i, j)}, \mathbf{s}_{t}^{(i)}\right)$
        \STATE Compute $\Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)$ with empirical estimates\footnotemark[3]
        \STATE Compute the empirical estimate of $\frac{\partial J_{\pi}\left(\phi ; \mathbf{s}_{t}\right)}{\partial \phi} \propto \mathbb{E}_{\xi}\left[\Delta f^{\phi}\left(\xi ; \mathbf{s}_{t}\right) \frac{\partial f^{\phi}\left(\xi ; \mathbf{s}_{t}\right)}{\partial \phi}\right]$, i.e., $\hat{\nabla}_{\phi} J_{\pi}$
        \STATE Update $\phi$ with ADAM using $\hat{\nabla}_{\phi} J_{\pi}$
      \ENDFOR
      \STATE Every $C$ episodes, copy the parameters: $\bar{\theta} \leftarrow \theta, \bar{\phi} \leftarrow \phi$
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{$V_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}\right)=\alpha \log \mathbb{E}_{q_{\mathbf{a}^{\prime}}}\left[\frac{\exp \left(\frac{1}{\alpha} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right)}{q_{\mathbf{a}^{\prime}}\left(\mathbf{a}^{\prime}\right)}\right]$}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\mathbf{s}_{t} \sim q_{\mathbf{s}_{t}}, \mathbf{a}_{t} \sim q_{\mathbf{a}_{t}}}\left[\frac{1}{2}\left(\hat{Q}_{\mathrm{soft}}^{\bar{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right)^{2}\right]$}
\footnotetext[3]{$\begin{aligned} \Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)=& \mathbb{E}_{\mathbf{a}_{t} \sim \pi^{\phi}}\left[\left.\kappa\left(\mathbf{a}_{t}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right) \nabla_{\mathbf{a}^{\prime}} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right.\\ &\left.+\left.\alpha \nabla_{\mathbf{a}^{\prime}} \kappa\left(\mathbf{a}^{\prime}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right] \end{aligned}$}
\clearpage

\section{SAC-S}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{SAC-S}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1] % [1] numbers the steps
    \STATE Initialize the parameters $\psi, \bar{\psi}, \theta, \phi$
    \FOR {episode $= 1,\ldots,M$}
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE Sample action $a_t$ with $\boldsymbol{a}_{t} \sim \pi_{\phi}\left(\boldsymbol{a}_{t} \mid \mathbf{s}_{t}\right)$
        \STATE The environment returns the reward and next state, $\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
        \STATE Store the transition in the replay buffer: $\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
        \STATE {\bfseries Update:}
        \STATE $\psi \leftarrow \psi-\lambda_{V} \hat{\nabla}_{\psi} J_{V}(\psi)$
        \STATE $\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$
        \STATE $\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$
        \STATE $\bar{\psi} \leftarrow \tau \psi+(1-\tau) \bar{\psi}$
      \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor}
\clearpage

\section{SAC}
\begin{algorithm}[H] % [H] pins the float in place
  \floatname{algorithm}{{SAC}\footnotemark[1]}
  \renewcommand{\thealgorithm}{} % suppress the algorithm number
  \caption{}
  \begin{algorithmic}[1]
    \STATE Initialize the network parameters $\theta_1,\theta_2$ and $\phi$ % initialization
    \STATE Copy the parameters to the target networks: $\bar{\theta}_1 \leftarrow \theta_1, \bar{\theta}_2 \leftarrow \theta_2$
    \STATE Initialize the replay buffer $D$
    \FOR {episode $= 1,\ldots,M$}
      \STATE Reset the environment and obtain the initial state $s_1$
      \FOR {timestep $t = 1,\ldots,T$}
        \STATE Sample action $a_t$ with $\boldsymbol{a}_{t} \sim \pi_{\phi}\left(\boldsymbol{a}_{t} \mid \mathbf{s}_{t}\right)$
        \STATE The environment returns the reward and next state, $\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
        \STATE Store the transition in the replay buffer: $\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
        \STATE Update the state: $s_t \leftarrow s_{t+1}$
        \STATE {\bfseries Update:}
        \STATE Update the $Q$-functions: $\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$\footnotemark[2]\footnotemark[3]
        \STATE Update the policy weights: $\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$\footnotemark[4]
        \STATE Adjust the temperature: $\alpha \leftarrow \alpha-\lambda \hat{\nabla}_{\alpha} J(\alpha)$\footnotemark[5]
        \STATE Update the target network weights: $\bar{\theta}_{i} \leftarrow \tau \theta_{i}+(1-\tau) \bar{\theta}_{i}$ for $i \in\{1,2\}$
      \ENDFOR
    \ENDFOR
  \end{algorithmic}
\end{algorithm}
\footnotetext[1]{Soft Actor-Critic Algorithms and Applications}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right) \sim \mathcal{D}}\left[\frac{1}{2}\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma \mathbb{E}_{\mathbf{s}_{t+1} \sim p}\left[V_{\bar{\theta}}\left(\mathbf{s}_{t+1}\right)\right]\right)\right)^{2}\right]$}
\footnotetext[3]{$\hat{\nabla}_{\theta} J_{Q}(\theta)=\nabla_{\theta} Q_{\theta}\left(\mathbf{a}_{t}, \mathbf{s}_{t}\right)\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma\left(Q_{\bar{\theta}}\left(\mathbf{s}_{t+1}, \mathbf{a}_{t+1}\right)-\alpha \log \pi_{\phi}\left(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1}\right)\right)\right)\right)$}
\footnotetext[4]{$\hat{\nabla}_{\phi} J_{\pi}(\phi)=\nabla_{\phi} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)+\left(\nabla_{\mathbf{a}_{t}} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)-\nabla_{\mathbf{a}_{t}} Q\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right) \nabla_{\phi} f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$, where $\mathbf{a}_{t}=f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$}
\footnotetext[5]{$J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$}
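To make three of the update rules in the footnotes concrete, here is a minimal PyTorch sketch of the entropy-regularized Q target (using the clipped minimum over the two target networks), the temperature loss $J(\alpha)$, and the soft target update. The network sizes, the random batch standing in for samples from $\mathcal{D}$, the stand-in policy outputs $\mathbf{a}_{t+1}$ and $\log \pi(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1})$, and the target entropy heuristic for $\overline{\mathcal{H}}$ are all assumptions for illustration; the policy update and the squashed Gaussian actor are omitted.

\begin{verbatim}
import torch
import torch.nn as nn

obs_dim, act_dim = 4, 2
gamma, tau, lr = 0.99, 0.005, 3e-4
target_entropy = -float(act_dim)              # common heuristic for H-bar

def q_net():
    return nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(),
                         nn.Linear(64, 1))

q1, q2 = q_net(), q_net()                     # theta_1, theta_2
q1_t, q2_t = q_net(), q_net()                 # theta_bar_1, theta_bar_2
q1_t.load_state_dict(q1.state_dict())
q2_t.load_state_dict(q2.state_dict())
opt_q = torch.optim.Adam(list(q1.parameters()) + list(q2.parameters()), lr=lr)
log_alpha = torch.zeros((), requires_grad=True)   # alpha = exp(log_alpha) > 0
opt_alpha = torch.optim.Adam([log_alpha], lr=lr)

# Stand-in batch: (s, a, r, s') from D plus a' ~ pi(.|s') and log pi(a'|s').
B = 32
s, a = torch.randn(B, obs_dim), torch.randn(B, act_dim)
r, s2 = torch.randn(B), torch.randn(B, obs_dim)
a2, logp_a2 = torch.randn(B, act_dim), -torch.rand(B)

alpha = log_alpha.exp().detach()
with torch.no_grad():
    # Soft target: r + gamma * (min_i Qbar_i(s', a') - alpha * log pi(a'|s'))
    q_min = torch.min(q1_t(torch.cat([s2, a2], 1)),
                      q2_t(torch.cat([s2, a2], 1))).squeeze(1)
    y = r + gamma * (q_min - alpha * logp_a2)

# J_Q(theta_i): squared error of each Q-function against the shared target
loss_q = sum(((q(torch.cat([s, a], 1)).squeeze(1) - y) ** 2).mean()
             for q in (q1, q2))
opt_q.zero_grad(); loss_q.backward(); opt_q.step()

# J(alpha) = E[-alpha * (log pi(a|s) + H-bar)]: tune the temperature
loss_alpha = (-log_alpha.exp() * (logp_a2 + target_entropy)).mean()
opt_alpha.zero_grad(); loss_alpha.backward(); opt_alpha.step()

# Soft target update: theta_bar_i <- tau * theta_i + (1 - tau) * theta_bar_i
with torch.no_grad():
    for net, net_t in ((q1, q1_t), (q2, q2_t)):
        for p, p_t in zip(net.parameters(), net_t.parameters()):
            p_t.mul_(1 - tau).add_(tau * p)
\end{verbatim}
\clearpage

\end{document}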