hot update

This commit is contained in:
johnjim0816
2022-08-24 11:33:06 +08:00
parent ad65dd17cd
commit 62a7364c72
40 changed files with 2129 additions and 179 deletions

View File

@@ -28,6 +28,8 @@
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{6}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{7}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}SAC算法}{8}{section.7}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{8}{algorithm.}\protected@file@percent }
\gdef \@abspage@last{8}
\@writefile{toc}{\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{9}{algorithm.}\protected@file@percent }
\gdef \@abspage@last{9}

View File

@@ -1,4 +1,4 @@
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 22 AUG 2022 16:54
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 23 AUG 2022 19:26
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
@@ -415,85 +415,85 @@ Package: titlesec 2019/10/16 v2.13 Sectioning titles
) (./pseudocodes.aux)
\openout1 = `pseudocodes.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 13.
LaTeX Font Info: ... okay on input line 13.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
ABD: EverySelectfont initializing macros
LaTeX Info: Redefining \selectfont on input line 13.
LaTeX Info: Redefining \selectfont on input line 14.
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
(fontspec) this).
\symlegacymaths=\mathgroup6
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 13.
LaTeX Font Info: Redeclaring math accent \acute on input line 13.
LaTeX Font Info: Redeclaring math accent \grave on input line 13.
LaTeX Font Info: Redeclaring math accent \ddot on input line 13.
LaTeX Font Info: Redeclaring math accent \tilde on input line 13.
LaTeX Font Info: Redeclaring math accent \bar on input line 13.
LaTeX Font Info: Redeclaring math accent \breve on input line 13.
LaTeX Font Info: Redeclaring math accent \check on input line 13.
LaTeX Font Info: Redeclaring math accent \hat on input line 13.
LaTeX Font Info: Redeclaring math accent \dot on input line 13.
LaTeX Font Info: Redeclaring math accent \mathring on input line 13.
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 13.
LaTeX Font Info: Redeclaring math symbol \Delta on input line 13.
LaTeX Font Info: Redeclaring math symbol \Theta on input line 13.
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 13.
LaTeX Font Info: Redeclaring math symbol \Xi on input line 13.
LaTeX Font Info: Redeclaring math symbol \Pi on input line 13.
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 13.
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 13.
LaTeX Font Info: Redeclaring math symbol \Phi on input line 13.
LaTeX Font Info: Redeclaring math symbol \Psi on input line 13.
LaTeX Font Info: Redeclaring math symbol \Omega on input line 13.
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 13.
LaTeX Font Info: Redeclaring symbol font `operators' on input line 13.
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 14.
LaTeX Font Info: Redeclaring math accent \acute on input line 14.
LaTeX Font Info: Redeclaring math accent \grave on input line 14.
LaTeX Font Info: Redeclaring math accent \ddot on input line 14.
LaTeX Font Info: Redeclaring math accent \tilde on input line 14.
LaTeX Font Info: Redeclaring math accent \bar on input line 14.
LaTeX Font Info: Redeclaring math accent \breve on input line 14.
LaTeX Font Info: Redeclaring math accent \check on input line 14.
LaTeX Font Info: Redeclaring math accent \hat on input line 14.
LaTeX Font Info: Redeclaring math accent \dot on input line 14.
LaTeX Font Info: Redeclaring math accent \mathring on input line 14.
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Delta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Theta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 14.
LaTeX Font Info: Redeclaring math symbol \Xi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Pi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 14.
LaTeX Font Info: Redeclaring math symbol \Phi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Psi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Omega on input line 14.
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 14.
LaTeX Font Info: Redeclaring symbol font `operators' on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `normal' on input line 13.
(Font) `operators' in the math version `normal' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 13.
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `bold' on input line 13.
(Font) `operators' in the math version `bold' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 13.
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 13.
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 13.
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 13.
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 13.
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 13.
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 13.
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 13.
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 13.
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 13.
Package hyperref Info: Link coloring OFF on input line 13.
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 14.
Package hyperref Info: Link coloring OFF on input line 14.
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2021-04-02 v2.47 Cross-referencing by name of section
(/usr/local/texlive/2021/texmf-dist/tex/latex/refcount/refcount.sty
@@ -503,9 +503,9 @@ Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
)
\c@section@level=\count313
)
LaTeX Info: Redefining \ref on input line 13.
LaTeX Info: Redefining \pageref on input line 13.
LaTeX Info: Redefining \nameref on input line 13.
LaTeX Info: Redefining \ref on input line 14.
LaTeX Info: Redefining \pageref on input line 14.
LaTeX Info: Redefining \nameref on input line 14.
(./pseudocodes.out) (./pseudocodes.out)
\@outlinefile=\write3
\openout3 = `pseudocodes.out'.
@@ -515,19 +515,19 @@ LaTeX Info: Redefining \nameref on input line 13.
\openout4 = `pseudocodes.toc'.
LaTeX Font Info: Font shape `TU/SongtiSCLight(0)/m/sl' in size <10.95> not available
(Font) Font shape `TU/SongtiSCLight(0)/m/it' tried instead on input line 16.
(Font) Font shape `TU/SongtiSCLight(0)/m/it' tried instead on input line 17.
[1
]
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 21.
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 22.
[2
]
LaTeX Font Info: Trying to load font information for U+msa on input line 31.
LaTeX Font Info: Trying to load font information for U+msa on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info: Trying to load font information for U+msb on input line 31.
LaTeX Font Info: Trying to load font information for U+msb on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
) [3
@@ -536,38 +536,35 @@ File: umsb.fd 2013/01/14 v3.01 AMS symbols B
] [5
]
Underfull \hbox (badness 10000) in paragraph at lines 111--112
[] []\TU/SongtiSCLight(0)/m/n/10.95 计 算 实 际 的 $\OML/cmm/m/it/10.95 Q$ \TU/SongtiSCLight(0)/m/n/10.95 值,| 即 $\OML/cmm/m/it/10.95 y[] \OT1/cmr/m/n/10.95 =
[]
[6
] [6
] [7
] [8
]
Overfull \hbox (32.54117pt too wide) in paragraph at lines 183--183
Overfull \hbox (32.54117pt too wide) in paragraph at lines 212--212
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^R\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 Q[] [] []$|
[]
Overfull \hbox (15.41673pt too wide) in paragraph at lines 184--184
Overfull \hbox (15.41673pt too wide) in paragraph at lines 213--213
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^^\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 ^^K [] [] \OT1/cmr/m/n/9 + [] \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 f[] []$\TU/lmr/m/n/9 ,$[][] \OT1/cmr/m/n/9 =
[]
[8
[9
] (./pseudocodes.aux)
Package rerunfilecheck Info: File `pseudocodes.out' has not changed.
(rerunfilecheck) Checksum: 4575BA7458AA23D6E696EFFE39D05727;640.
(rerunfilecheck) Checksum: 35B5A79A86EF3BC70F1A0B3BCBEBAA13;724.
)
Here is how much of TeX's memory you used:
14813 strings out of 476919
312635 string characters out of 5821840
653471 words of memory out of 5000000
34563 multiletter control sequences out of 15000+600000
413601 words of font info for 90 fonts, out of 8000000 for 9000
14827 strings out of 476919
313456 string characters out of 5821840
653576 words of memory out of 5000000
34576 multiletter control sequences out of 15000+600000
413609 words of font info for 91 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
101i,13n,104p,676b,736s stack positions out of 5000i,500n,10000p,200000b,80000s
101i,13n,104p,676b,697s stack positions out of 5000i,500n,10000p,200000b,80000s
Output written on pseudocodes.pdf (8 pages).
Output written on pseudocodes.pdf (9 pages).

View File

@@ -4,4 +4,5 @@
\BOOKMARK [1][-]{section.4}{\376\377\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\173\227\154\325}{}% 4
\BOOKMARK [1][-]{section.5}{\376\377\000D\000Q\000N\173\227\154\325}{}% 5
\BOOKMARK [1][-]{section.6}{\376\377\000S\000o\000f\000t\000Q\173\227\154\325}{}% 6
\BOOKMARK [1][-]{section.7}{\376\377\000S\000A\000C\173\227\154\325}{}% 7
\BOOKMARK [1][-]{section.7}{\376\377\000S\000A\000C\000-\000S\173\227\154\325}{}% 7
\BOOKMARK [1][-]{section.8}{\376\377\000S\000A\000C\173\227\154\325}{}% 8

View File

@@ -10,6 +10,7 @@
\usepackage{titlesec}
\usepackage{float} % 调用该包能够使用[H]
% \pagestyle{plain} % 去除页眉但是保留页脚编号都去掉plain换empty
\begin{document}
\tableofcontents % 目录注意要运行两下或者vscode保存两下才能显示
% \singlespacing
@@ -88,7 +89,7 @@
\clearpage
\section{DQN算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{DQN算法}}
\floatname{algorithm}{{DQN算法}{\hypersetup{linkcolor=white}\footnotemark}}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{输入:}}
@@ -108,13 +109,17 @@
\STATE 更新环境状态$s_{t+1} \leftarrow s_t$
\STATE {\bfseries 更新策略:}
\STATE$D$中采样一个batch的transition
\STATE 计算实际的$Q$值,即$y_{j}= \begin{cases}r_{j} & \text {对于终止状态} s_{j+1} \\ r_{j}+\gamma \max _{a^{\prime}} Q\left(s_{j+1}, a^{\prime} ; \theta\right) & \text {对于非终止状态} s_{j+1}\end{cases}$
\STATE 对损失 $\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降
\STATE 计算实际的$Q$值,即$y_{j}${\hypersetup{linkcolor=white}\footnotemark}
\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降{\hypersetup{linkcolor=white}\footnotemark}
\ENDFOR
\STATE$C$个回合复制参数$\hat{Q}\leftarrow Q$(此处也可像原论文中放到小循环中改成每$C$步,但没有每$C$个回合稳定)
\STATE$C$个回合复制参数$\hat{Q}\leftarrow Q${\hypersetup{linkcolor=white}\footnotemark}
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Playing Atari with Deep Reinforcement Learning}
\footnotetext[2]{$y_{i}= \begin{cases}r_{i} & \text {对于终止状态} s_{i+1} \\ r_{i}+\gamma \max _{a^{\prime}} Q\left(s_{i+1}, a^{\prime} ; \theta\right) & \text {对于非终止状态} s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步,但没有每$C$个回合稳定}
\clearpage
\section{SoftQ算法}
@@ -153,13 +158,37 @@
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\mathbf{s}_{t} \sim q_{\mathbf{s}_{t}}, \mathbf{a}_{t} \sim q_{\mathbf{a}_{t}}}\left[\frac{1}{2}\left(\hat{Q}_{\mathrm{soft}}^{\bar{\theta}}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right)^{2}\right]$}
\footnotetext[3]{$\begin{aligned} \Delta f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)=& \mathbb{E}_{\mathbf{a}_{t} \sim \pi^{\phi}}\left[\left.\kappa\left(\mathbf{a}_{t}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right) \nabla_{\mathbf{a}^{\prime}} Q_{\mathrm{soft}}^{\theta}\left(\mathbf{s}_{t}, \mathbf{a}^{\prime}\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right.\\ &\left.+\left.\alpha \nabla_{\mathbf{a}^{\prime}} \kappa\left(\mathbf{a}^{\prime}, f^{\phi}\left(\cdot ; \mathbf{s}_{t}\right)\right)\right|_{\mathbf{a}^{\prime}=\mathbf{a}_{t}}\right] \end{aligned}$}
\clearpage
\section{SAC-S算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{SAC-S算法}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\begin{algorithmic}[1] % [1]显示步数
\STATE 初始化参数$\psi, \bar{\psi}, \theta, \phi$
\FOR {回合数 = $1,M$}
\FOR {时步 = $1,t$}
\STATE 根据$\boldsymbol{a}_{t} \sim \pi_{\phi}\left(\boldsymbol{a}_{t} \mid \mathbf{s}_{t}\right)$采样动作$a_t$
\STATE 环境反馈奖励和下一个状态,$\mathbf{s}_{t+1} \sim p\left(\mathbf{s}_{t+1} \mid \mathbf{s}_{t}, \mathbf{a}_{t}\right)$
\STATE 存储transition到经验回放中$\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
\STATE 更新环境状态$s_{t+1} \leftarrow s_t$
\STATE {\bfseries 更新策略:}
\STATE $\psi \leftarrow \psi-\lambda_{V} \hat{\nabla}_{\psi} J_{V}(\psi)$
\STATE $\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$
\STATE $\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$
\STATE $\bar{\psi} \leftarrow \tau \psi+(1-\tau) \bar{\psi}$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor}
\clearpage
\section{SAC算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{Soft Actor Critic算法}}
\floatname{algorithm}{{SAC算法}\footnotemark[1]}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\begin{algorithmic}[1]
\STATE 初始化两个Actor的网络参数$\theta_1,\theta_2$以及一个Critic网络参数$\phi$ % 初始化
\STATE 初始化网络参数$\theta_1,\theta_2$以及$\phi$ % 初始化
\STATE 复制参数到目标网络$\bar{\theta_1} \leftarrow \theta_1,\bar{\theta_2} \leftarrow \theta_2,$
\STATE 初始化经验回放$D$
\FOR {回合数 = $1,M$}
@@ -170,18 +199,18 @@
\STATE 存储transition到经验回放中$\mathcal{D} \leftarrow \mathcal{D} \cup\left\{\left(\mathbf{s}_{t}, \mathbf{a}_{t}, r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right), \mathbf{s}_{t+1}\right)\right\}$
\STATE 更新环境状态$s_{t+1} \leftarrow s_t$
\STATE {\bfseries 更新策略:}
\STATE 更新$Q$函数,$\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$\footnotemark[1]\footnotemark[2]
\STATE 更新策略权重,$\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$ \footnotemark[3]
\STATE 调整temperature$\alpha \leftarrow \alpha-\lambda \hat{\nabla}_{\alpha} J(\alpha)$ \footnotemark[4]
\STATE 更新$Q$函数,$\theta_{i} \leftarrow \theta_{i}-\lambda_{Q} \hat{\nabla}_{\theta_{i}} J_{Q}\left(\theta_{i}\right)$ for $i \in\{1,2\}$\footnotemark[2]\footnotemark[3]
\STATE 更新策略权重,$\phi \leftarrow \phi-\lambda_{\pi} \hat{\nabla}_{\phi} J_{\pi}(\phi)$ \footnotemark[4]
\STATE 调整temperature$\alpha \leftarrow \alpha-\lambda \hat{\nabla}_{\alpha} J(\alpha)$ \footnotemark[5]
\STATE 更新目标网络权重,$\bar{\theta}_{i} \leftarrow \tau \theta_{i}+(1-\tau) \bar{\theta}_{i}$ for $i \in\{1,2\}$
\ENDFOR
\ENDFOR
\end{algorithmic}
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{$J_{Q}(\theta)=\mathbb{E}_{\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right) \sim \mathcal{D}}\left[\frac{1}{2}\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma \mathbb{E}_{\mathbf{s}_{t+1} \sim p}\left[V_{\bar{\theta}}\left(\mathbf{s}_{t+1}\right)\right]\right)\right)^{2}\right]$}
\footnotetext[2]{$\hat{\nabla}_{\theta} J_{Q}(\theta)=\nabla_{\theta} Q_{\theta}\left(\mathbf{a}_{t}, \mathbf{s}_{t}\right)\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma\left(Q_{\bar{\theta}}\left(\mathbf{s}_{t+1}, \mathbf{a}_{t+1}\right)-\alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1}\right)\right)\right)\right)\right.$}
\footnotetext[3]{$\hat{\nabla}_{\phi} J_{\pi}(\phi)=\nabla_{\phi} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)+\left(\nabla_{\mathbf{a}_{t}} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)-\nabla_{\mathbf{a}_{t}} Q\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right) \nabla_{\phi} f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$,$\mathbf{a}_{t}=f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$}
\footnotetext[4]{$J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$}
\footnotetext[2]{Soft Actor-Critic Algorithms and Applications}
\footnotetext[2]{$J_{Q}(\theta)=\mathbb{E}_{\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right) \sim \mathcal{D}}\left[\frac{1}{2}\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma \mathbb{E}_{\mathbf{s}_{t+1} \sim p}\left[V_{\bar{\theta}}\left(\mathbf{s}_{t+1}\right)\right]\right)\right)^{2}\right]$}
\footnotetext[3]{$\hat{\nabla}_{\theta} J_{Q}(\theta)=\nabla_{\theta} Q_{\theta}\left(\mathbf{a}_{t}, \mathbf{s}_{t}\right)\left(Q_{\theta}\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)-\left(r\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)+\gamma\left(Q_{\bar{\theta}}\left(\mathbf{s}_{t+1}, \mathbf{a}_{t+1}\right)-\alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t+1} \mid \mathbf{s}_{t+1}\right)\right)\right)\right)\right.$}
\footnotetext[4]{$\hat{\nabla}_{\phi} J_{\pi}(\phi)=\nabla_{\phi} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)+\left(\nabla_{\mathbf{a}_{t}} \alpha \log \left(\pi_{\phi}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)\right)-\nabla_{\mathbf{a}_{t}} Q\left(\mathbf{s}_{t}, \mathbf{a}_{t}\right)\right) \nabla_{\phi} f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$,$\mathbf{a}_{t}=f_{\phi}\left(\epsilon_{t} ; \mathbf{s}_{t}\right)$}
\footnotetext[5]{$J(\alpha)=\mathbb{E}_{\mathbf{a}_{t} \sim \pi_{t}}\left[-\alpha \log \pi_{t}\left(\mathbf{a}_{t} \mid \mathbf{s}_{t}\right)-\alpha \overline{\mathcal{H}}\right]$}
\clearpage
\end{document}

View File

@@ -4,4 +4,5 @@
\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}%
\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}%
\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}%
\contentsline {section}{\numberline {7}SAC算法}{8}{section.7}%
\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}%
\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}%