From d4183b75ce7e7f06ad41c6aefd71079459b7e251 Mon Sep 17 00:00:00 2001
From: Yiyuan Yang
Date: Tue, 25 May 2021 13:56:12 +0800
Subject: [PATCH] Update chapter5.md

---
 docs/chapter5/chapter5.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/chapter5/chapter5.md b/docs/chapter5/chapter5.md
index 4202e03..8a9bc91 100644
--- a/docs/chapter5/chapter5.md
+++ b/docs/chapter5/chapter5.md
@@ -157,7 +157,7 @@ PPO 有一个前身叫做`信任区域策略优化(Trust Region Policy Optimizat
 $$
 \begin{aligned}
 J_{T R P O}^{\theta^{\prime}}(\theta)=E_{\left(s_{t}, a_{t}\right) \sim \pi_{\theta^{\prime}}}\left[\frac{p_{\theta}\left(a_{t} | s_{t}\right)}{p_{\theta^{\prime}}\left(a_{t} | s_{t}\right)} A^{\theta^{\prime}}\left(s_{t}, a_{t}\right)\right] \\ \\
-

\mathrm{KL}\left(\theta, \theta^{\prime}\right)<\delta

+ \mathrm{KL}\left(\theta, \theta^{\prime}\right)<\delta
 \end{aligned}
 $$