hot update A2C

This commit is contained in:
johnjim0816
2022-08-29 15:12:33 +08:00
parent 99a3c1afec
commit 0b0f7e857d
109 changed files with 8213 additions and 1658 deletions

projects/.gitignore

@@ -2,4 +2,9 @@
 .ipynb_checkpoints
 __pycache__
 .vscode
 test.py
+pseudocodes.aux
+pseudocodes.log
+pseudocodes.synctex.gz
+pseudocodes.out
+pseudocodes.toc


@@ -22,15 +22,15 @@
 注:点击对应的名称会跳到[codes](./codes/)下对应的算法中,其他版本还请读者自行翻阅
 | 算法名称 | 参考文献 | 备注 |
-| :-----------------------: | :----------------------------------------------------------: | :--: |
-| | | |
+| :-------------------------------------: | :----------------------------------------------------------: | :--: |
+| [Policy Gradient](codes/PolicyGradient) | [Policy Gradient paper](https://proceedings.neurips.cc/paper/1999/file/464d828b85b0bed98e80ade0a5c43b0f-Paper.pdf) | |
 | DQN-CNN | | 待更 |
 | [SoftQ](codes/SoftQ) | [Soft Q-learning paper](https://arxiv.org/abs/1702.08165) | |
 | [SAC](codes/SAC) | [SAC paper](https://arxiv.org/pdf/1812.05905.pdf) | |
 | [SAC-Discrete](codes/SAC) | [SAC-Discrete paper](https://arxiv.org/pdf/1910.07207.pdf) | |
 | SAC-S | [SAC-S paper](https://arxiv.org/abs/1801.01290) | |
 | DSAC | [DSAC paper](https://paperswithcode.com/paper/addressing-value-estimation-errors-in) | 待更 |
 ## 3、算法环境


@@ -1,35 +0,0 @@
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand*\HyPL@Entry[1]{}
\HyPL@Entry{0<</S/D>>}
\@writefile{toc}{\contentsline {section}{\numberline {1}模版备用}{2}{section.1}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{2}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{3}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{4}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{5}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{6}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{7}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{8}{algorithm.}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}\protected@file@percent }
\@writefile{loa}{\contentsline {algorithm}{\numberline {}{\ignorespaces }}{9}{algorithm.}\protected@file@percent }
\gdef \@abspage@last{9}


@@ -1,570 +0,0 @@
This is XeTeX, Version 3.141592653-2.6-0.999993 (TeX Live 2021) (preloaded format=xelatex 2021.8.22) 23 AUG 2022 19:26
entering extended mode
restricted \write18 enabled.
file:line:error style messages enabled.
%&-line parsing enabled.
**/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes
(/Users/jj/Desktop/rl-tutorials/assets/pseudocodes/pseudocodes.tex
LaTeX2e <2020-10-01> patch level 4
L3 programming layer <2021-02-18> (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexart.cls (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexbackend.cfg
File: ctexbackend.cfg 2021/03/14 v2.5.6 Backend configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3kernel/expl3.sty
Package: expl3 2021-02-18 L3 programming layer (loader)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3backend/l3backend-xetex.def
File: l3backend-xetex.def 2021-03-18 L3 backend support: XeTeX
(|extractbb --version)
\c__kernel_sys_dvipdfmx_version_int=\count175
\l__color_backend_stack_int=\count176
\g__color_backend_stack_int=\count177
\g__graphics_track_int=\count178
\l__pdf_internal_box=\box47
\g__pdf_backend_object_int=\count179
\g__pdf_backend_annotation_int=\count180
\g__pdf_backend_link_int=\count181
))
Document Class: ctexart 2021/03/14 v2.5.6 Chinese adapter for class article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-2020-10-01.sty (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xparse/xparse-generic.tex))) (/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/l3keys2e/l3keys2e.sty
Package: l3keys2e 2021-03-12 LaTeX2e option processing using LaTeX3 keys
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexhook.sty
Package: ctexhook 2021/03/14 v2.5.6 Document and package hooks (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/ctexpatch.sty
Package: ctexpatch 2021/03/14 v2.5.6 Patching commands (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/fix-cm.sty
Package: fix-cm 2015/01/14 v1.1t fixes to LaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/ts1enc.def
File: ts1enc.def 2001/06/05 v3.0e (jk/car/fm) Standard LaTeX file
LaTeX Font Info: Redeclaring font encoding TS1 on input line 47.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel.sty
Package: everysel 2021/01/20 v2.1 EverySelectfont Package (MS)
(/usr/local/texlive/2021/texmf-dist/tex/latex/everysel/everysel-2011-10-28.sty))
\l__ctex_tmp_int=\count182
\l__ctex_tmp_box=\box48
\l__ctex_tmp_dim=\dimen138
\g__ctex_section_depth_int=\count183
\g__ctex_font_size_int=\count184
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctexopts.cfg
File: ctexopts.cfg 2021/03/14 v2.5.6 Option configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/article.cls
Document Class: article 2020/04/10 v1.4m Standard LaTeX document class
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/size11.clo
File: size11.clo 2020/04/10 v1.4m Standard LaTeX file (size option)
)
\c@part=\count185
\c@section=\count186
\c@subsection=\count187
\c@subsubsection=\count188
\c@paragraph=\count189
\c@subparagraph=\count190
\c@figure=\count191
\c@table=\count192
\abovecaptionskip=\skip47
\belowcaptionskip=\skip48
\bibindent=\dimen139
) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/engine/ctex-engine-xetex.def
File: ctex-engine-xetex.def 2021/03/14 v2.5.6 XeLaTeX adapter (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.sty
Package: xeCJK 2020/10/19 v3.8.6 Typesetting CJK scripts with XeLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/l3packages/xtemplate/xtemplate.sty
Package: xtemplate 2021-03-12 L3 Experimental prototype document functions
\l__xtemplate_tmp_dim=\dimen140
\l__xtemplate_tmp_int=\count193
\l__xtemplate_tmp_muskip=\muskip16
\l__xtemplate_tmp_skip=\skip49
)
\l__xeCJK_tmp_int=\count194
\l__xeCJK_tmp_box=\box49
\l__xeCJK_tmp_dim=\dimen141
\l__xeCJK_tmp_skip=\skip50
\g__xeCJK_space_factor_int=\count195
\l__xeCJK_begin_int=\count196
\l__xeCJK_end_int=\count197
\c__xeCJK_CJK_class_int=\XeTeXcharclass1
\c__xeCJK_FullLeft_class_int=\XeTeXcharclass2
\c__xeCJK_FullRight_class_int=\XeTeXcharclass3
\c__xeCJK_HalfLeft_class_int=\XeTeXcharclass4
\c__xeCJK_HalfRight_class_int=\XeTeXcharclass5
\c__xeCJK_NormalSpace_class_int=\XeTeXcharclass6
\c__xeCJK_CM_class_int=\XeTeXcharclass7
\c__xeCJK_HangulJamo_class_int=\XeTeXcharclass8
\l__xeCJK_last_skip=\skip51
\g__xeCJK_node_int=\count198
\c__xeCJK_CJK_node_dim=\dimen142
\c__xeCJK_CJK-space_node_dim=\dimen143
\c__xeCJK_default_node_dim=\dimen144
\c__xeCJK_default-space_node_dim=\dimen145
\c__xeCJK_CJK-widow_node_dim=\dimen146
\c__xeCJK_normalspace_node_dim=\dimen147
\l__xeCJK_ccglue_skip=\skip52
\l__xeCJK_ecglue_skip=\skip53
\l__xeCJK_punct_kern_skip=\skip54
\l__xeCJK_last_penalty_int=\count199
\l__xeCJK_last_bound_dim=\dimen148
\l__xeCJK_last_kern_dim=\dimen149
\l__xeCJK_widow_penalty_int=\count266
Package xtemplate Info: Declaring object type 'xeCJK/punctuation' taking 0
(xtemplate) argument(s) on line 2341.
\l__xeCJK_fixed_punct_width_dim=\dimen150
\l__xeCJK_mixed_punct_width_dim=\dimen151
\l__xeCJK_middle_punct_width_dim=\dimen152
\l__xeCJK_fixed_margin_width_dim=\dimen153
\l__xeCJK_mixed_margin_width_dim=\dimen154
\l__xeCJK_middle_margin_width_dim=\dimen155
\l__xeCJK_bound_punct_width_dim=\dimen156
\l__xeCJK_bound_margin_width_dim=\dimen157
\l__xeCJK_margin_minimum_dim=\dimen158
\l__xeCJK_kerning_total_width_dim=\dimen159
\l__xeCJK_same_align_margin_dim=\dimen160
\l__xeCJK_different_align_margin_dim=\dimen161
\l__xeCJK_kerning_margin_width_dim=\dimen162
\l__xeCJK_kerning_margin_minimum_dim=\dimen163
\l__xeCJK_bound_dim=\dimen164
\l__xeCJK_reverse_bound_dim=\dimen165
\l__xeCJK_margin_dim=\dimen166
\l__xeCJK_minimum_bound_dim=\dimen167
\l__xeCJK_kerning_margin_dim=\dimen168
\g__xeCJK_family_int=\count267
\l__xeCJK_fam_int=\count268
\g__xeCJK_fam_allocation_int=\count269
\l__xeCJK_verb_case_int=\count270
\l__xeCJK_verb_exspace_skip=\skip55
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.sty
Package: fontspec 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
(/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec-xetex.sty
Package: fontspec-xetex 2020/02/21 v2.7i Font selection for XeLaTeX and LuaLaTeX
\l__fontspec_script_int=\count271
\l__fontspec_language_int=\count272
\l__fontspec_strnum_int=\count273
\l__fontspec_tmp_int=\count274
\l__fontspec_tmpa_int=\count275
\l__fontspec_tmpb_int=\count276
\l__fontspec_tmpc_int=\count277
\l__fontspec_em_int=\count278
\l__fontspec_emdef_int=\count279
\l__fontspec_strong_int=\count280
\l__fontspec_strongdef_int=\count281
\l__fontspec_tmpa_dim=\dimen169
\l__fontspec_tmpb_dim=\dimen170
\l__fontspec_tmpc_dim=\dimen171
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/fontenc.sty
Package: fontenc 2020/08/10 v2.0s Standard LaTeX package
) (/usr/local/texlive/2021/texmf-dist/tex/latex/fontspec/fontspec.cfg))) (/usr/local/texlive/2021/texmf-dist/tex/xelatex/xecjk/xeCJK.cfg
File: xeCJK.cfg 2020/10/19 v3.8.6 Configuration file for xeCJK package
))
\ccwd=\dimen172
\l__ctex_ccglue_skip=\skip56
)
\l__ctex_ziju_dim=\dimen173
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber.sty
Package: zhnumber 2020/05/01 v2.8 Typesetting numbers with Chinese glyphs
\l__zhnum_scale_int=\count282
(/usr/local/texlive/2021/texmf-dist/tex/latex/zhnumber/zhnumber-utf8.cfg
File: zhnumber-utf8.cfg 2020/05/01 v2.8 Chinese numerals with UTF8 encoding
))
\l__ctex_heading_skip=\skip57
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/scheme/ctex-scheme-chinese-article.def
File: ctex-scheme-chinese-article.def 2021/03/14 v2.5.6 Chinese scheme for article (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex-name-utf8.cfg
File: ctex-name-utf8.cfg 2021/03/14 v2.5.6 Caption with encoding UTF-8 (CTEX)
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-mac.def
File: ctex-fontset-mac.def 2021/03/14 v2.5.6 macOS fonts definition (CTEX)
(/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/fontset/ctex-fontset-macnew.def
File: ctex-fontset-macnew.def 2021/03/14 v2.5.6 macOS fonts definition for El Capitan or later version (CTEX)
Package fontspec Warning: Font "Songti SC Light" does not contain requested
(fontspec) Script "CJK".
Package fontspec Info: Font family 'SongtiSCLight(0)' created for font 'Songti
(fontspec) SC Light' with options
(fontspec) [Script={CJK},BoldItalicFont={Kaiti SC
(fontspec) Bold},BoldFont={Songti SC Bold},ItalicFont={Kaiti SC}].
(fontspec)
(fontspec) This font family consists of the following NFSS
(fontspec) series/shapes:
(fontspec)
(fontspec) - 'normal' (m/n) with NFSS spec.: <->"Songti SC
(fontspec) Light/OT:language=dflt;"
(fontspec) - 'small caps' (m/sc) with NFSS spec.:
(fontspec) - 'bold' (b/n) with NFSS spec.: <->"Songti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold small caps' (b/sc) with NFSS spec.:
(fontspec) - 'italic' (m/it) with NFSS spec.: <->"Kaiti
(fontspec) SC/OT:language=dflt;"
(fontspec) - 'italic small caps' (m/scit) with NFSS spec.:
(fontspec) - 'bold italic' (b/it) with NFSS spec.: <->"Kaiti SC
(fontspec) Bold/OT:language=dflt;"
(fontspec) - 'bold italic small caps' (b/scit) with NFSS spec.:
))) (/usr/local/texlive/2021/texmf-dist/tex/latex/ctex/config/ctex.cfg
File: ctex.cfg 2021/03/14 v2.5.6 Configuration file (CTEX)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithm.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithm 2009/08/24 v0.1 Document Style `algorithm' - floating environment
(/usr/local/texlive/2021/texmf-dist/tex/latex/float/float.sty
Package: float 2001/11/08 v1.3d Float enhancements (AL)
\c@float@type=\count283
\float@exts=\toks15
\float@box=\box50
\@float@everytoks=\toks16
\@floatcapt=\box51
) (/usr/local/texlive/2021/texmf-dist/tex/latex/base/ifthen.sty
Package: ifthen 2014/09/29 v1.1c Standard LaTeX ifthen package (DPC)
)
\@float@every@algorithm=\toks17
\c@algorithm=\count284
) (/usr/local/texlive/2021/texmf-dist/tex/latex/algorithms/algorithmic.sty
Invalid UTF-8 byte or sequence at line 11 replaced by U+FFFD.
Package: algorithmic 2009/08/24 v0.1 Document Style `algorithmic'
(/usr/local/texlive/2021/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2014/10/28 v1.15 key=value parser (DPC)
\KV@toks@=\toks18
)
\c@ALC@unique=\count285
\c@ALC@line=\count286
\c@ALC@rem=\count287
\c@ALC@depth=\count288
\ALC@tlm=\skip58
\algorithmicindent=\skip59
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amssymb.sty
Package: amssymb 2013/01/14 v3.01 AMS font symbols
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/amsfonts.sty
Package: amsfonts 2013/01/14 v3.01 Basic AMSFonts support
\@emptytoks=\toks19
\symAMSa=\mathgroup4
\symAMSb=\mathgroup5
LaTeX Font Info: Redeclaring math symbol \hbar on input line 98.
LaTeX Font Info: Overwriting math alphabet `\mathfrak' in version `bold'
(Font) U/euf/m/n --> U/euf/b/n on input line 106.
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2020/09/23 v2.17i AMS math features
\@mathmargin=\skip60
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2000/06/29 v2.01 AMS text
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks20
\ex@=\dimen174
)) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen175
) (/usr/local/texlive/2021/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2016/03/08 v2.02 operator names
)
\inf@bad=\count289
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count290
\leftroot@=\count291
LaTeX Info: Redefining \overline on input line 399.
\classnum@=\count292
\DOTSCASE@=\count293
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box52
\strutbox@=\box53
\big@size=\dimen176
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
\macc@depth=\count294
\c@MaxMatrixCols=\count295
\dotsspace@=\muskip17
\c@parentequation=\count296
\dspbrk@lvl=\count297
\tag@help=\toks21
\row@=\count298
\column@=\count299
\maxfields@=\count300
\andhelp@=\toks22
\eqnshift@=\dimen177
\alignsep@=\dimen178
\tagshift@=\dimen179
\tagwidth@=\dimen180
\totwidth@=\dimen181
\lineht@=\dimen182
\@envbody=\toks23
\multlinegap=\skip61
\multlinetaggap=\skip62
\mathdisplay@stack=\toks24
LaTeX Info: Redefining \[ on input line 2923.
LaTeX Info: Redefining \] on input line 2924.
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref.sty
Package: hyperref 2021-02-27 v7.00k Hypertext links for LaTeX
(/usr/local/texlive/2021/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/iftex/iftex.sty
Package: iftex 2020/03/06 v1.0d TeX engine tests
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO)
(/usr/local/texlive/2021/texmf-dist/tex/generic/infwarerr/infwarerr.sty
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
)
Package pdftexcmds Info: \pdf@primitive is available.
Package pdftexcmds Info: \pdf@ifprimitive is available.
Package pdftexcmds Info: \pdfdraftmode not found.
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvsetkeys/kvsetkeys.sty
Package: kvsetkeys 2019/12/15 v1.18 Key value parser (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/pdfescape/pdfescape.sty
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hycolor/hycolor.sty
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/auxhook/auxhook.sty
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/kvoptions/kvoptions.sty
Package: kvoptions 2020-10-07 v3.14 Key value format for package options (HO)
)
\@linkdim=\dimen183
\Hy@linkcounter=\count301
\Hy@pagecounter=\count302
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/pd1enc.def
File: pd1enc.def 2021-02-27 v7.00k Hyperref: PDFDocEncoding definition (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hyperref-langpatches.def
File: hyperref-langpatches.def 2021-02-27 v7.00k Hyperref: patches for babel languages
) (/usr/local/texlive/2021/texmf-dist/tex/generic/intcalc/intcalc.sty
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/etexcmds/etexcmds.sty
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
)
\Hy@SavedSpaceFactor=\count303
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/puenc.def
File: puenc.def 2021-02-27 v7.00k Hyperref: PDF Unicode definition (HO)
)
Package hyperref Info: Option `unicode' set `true' on input line 4073.
Package hyperref Info: Hyper figures OFF on input line 4192.
Package hyperref Info: Link nesting OFF on input line 4197.
Package hyperref Info: Hyper index ON on input line 4200.
Package hyperref Info: Plain pages OFF on input line 4207.
Package hyperref Info: Backreferencing OFF on input line 4212.
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
Package hyperref Info: Bookmarks ON on input line 4445.
\c@Hy@tempcnt=\count304
(/usr/local/texlive/2021/texmf-dist/tex/latex/url/url.sty
\Urlmuskip=\muskip18
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
)
LaTeX Info: Redefining \url on input line 4804.
\XeTeXLinkMargin=\dimen184
(/usr/local/texlive/2021/texmf-dist/tex/generic/bitset/bitset.sty
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
(/usr/local/texlive/2021/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO)
))
\Fld@menulength=\count305
\Field@Width=\dimen185
\Fld@charsize=\dimen186
Package hyperref Info: Hyper figures OFF on input line 6075.
Package hyperref Info: Link nesting OFF on input line 6080.
Package hyperref Info: Hyper index ON on input line 6083.
Package hyperref Info: backreferencing OFF on input line 6090.
Package hyperref Info: Link coloring OFF on input line 6095.
Package hyperref Info: Link coloring with OCG OFF on input line 6100.
Package hyperref Info: PDF/A mode OFF on input line 6105.
LaTeX Info: Redefining \ref on input line 6145.
LaTeX Info: Redefining \pageref on input line 6149.
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atbegshi-ltx.sty
Package: atbegshi-ltx 2020/08/17 v1.0a Emulation of the original atbegshi package
with kernel methods
)
\Hy@abspage=\count306
\c@Item=\count307
\c@Hfootnote=\count308
)
Package hyperref Info: Driver (autodetected): hxetex.
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/hxetex.def
File: hxetex.def 2021-02-27 v7.00k Hyperref driver for XeTeX
(/usr/local/texlive/2021/texmf-dist/tex/generic/stringenc/stringenc.sty
Package: stringenc 2019/11/29 v1.12 Convert strings between diff. encodings (HO)
)
\pdfm@box=\box54
\c@Hy@AnnotLevel=\count309
\HyField@AnnotCount=\count310
\Fld@listcount=\count311
\c@bookmark@seq@number=\count312
(/usr/local/texlive/2021/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
Package: rerunfilecheck 2019/12/05 v1.9 Rerun checks for auxiliary files (HO)
(/usr/local/texlive/2021/texmf-dist/tex/latex/base/atveryend-ltx.sty
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atvery package
with kernel methods
) (/usr/local/texlive/2021/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
)
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 286.
)
\Hy@SectionHShift=\skip63
) (/usr/local/texlive/2021/texmf-dist/tex/latex/setspace/setspace.sty
Package: setspace 2011/12/19 v6.7a set line spacing
) (/usr/local/texlive/2021/texmf-dist/tex/latex/titlesec/titlesec.sty
Package: titlesec 2019/10/16 v2.13 Sectioning titles
\ttl@box=\box55
\beforetitleunit=\skip64
\aftertitleunit=\skip65
\ttl@plus=\dimen187
\ttl@minus=\dimen188
\ttl@toksa=\toks25
\titlewidth=\dimen189
\titlewidthlast=\dimen190
\titlewidthfirst=\dimen191
) (./pseudocodes.aux)
\openout1 = `pseudocodes.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for TU/lmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 14.
LaTeX Font Info: ... okay on input line 14.
ABD: EverySelectfont initializing macros
LaTeX Info: Redefining \selectfont on input line 14.
Package fontspec Info: Adjusting the maths setup (use [no-math] to avoid
(fontspec) this).
\symlegacymaths=\mathgroup6
LaTeX Font Info: Overwriting symbol font `legacymaths' in version `bold'
(Font) OT1/cmr/m/n --> OT1/cmr/bx/n on input line 14.
LaTeX Font Info: Redeclaring math accent \acute on input line 14.
LaTeX Font Info: Redeclaring math accent \grave on input line 14.
LaTeX Font Info: Redeclaring math accent \ddot on input line 14.
LaTeX Font Info: Redeclaring math accent \tilde on input line 14.
LaTeX Font Info: Redeclaring math accent \bar on input line 14.
LaTeX Font Info: Redeclaring math accent \breve on input line 14.
LaTeX Font Info: Redeclaring math accent \check on input line 14.
LaTeX Font Info: Redeclaring math accent \hat on input line 14.
LaTeX Font Info: Redeclaring math accent \dot on input line 14.
LaTeX Font Info: Redeclaring math accent \mathring on input line 14.
LaTeX Font Info: Redeclaring math symbol \Gamma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Delta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Theta on input line 14.
LaTeX Font Info: Redeclaring math symbol \Lambda on input line 14.
LaTeX Font Info: Redeclaring math symbol \Xi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Pi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Sigma on input line 14.
LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 14.
LaTeX Font Info: Redeclaring math symbol \Phi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Psi on input line 14.
LaTeX Font Info: Redeclaring math symbol \Omega on input line 14.
LaTeX Font Info: Redeclaring math symbol \mathdollar on input line 14.
LaTeX Font Info: Redeclaring symbol font `operators' on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `normal' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) OT1/cmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Encoding `OT1' has changed to `TU' for symbol font
(Font) `operators' in the math version `bold' on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) OT1/cmr/bx/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `normal'
(Font) TU/lmr/m/n --> TU/lmr/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `normal'
(Font) OT1/cmr/m/it --> TU/lmr/m/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathbf' in version `normal'
(Font) OT1/cmr/bx/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `normal'
(Font) OT1/cmss/m/n --> TU/lmss/m/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `normal'
(Font) OT1/cmtt/m/n --> TU/lmtt/m/n on input line 14.
LaTeX Font Info: Overwriting symbol font `operators' in version `bold'
(Font) TU/lmr/m/n --> TU/lmr/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathit' in version `bold'
(Font) OT1/cmr/bx/it --> TU/lmr/b/it on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathsf' in version `bold'
(Font) OT1/cmss/bx/n --> TU/lmss/b/n on input line 14.
LaTeX Font Info: Overwriting math alphabet `\mathtt' in version `bold'
(Font) OT1/cmtt/m/n --> TU/lmtt/b/n on input line 14.
Package hyperref Info: Link coloring OFF on input line 14.
(/usr/local/texlive/2021/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2021-04-02 v2.47 Cross-referencing by name of section
(/usr/local/texlive/2021/texmf-dist/tex/latex/refcount/refcount.sty
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
) (/usr/local/texlive/2021/texmf-dist/tex/generic/gettitlestring/gettitlestring.sty
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
)
\c@section@level=\count313
)
LaTeX Info: Redefining \ref on input line 14.
LaTeX Info: Redefining \pageref on input line 14.
LaTeX Info: Redefining \nameref on input line 14.
(./pseudocodes.out) (./pseudocodes.out)
\@outlinefile=\write3
\openout3 = `pseudocodes.out'.
(./pseudocodes.toc)
\tf@toc=\write4
\openout4 = `pseudocodes.toc'.
LaTeX Font Info: Font shape `TU/SongtiSCLight(0)/m/sl' in size <10.95> not available
(Font) Font shape `TU/SongtiSCLight(0)/m/it' tried instead on input line 17.
[1
]
Package hyperref Info: bookmark level for unknown algorithm defaults to 0 on input line 22.
[2
]
LaTeX Font Info: Trying to load font information for U+msa on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsa.fd
File: umsa.fd 2013/01/14 v3.01 AMS symbols A
)
LaTeX Font Info: Trying to load font information for U+msb on input line 32.
(/usr/local/texlive/2021/texmf-dist/tex/latex/amsfonts/umsb.fd
File: umsb.fd 2013/01/14 v3.01 AMS symbols B
) [3
] [4
] [5
] [6
] [7
] [8
]
Overfull \hbox (32.54117pt too wide) in paragraph at lines 212--212
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^R\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 Q[] [] []$|
[]
Overfull \hbox (15.41673pt too wide) in paragraph at lines 213--213
[][]$[]\OML/cmm/m/it/9 J[]\OT1/cmr/m/n/9 (\OML/cmm/m/it/9 ^^^\OT1/cmr/m/n/9 ) = \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 ^^K [] [] \OT1/cmr/m/n/9 + [] \OMS/cmsy/m/n/9 r[]\OML/cmm/m/it/9 f[] []$\TU/lmr/m/n/9 ,$[][] \OT1/cmr/m/n/9 =
[]
[9
] (./pseudocodes.aux)
Package rerunfilecheck Info: File `pseudocodes.out' has not changed.
(rerunfilecheck) Checksum: 35B5A79A86EF3BC70F1A0B3BCBEBAA13;724.
)
Here is how much of TeX's memory you used:
14827 strings out of 476919
313456 string characters out of 5821840
653576 words of memory out of 5000000
34576 multiletter control sequences out of 15000+600000
413609 words of font info for 91 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
101i,13n,104p,676b,697s stack positions out of 5000i,500n,10000p,200000b,80000s
Output written on pseudocodes.pdf (9 pages).


@@ -1,8 +0,0 @@
\BOOKMARK [1][-]{section.1}{\376\377\152\041\162\110\131\007\165\050}{}% 1
\BOOKMARK [1][-]{section.2}{\376\377\000Q\000\040\000l\000e\000a\000r\000n\000i\000n\000g\173\227\154\325}{}% 2
\BOOKMARK [1][-]{section.3}{\376\377\000S\000a\000r\000s\000a\173\227\154\325}{}% 3
\BOOKMARK [1][-]{section.4}{\376\377\000P\000o\000l\000i\000c\000y\000\040\000G\000r\000a\000d\000i\000e\000n\000t\173\227\154\325}{}% 4
\BOOKMARK [1][-]{section.5}{\376\377\000D\000Q\000N\173\227\154\325}{}% 5
\BOOKMARK [1][-]{section.6}{\376\377\000S\000o\000f\000t\000Q\173\227\154\325}{}% 6
\BOOKMARK [1][-]{section.7}{\376\377\000S\000A\000C\000-\000S\173\227\154\325}{}% 7
\BOOKMARK [1][-]{section.8}{\376\377\000S\000A\000C\173\227\154\325}{}% 8


@@ -11,6 +11,27 @@
 \usepackage{float} % 调用该包能够使用[H]
 % \pagestyle{plain} % 去除页眉但是保留页脚编号都去掉plain换empty
+% 更改脚注为圆圈
+\usepackage{pifont}
+\makeatletter
+\newcommand*{\circnum}[1]{%
+    \expandafter\@circnum\csname c@#1\endcsname
+}
+\newcommand*{\@circnum}[1]{%
+    \ifnum#1<1 %
+        \@ctrerr
+    \else
+        \ifnum#1>20 %
+            \@ctrerr
+        \else
+            \ding{\the\numexpr 171+(#1)\relax}%
+        \fi
+    \fi
+}
+\makeatother
+\renewcommand*{\thefootnote}{\circnum{footnote}}
 \begin{document}
 \tableofcontents % 目录注意要运行两下或者vscode保存两下才能显示
 % \singlespacing
@@ -69,27 +90,10 @@
 \end{algorithm}
 \footnotetext[1]{Reinforcement Learning: An Introduction}
 \clearpage
-\section{Policy Gradient算法}
-\begin{algorithm}[H] % [H]固定位置
-\floatname{algorithm}{{REINFORCE算法Monte-Carlo Policy Gradient}\footnotemark[1]}
-\renewcommand{\thealgorithm}{} % 去掉算法标号
-\caption{}
-\begin{algorithmic}[1] % [1]显示步数
-\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
-\FOR {回合数 = $1,M$}
-\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
-\FOR {时步 = $1,t$}
-\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
-\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
-\ENDFOR
-\ENDFOR
-\end{algorithmic}
-\end{algorithm}
-\footnotetext[1]{Reinforcement Learning: An Introduction}
-\clearpage
 \section{DQN算法}
 \begin{algorithm}[H] % [H]固定位置
-\floatname{algorithm}{{DQN算法}{\hypersetup{linkcolor=white}\footnotemark}}
+\floatname{algorithm}{{DQN算法}\footnotemark[1]}
 \renewcommand{\thealgorithm}{} % 去掉算法标号
 \caption{}
 \renewcommand{\algorithmicrequire}{\textbf{输入:}}
@@ -109,10 +113,10 @@
 \STATE 更新环境状态$s_{t+1} \leftarrow s_t$
 \STATE {\bfseries 更新策略:}
 \STATE$D$中采样一个batch的transition
-\STATE 计算实际的$Q$值,即$y_{j}${\hypersetup{linkcolor=white}\footnotemark}
-\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降{\hypersetup{linkcolor=white}\footnotemark}
+\STATE 计算实际的$Q$值,即$y_{j}$\footnotemark[2]
+\STATE 对损失 $L(\theta)=\left(y_{i}-Q\left(s_{i}, a_{i} ; \theta\right)\right)^{2}$关于参数$\theta$做随机梯度下降\footnotemark[3]
 \ENDFOR
-\STATE$C$个回合复制参数$\hat{Q}\leftarrow Q${\hypersetup{linkcolor=white}\footnotemark}
+\STATE$C$个回合复制参数$\hat{Q}\leftarrow Q$\footnotemark[4]
 \ENDFOR
 \end{algorithmic}
 \end{algorithm}
@@ -121,7 +125,46 @@
 \footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
 \footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步但没有每$C$个回合稳定}
 \clearpage
+\section{Policy Gradient算法}
+\begin{algorithm}[H] % [H]固定位置
+\floatname{algorithm}{{REINFORCE算法Monte-Carlo Policy Gradient}\footnotemark[1]}
+\renewcommand{\thealgorithm}{} % 去掉算法标号
+\caption{}
+\begin{algorithmic}[1] % [1]显示步数
+\STATE 初始化策略参数$\boldsymbol{\theta} \in \mathbb{R}^{d^{\prime}}($ e.g., to $\mathbf{0})$
+\FOR {回合数 = $1,M$}
+\STATE 根据策略$\pi(\cdot \mid \cdot, \boldsymbol{\theta})$采样一个(或几个)回合的transition
+\FOR {时步 = $1,t$}
+\STATE 计算回报$G \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} R_{k}$
+\STATE 更新策略$\boldsymbol{\theta} \leftarrow {\boldsymbol{\theta}+\alpha \gamma^{t}} G \nabla \ln \pi\left(A_{t} \mid S_{t}, \boldsymbol{\theta}\right)$
+\ENDFOR
+\ENDFOR
+\end{algorithmic}
+\end{algorithm}
+\footnotetext[1]{Reinforcement Learning: An Introduction}
+\clearpage
+\section{Advantage Actor Critic算法}
+\begin{algorithm}[H] % [H]固定位置
+\floatname{algorithm}{{Q Actor Critic算法}}
+\renewcommand{\thealgorithm}{} % 去掉算法标号
+\caption{}
+\begin{algorithmic}[1] % [1]显示步数
+\STATE 初始化Actor参数$\theta$和Critic参数$w$
+\FOR {回合数 = $1,M$}
+\STATE 根据策略$\pi_{\theta}(a|s)$采样一个(或几个)回合的transition
+\STATE {\bfseries 更新Critic参数\footnotemark[1]}
+\FOR {时步 = $t+1,1$}
+\STATE 计算Advantage$ \delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1})-Q_w(s_t,a_t)$
+\STATE $w \leftarrow w+\alpha_{w} \delta_{t} \nabla_{w} Q_w(s_t,a_t)$
+\STATE $a_t \leftarrow a_{t+1}$,$s_t \leftarrow s_{t+1}$
+\ENDFOR
+\STATE 更新Actor参数$\theta \leftarrow \theta+\alpha_{\theta} Q_{w}(s, a) \nabla_{\theta} \log \pi_{\theta}(a \mid s)$
+\ENDFOR
+\end{algorithmic}
+\end{algorithm}
+\footnotetext[1]{这里结合TD error的特性,按照从$t+1$到$1$计算Advantage更方便}
+\clearpage
 \section{SoftQ算法}
 \begin{algorithm}[H]
 \floatname{algorithm}{{SoftQ算法}}
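
The Advantage Actor Critic pseudocode added above trains the critic from the TD error $\delta_t = r_t + \gamma Q_w(s_{t+1},a_{t+1}) - Q_w(s_t,a_t)$ and the actor from the policy gradient weighted by $Q_w(s,a)$. Below is a minimal PyTorch sketch of those two updates for a single transition, assuming hypothetical `actor` and `critic` modules that map a 1-D state tensor to action probabilities and per-action values respectively (an illustration of the pseudocode, not the repository's implementation):

import torch

def q_actor_critic_step(actor, critic, actor_optim, critic_optim, transition, gamma=0.99):
    # transition: (state, action, reward, next_state, next_action); states are 1-D float tensors
    state, action, reward, next_state, next_action = transition
    # critic update: w <- w + alpha_w * delta_t * grad_w Q_w(s_t, a_t)
    q_sa = critic(state)[action]  # Q_w(s_t, a_t)
    with torch.no_grad():
        td_error = reward + gamma * critic(next_state)[next_action] - q_sa  # delta_t, no gradient
    critic_loss = -td_error * q_sa  # descending this loss ascends along delta_t * grad_w Q_w
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()
    # actor update: theta <- theta + alpha_theta * Q_w(s,a) * grad_theta log pi_theta(a|s)
    log_prob = torch.log(actor(state)[action])
    actor_loss = -critic(state)[action].detach() * log_prob
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

The A2C implementation updated in this commit (a2c.py below) takes a related but different route: it collects a whole episode, builds bootstrapped returns, and uses return minus critic value as the advantage.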


@@ -1,8 +0,0 @@
\contentsline {section}{\numberline {1}模版备用}{2}{section.1}%
\contentsline {section}{\numberline {2}Q learning算法}{3}{section.2}%
\contentsline {section}{\numberline {3}Sarsa算法}{4}{section.3}%
\contentsline {section}{\numberline {4}Policy Gradient算法}{5}{section.4}%
\contentsline {section}{\numberline {5}DQN算法}{6}{section.5}%
\contentsline {section}{\numberline {6}SoftQ算法}{7}{section.6}%
\contentsline {section}{\numberline {7}SAC-S算法}{8}{section.7}%
\contentsline {section}{\numberline {8}SAC算法}{9}{section.8}%


@@ -1,56 +1,60 @@
-#!/usr/bin/env python
-# coding=utf-8
-'''
-Author: JiangJi
-Email: johnjim0816@gmail.com
-Date: 2021-05-03 22:16:08
-LastEditor: JiangJi
-LastEditTime: 2022-07-20 23:54:40
-Discription:
-Environment:
-'''
 import torch
-import torch.optim as optim
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.distributions import Categorical
-class ActorCritic(nn.Module):
-    ''' A2C网络模型包含一个Actor和Critic
-    '''
-    def __init__(self, input_dim, output_dim, hidden_dim):
-        super(ActorCritic, self).__init__()
-        self.critic = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, 1)
-        )
-        self.actor = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, output_dim),
-            nn.Softmax(dim=1),
-        )
-    def forward(self, x):
-        value = self.critic(x)
-        probs = self.actor(x)
-        dist = Categorical(probs)
-        return dist, value
+import numpy as np
 class A2C:
-    ''' A2C算法
-    '''
-    def __init__(self,n_states,n_actions,cfg) -> None:
-        self.gamma = cfg.gamma
-        self.device = torch.device(cfg.device)
-        self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
-        self.optimizer = optim.Adam(self.model.parameters())
-    def compute_returns(self,next_value, rewards, masks):
-        R = next_value
-        returns = []
-        for step in reversed(range(len(rewards))):
-            R = rewards[step] + self.gamma * R * masks[step]
-            returns.insert(0, R)
-        return returns
+    def __init__(self,models,memories,cfg):
+        self.n_actions = cfg['n_actions']
+        self.gamma = cfg['gamma']
+        self.device = torch.device(cfg['device'])
+        self.memory = memories['ACMemory']
+        self.actor = models['Actor'].to(self.device)
+        self.critic = models['Critic'].to(self.device)
+        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=cfg['actor_lr'])
+        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=cfg['critic_lr'])
+    def sample_action(self,state):
+        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+        dist = self.actor(state)
+        value = self.critic(state) # note that 'dist' needs requires_grad=True
+        value = value.detach().numpy().squeeze(0)[0]
+        action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p) = (n_actions,)
+        return action,value,dist
+    def predict_action(self,state):
+        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+        dist = self.actor(state)
+        value = self.critic(state) # note that 'dist' needs requires_grad=True
+        value = value.detach().numpy().squeeze(0)[0]
+        action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p) = (n_actions,)
+        return action,value,dist
+    def update(self,next_state,entropy):
+        value_pool,log_prob_pool,reward_pool = self.memory.sample()
+        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
+        next_value = self.critic(next_state)
+        returns = np.zeros_like(reward_pool)
+        for t in reversed(range(len(reward_pool))):
+            next_value = reward_pool[t] + self.gamma * next_value # G(s_t,a_t) = r_{t+1} + gamma * V(s_{t+1})
+            returns[t] = next_value
+        returns = torch.tensor(returns, device=self.device)
+        value_pool = torch.tensor(value_pool, device=self.device)
+        advantages = returns - value_pool
+        log_prob_pool = torch.stack(log_prob_pool)
+        actor_loss = (-log_prob_pool * advantages).mean()
+        critic_loss = 0.5 * advantages.pow(2).mean()
+        tot_loss = actor_loss + critic_loss + 0.001 * entropy
+        self.actor_optim.zero_grad()
+        self.critic_optim.zero_grad()
+        tot_loss.backward()
+        self.actor_optim.step()
+        self.critic_optim.step()
+        self.memory.clear()
+    def save_model(self, path):
+        from pathlib import Path
+        # create path
+        Path(path).mkdir(parents=True, exist_ok=True)
+        torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
+        torch.save(self.critic.state_dict(), f"{path}/critic_checkpoint.pt")
+    def load_model(self, path):
+        self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
+        self.critic.load_state_dict(torch.load(f"{path}/critic_checkpoint.pt"))
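
In the rewritten `update()` above, the returns are built backwards from the critic's estimate of the final `next_state`: $G_t = r_{t+1} + \gamma G_{t+1}$, seeded with $V(s_{T+1})$, and the advantage is then $G_t - V(s_t)$. A tiny standalone illustration of that backwards loop with made-up numbers (not repository code):

import numpy as np

rewards = [1.0, 1.0, 1.0]   # rewards of a toy 3-step episode
next_value = 0.5            # critic's value estimate for the state after the last step
gamma = 0.99
returns = np.zeros_like(rewards)
g = next_value
for t in reversed(range(len(rewards))):
    g = rewards[t] + gamma * g   # G_t = r_{t+1} + gamma * G_{t+1}
    returns[t] = g
print(returns)  # [3.455..., 2.480..., 1.495]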


@@ -0,0 +1,55 @@
import torch
import numpy as np
class A2C_2:
    def __init__(self,models,memories,cfg):
        self.n_actions = cfg['n_actions']
        self.gamma = cfg['gamma']
        self.device = torch.device(cfg['device'])
        self.memory = memories['ACMemory']
        self.ac_net = models['ActorCritic'].to(self.device)
        self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
    def sample_action(self,state):
        state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        value, dist = self.ac_net(state) # note that 'dist' needs requires_grad=True
        value = value.detach().numpy().squeeze(0)[0]
        action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0)) # shape(p) = (n_actions,)
        return action,value,dist
    def predict_action(self,state):
        ''' predict can be entirely wrapped in no_grad(), so detach() is not needed; alternatively, just copy the contents of 'sample_action'
        '''
        with torch.no_grad():
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            value, dist = self.ac_net(state)
            value = value.numpy().squeeze(0)[0] # shape(value) = (1,)
            action = np.random.choice(self.n_actions, p=dist.numpy().squeeze(0)) # shape(p) = (n_actions,)
        return action,value,dist
    def update(self,next_state,entropy):
        value_pool,log_prob_pool,reward_pool = self.memory.sample()
        next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
        next_value,_ = self.ac_net(next_state)
        returns = np.zeros_like(reward_pool)
        for t in reversed(range(len(reward_pool))):
            next_value = reward_pool[t] + self.gamma * next_value # G(s_t,a_t) = r_{t+1} + gamma * V(s_{t+1})
            returns[t] = next_value
        returns = torch.tensor(returns, device=self.device)
        value_pool = torch.tensor(value_pool, device=self.device)
        advantages = returns - value_pool
        log_prob_pool = torch.stack(log_prob_pool)
        actor_loss = (-log_prob_pool * advantages).mean()
        critic_loss = 0.5 * advantages.pow(2).mean()
        ac_loss = actor_loss + critic_loss + 0.001 * entropy
        self.ac_optimizer.zero_grad()
        ac_loss.backward()
        self.ac_optimizer.step()
        self.memory.clear()
    def save_model(self, path):
        from pathlib import Path
        # create path
        Path(path).mkdir(parents=True, exist_ok=True)
        torch.save(self.ac_net.state_dict(), f"{path}/a2c_checkpoint.pt")
    def load_model(self, path):
        self.ac_net.load_state_dict(torch.load(f"{path}/a2c_checkpoint.pt"))
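
A2C_2 is the single-network variant: `self.ac_net(state)` returns `(value, dist)` from one module. The `ActorCriticSoftmax` model it expects comes from `common/models` and is not shown in this diff; the sketch below is only an assumption of what such a module could look like, not the repository's definition:

import torch.nn as nn

class ActorCriticSketch(nn.Module):
    # hypothetical stand-in for common.models.ActorCriticSoftmax
    def __init__(self, n_states, n_actions, actor_hidden_dim=256, critic_hidden_dim=256):
        super().__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_states, critic_hidden_dim), nn.ReLU(),
            nn.Linear(critic_hidden_dim, 1),
        )
        self.actor = nn.Sequential(
            nn.Linear(n_states, actor_hidden_dim), nn.ReLU(),
            nn.Linear(actor_hidden_dim, n_actions), nn.Softmax(dim=1),
        )
    def forward(self, x):
        value = self.critic(x)  # state value, shape (batch, 1)
        dist = self.actor(x)    # action probabilities, shape (batch, n_actions)
        return value, dist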

projects/codes/A2C/main.py

@@ -0,0 +1,121 @@
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
import gym
import torch
import numpy as np
from common.utils import all_seed
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorSoftmax,Critic
from envs.register import register_env
from a2c import A2C
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=1600,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--actor_lr',default=3e-4,type=float,help="learning rate of actor")
parser.add_argument('--critic_lr',default=1e-3,type=float,help="learning rate of critic")
parser.add_argument('--actor_hidden_dim',default=256,type=int,help="hidden of actor net")
parser.add_argument('--critic_hidden_dim',default=256,type=int,help="hidden of critic net")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
models = {'Actor':ActorSoftmax(cfg['n_states'],cfg['n_actions'], hidden_dim = cfg['actor_hidden_dim']),'Critic':Critic(cfg['n_states'],1,hidden_dim=cfg['critic_hidden_dim'])}
memories = {'ACMemory':PGReplay()}
agent = A2C(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action, value, dist = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
agent.memory.push((value,log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
agent.update(next_state,ep_entropy) # update agent
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action,_,_ = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()
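
The entropy bonus passed to `agent.update()` is accumulated per step from the probability tensor `dist`. For a categorical policy the entropy is $H(\pi(\cdot|s)) = -\sum_a \pi(a|s)\log\pi(a|s)$; a small sketch of that quantity computed from the same probabilities (an illustration only, not a change to the training loop above):

import numpy as np

def categorical_entropy(probs):
    # probs: 1-D array of action probabilities, e.g. dist.detach().numpy().squeeze(0)
    probs = np.asarray(probs, dtype=np.float64)
    return float(-np.sum(probs * np.log(probs + 1e-12)))  # small epsilon guards against log(0)

print(categorical_entropy([0.5, 0.5]))  # ~0.693, i.e. ln(2)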

projects/codes/A2C/main2.py

@@ -0,0 +1,120 @@
import sys,os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # avoid "OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized."
curr_path = os.path.dirname(os.path.abspath(__file__)) # current path
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add path to system path
import datetime
import argparse
import gym
import torch
import numpy as np
from common.utils import all_seed
from common.launcher import Launcher
from common.memories import PGReplay
from common.models import ActorCriticSoftmax
from envs.register import register_env
from a2c_2 import A2C_2
class Main(Launcher):
def get_args(self):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='A2C',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=2000,type=int,help="episodes of training")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--lr',default=3e-4,type=float,help="learning rate")
parser.add_argument('--actor_hidden_dim',default=256,type=int)
parser.add_argument('--critic_hidden_dim',default=256,type=int)
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--seed',default=10,type=int,help="seed")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args()
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
}
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(self,cfg):
''' create env and agent
'''
register_env(cfg['env_name'])
env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed
all_seed(env,seed=cfg["seed"])
try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
models = {'ActorCritic':ActorCriticSoftmax(cfg['n_states'],cfg['n_actions'], actor_hidden_dim = cfg['actor_hidden_dim'],critic_hidden_dim=cfg['critic_hidden_dim'])}
memories = {'ACMemory':PGReplay()}
agent = A2C_2(models,memories,cfg)
return env,agent
def train(self,cfg,env,agent):
print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode
ep_step = 0 # step per episode
ep_entropy = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action, value, dist = agent.sample_action(state) # sample action
next_state, reward, done, _ = env.step(action) # update env and return transitions
log_prob = torch.log(dist.squeeze(0)[action])
entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))
agent.memory.push((value,log_prob,reward)) # save transitions
state = next_state # update state
ep_reward += reward
ep_entropy += entropy
ep_step += 1
if done:
break
agent.update(next_state,ep_entropy) # update agent
rewards.append(ep_reward)
steps.append(ep_step)
if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(self,cfg,env,agent):
print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode
ep_step = 0
state = env.reset() # reset and obtain initial state
for _ in range(cfg['ep_max_steps']):
action,_,_ = agent.predict_action(state) # predict action
next_state, reward, done, _ = env.step(action)
state = next_state
ep_reward += reward
ep_step += 1
if done:
break
rewards.append(ep_reward)
steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__":
main = Main()
main.run()


@@ -0,0 +1,19 @@
{
"algo_name": "A2C",
"env_name": "CartPole-v0",
"train_eps": 2000,
"test_eps": 20,
"ep_max_steps": 100000,
"gamma": 0.99,
"lr": 0.0003,
"actor_hidden_dim": 256,
"critic_hidden_dim": 256,
"device": "cpu",
"seed": 10,
"show_fig": false,
"save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-135818/models/",
"n_states": 4,
"n_actions": 2
}
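
The hyperparameters of a finished run are dumped as JSON like the file above, so a run can be inspected or reproduced later. Assuming a plain JSON file on disk (the path and filename below are examples only, not taken from this diff), reading it back is straightforward:

import json

# hypothetical path; substitute the JSON file written for your own run
with open("outputs/CartPole-v0/20220829-135818/results/params.json") as f:
    cfg = json.load(f)
print(cfg["algo_name"], cfg["env_name"], cfg["gamma"], cfg["n_actions"])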

Binary file not shown.



@@ -0,0 +1,21 @@
episodes,rewards,steps
0,200.0,200
1,200.0,200
2,93.0,93
3,155.0,155
4,116.0,116
5,200.0,200
6,190.0,190
7,176.0,176
8,200.0,200
9,200.0,200
10,200.0,200
11,179.0,179
12,200.0,200
13,185.0,185
14,191.0,191
15,200.0,200
16,200.0,200
17,124.0,124
18,200.0,200
19,172.0,172

Binary file not shown.



@@ -0,0 +1 @@
{"algo_name": "A2C", "env_name": "CartPole-v0", "train_eps": 1600, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "actor_lr": 0.0003, "critic_lr": 0.001, "actor_hidden_dim": 256, "critic_hidden_dim": 256, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/A2C/outputs/CartPole-v0/20220829-143327/models/", "n_states": 4, "n_actions": 2}

Binary file not shown.



@@ -0,0 +1,21 @@
episodes,rewards,steps
0,177.0,177
1,180.0,180
2,200.0,200
3,200.0,200
4,167.0,167
5,124.0,124
6,128.0,128
7,200.0,200
8,200.0,200
9,200.0,200
10,186.0,186
11,187.0,187
12,200.0,200
13,176.0,176
14,200.0,200
15,200.0,200
16,200.0,200
17,200.0,200
18,185.0,185
19,180.0,180

Binary file not shown.


projects/codes/A3C/a3c.py

@@ -0,0 +1,56 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: JiangJi
Email: johnjim0816@gmail.com
Date: 2021-05-03 22:16:08
LastEditor: JiangJi
LastEditTime: 2022-07-20 23:54:40
Discription:
Environment:
'''
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
class ActorCritic(nn.Module):
''' A2C网络模型包含一个Actor和Critic
'''
def __init__(self, input_dim, output_dim, hidden_dim):
super(ActorCritic, self).__init__()
self.critic = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, 1)
)
self.actor = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, output_dim),
nn.Softmax(dim=1),
)
def forward(self, x):
value = self.critic(x)
probs = self.actor(x)
dist = Categorical(probs)
return dist, value
class A2C:
''' A2C algorithm
'''
def __init__(self,n_states,n_actions,cfg) -> None:
self.gamma = cfg.gamma
self.device = torch.device(cfg.device)
self.model = ActorCritic(n_states, n_actions, cfg.hidden_size).to(self.device)
self.optimizer = optim.Adam(self.model.parameters())
def compute_returns(self,next_value, rewards, masks):
R = next_value
returns = []
for step in reversed(range(len(rewards))):
R = rewards[step] + self.gamma * R * masks[step]
returns.insert(0, R)
return returns
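For orientation, here is a rough, illustrative usage sketch (not part of the committed file) of how the ActorCritic model and compute_returns above would typically be wired into one n-step A2C update on CartPole. The SimpleNamespace cfg, the 8-step rollout length, and the 0.5 critic-loss weight are assumptions, not values taken from this repo.

from types import SimpleNamespace
import gym
import torch

cfg = SimpleNamespace(gamma=0.99, device="cpu", hidden_size=256)   # hypothetical config object
env = gym.make("CartPole-v0")
agent = A2C(n_states=4, n_actions=2, cfg=cfg)

state = env.reset()
log_probs, values, rewards, masks = [], [], [], []
for _ in range(8):  # short n-step rollout
    s = torch.FloatTensor(state).unsqueeze(0)                  # shape [1, n_states]
    dist, value = agent.model(s)
    action = dist.sample()
    next_state, reward, done, _ = env.step(action.item())
    log_probs.append(dist.log_prob(action))                    # [1]
    values.append(value)                                       # [1, 1]
    rewards.append(torch.tensor([[float(reward)]]))
    masks.append(torch.tensor([[1.0 - float(done)]]))
    state = env.reset() if done else next_state

with torch.no_grad():
    _, next_value = agent.model(torch.FloatTensor(state).unsqueeze(0))
returns = agent.compute_returns(next_value, rewards, masks)    # discounted n-step returns

log_probs = torch.cat(log_probs)                # [n_steps]
returns = torch.cat(returns).detach()           # [n_steps, 1]
values = torch.cat(values)                      # [n_steps, 1]
advantage = (returns - values).squeeze(1)       # A(s,a) ≈ R - V(s)
actor_loss = -(log_probs * advantage.detach()).mean()
critic_loss = advantage.pow(2).mean()

agent.optimizer.zero_grad()
(actor_loss + 0.5 * critic_loss).backward()
agent.optimizer.step()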


@@ -10,7 +10,7 @@ import torch.optim as optim
 import datetime
 import argparse
 from common.multiprocessing_env import SubprocVecEnv
-from a2c import ActorCritic
+from a3c import ActorCritic
 from common.utils import save_results, make_dir
 from common.utils import plot_rewards, save_args


@@ -24,6 +24,7 @@ def get_args():
     parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
     parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
     parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+    parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
     parser.add_argument('--gamma',default=0.95,type=float,help="discounted factor")
     parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon")
     parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
@@ -72,7 +73,7 @@ def train(cfg, env, agent):
         ep_reward = 0  # reward per episode
         ep_step = 0
         state = env.reset()  # reset and obtain initial state
-        while True:
+        for _ in range(cfg['ep_max_steps']):
            ep_step += 1
            action = agent.sample_action(state)  # sample action
            next_state, reward, done, _ = env.step(action)  # update env and return transitions
@@ -91,7 +92,7 @@ def train(cfg, env, agent):
            print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}: Epislon: {agent.epsilon:.3f}')
    print("Finish training!")
    env.close()
-    res_dic = {'episodes':range(len(rewards)),'rewards':rewards}
+    res_dic = {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
    return res_dic
 def test(cfg, env, agent):
@@ -103,7 +104,7 @@ def test(cfg, env, agent):
         ep_reward = 0  # reward per episode
         ep_step = 0
         state = env.reset()  # reset and obtain initial state
-        while True:
+        for _ in range(cfg['ep_max_steps']):
            ep_step+=1
            action = agent.predict_action(state)  # predict action
            next_state, reward, done, _ = env.step(action)
@@ -116,7 +117,7 @@ def test(cfg, env, agent):
            print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
    print("Finish testing!")
    env.close()
-    return {'episodes':range(len(rewards)),'rewards':rewards}
+    return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
 if __name__ == "__main__":
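The same edit recurs across the task scripts in this commit: the open-ended while True episode loop is replaced by a bounded loop whose limit is large enough to behave like an infinite horizon while still guaranteeing termination. A generic, runnable sketch of the resulting loop shape, using a random action as a stand-in for the repo's agent:

import gym

env = gym.make("CartPole-v0")
ep_max_steps = 100000                      # large enough to act like an unbounded episode
state = env.reset()
ep_reward, ep_step = 0, 0
for _ in range(ep_max_steps):              # hard cap instead of `while True`
    ep_step += 1
    action = env.action_space.sample()     # random stand-in for agent.sample_action(state)
    next_state, reward, done, _ = env.step(action)
    ep_reward += reward
    state = next_state
    if done:                               # episodes still end early once the env says so
        break
print(ep_reward, ep_step)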


@@ -0,0 +1 @@
{"algo_name": "DQN", "env_name": "CartPole-v1", "train_eps": 2000, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 6000, "lr": 1e-05, "memory_capacity": 200000, "batch_size": 64, "target_update": 4, "hidden_dim": 256, "device": "cuda", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/results", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\DQN/outputs/CartPole-v1/20220828-214702/models", "n_states": 4, "n_actions": 2}


@@ -0,0 +1,21 @@
episodes,rewards,steps
0,371.0,371
1,446.0,446
2,300.0,300
3,500.0,500
4,313.0,313
5,500.0,500
6,341.0,341
7,489.0,489
8,304.0,304
9,358.0,358
10,278.0,278
11,500.0,500
12,500.0,500
13,500.0,500
14,500.0,500
15,476.0,476
16,308.0,308
17,394.0,394
18,500.0,500
19,500.0,500


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:21:53
 LastEditor: John
-LastEditTime: 2022-08-25 20:59:23
+LastEditTime: 2022-08-27 00:04:08
 Discription: 
 Environment: 
 '''
@@ -34,7 +34,7 @@ class PGNet(MLP):
     def forward(self, x):
         x = F.relu(self.fc1(x))
         x = F.relu(self.fc2(x))
-        x = F.sigmoid(self.fc3(x))
+        x = torch.sigmoid(self.fc3(x))
         return x
 class Main(Launcher):
@@ -47,8 +47,9 @@ class Main(Launcher):
         parser.add_argument('--env_name',default='CartPole-v0',type=str,help="name of environment")
         parser.add_argument('--train_eps',default=200,type=int,help="episodes of training")
         parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing")
+        parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
         parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
-        parser.add_argument('--lr',default=0.005,type=float,help="learning rate")
+        parser.add_argument('--lr',default=0.01,type=float,help="learning rate")
         parser.add_argument('--update_fre',default=8,type=int)
         parser.add_argument('--hidden_dim',default=36,type=int)
         parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
@@ -81,7 +82,7 @@ class Main(Launcher):
         for i_ep in range(cfg['train_eps']):
             state = env.reset()
             ep_reward = 0
-            for _ in count():
+            for _ in range(cfg['ep_max_steps']):
                 action = agent.sample_action(state)  # sample action
                 next_state, reward, done, _ = env.step(action)
                 ep_reward += reward
@@ -90,8 +91,9 @@ class Main(Launcher):
                 agent.memory.push((state,float(action),reward))
                 state = next_state
                 if done:
-                    print(f"Episode: {i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
                     break
+            if (i_ep+1) % 10 == 0:
+                print(f"Episode: {i_ep+1}/{cfg['train_eps']}, Reward:{ep_reward:.2f}")
             if (i_ep+1) % cfg['update_fre'] == 0:
                 agent.update()
             rewards.append(ep_reward)
@@ -107,7 +109,7 @@ class Main(Launcher):
         for i_ep in range(cfg['test_eps']):
             state = env.reset()
             ep_reward = 0
-            for _ in count():
+            for _ in range(cfg['ep_max_steps']):
                 action = agent.predict_action(state)
                 next_state, reward, done, _ = env.step(action)
                 ep_reward += reward
@@ -115,9 +117,9 @@ class Main(Launcher):
                 reward = 0
                 state = next_state
                 if done:
-                    print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
                     break
-            rewards.append(ep_reward)
+            print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Reward: {ep_reward:.2f}")
+            rewards.append(ep_reward)
         print("Finish testing!")
         env.close()
         return {'episodes':range(len(rewards)),'rewards':rewards}


@@ -1 +0,0 @@
{"algo_name": "PolicyGradient", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "gamma": 0.99, "lr": 0.005, "update_fre": 8, "hidden_dim": 36, "device": "cpu", "seed": 1, "save_fig": true, "show_fig": false, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220825-205930/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/PolicyGradient/outputs/CartPole-v0/20220825-205930/models/", "n_states": 4, "n_actions": 2}


@@ -1,201 +0,0 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,38.0
9,40.0
10,23.0
11,14.0
12,16.0
13,25.0
14,15.0
15,23.0
16,11.0
17,28.0
18,21.0
19,62.0
20,33.0
21,27.0
22,15.0
23,17.0
24,26.0
25,35.0
26,26.0
27,14.0
28,42.0
29,45.0
30,34.0
31,39.0
32,31.0
33,17.0
34,42.0
35,41.0
36,31.0
37,39.0
38,28.0
39,12.0
40,36.0
41,33.0
42,47.0
43,40.0
44,63.0
45,36.0
46,64.0
47,79.0
48,49.0
49,40.0
50,65.0
51,47.0
52,51.0
53,30.0
54,26.0
55,41.0
56,86.0
57,61.0
58,38.0
59,200.0
60,49.0
61,70.0
62,61.0
63,101.0
64,200.0
65,152.0
66,108.0
67,46.0
68,72.0
69,87.0
70,27.0
71,126.0
72,46.0
73,25.0
74,14.0
75,42.0
76,38.0
77,55.0
78,42.0
79,51.0
80,67.0
81,83.0
82,178.0
83,115.0
84,140.0
85,97.0
86,85.0
87,61.0
88,153.0
89,200.0
90,200.0
91,200.0
92,200.0
93,64.0
94,200.0
95,200.0
96,157.0
97,128.0
98,160.0
99,35.0
100,140.0
101,113.0
102,200.0
103,154.0
104,200.0
105,200.0
106,200.0
107,198.0
108,137.0
109,200.0
110,200.0
111,102.0
112,200.0
113,200.0
114,200.0
115,200.0
116,148.0
117,200.0
118,200.0
119,200.0
120,200.0
121,200.0
122,194.0
123,200.0
124,200.0
125,200.0
126,183.0
127,200.0
128,200.0
129,200.0
130,200.0
131,200.0
132,200.0
133,200.0
134,200.0
135,200.0
136,93.0
137,96.0
138,84.0
139,103.0
140,79.0
141,104.0
142,82.0
143,105.0
144,200.0
145,200.0
146,171.0
147,200.0
148,200.0
149,200.0
150,200.0
151,197.0
152,133.0
153,142.0
154,147.0
155,156.0
156,131.0
157,181.0
158,163.0
159,146.0
160,200.0
161,176.0
162,200.0
163,173.0
164,177.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,190.0
197,200.0
198,189.0
199,200.0


@@ -0,0 +1 @@
{"algo_name": "PolicyGradient", "env_name": "CartPole-v0", "train_eps": 200, "test_eps": 20, "ep_max_steps": 100000, "gamma": 0.99, "lr": 0.01, "update_fre": 8, "hidden_dim": 36, "device": "cpu", "seed": 1, "save_fig": true, "show_fig": false, "result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/results/", "model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\PolicyGradient/outputs/CartPole-v0/20220827-000433/models/", "n_states": 4, "n_actions": 2}


@@ -1,7 +1,7 @@
 episodes,rewards
 0,200.0
 1,200.0
-2,165.0
+2,200.0
 3,200.0
 4,200.0
 5,200.0
@@ -10,12 +10,12 @@ episodes,rewards
 8,200.0
 9,200.0
 10,200.0
-11,168.0
+11,200.0
 12,200.0
 13,200.0
 14,200.0
-15,115.0
-16,198.0
+15,200.0
+16,200.0
 17,200.0
 18,200.0
 19,200.0


@@ -0,0 +1,201 @@
episodes,rewards
0,26.0
1,53.0
2,10.0
3,37.0
4,22.0
5,21.0
6,12.0
7,34.0
8,93.0
9,36.0
10,29.0
11,18.0
12,14.0
13,62.0
14,20.0
15,40.0
16,10.0
17,10.0
18,10.0
19,11.0
20,10.0
21,14.0
22,12.0
23,8.0
24,19.0
25,33.0
26,22.0
27,32.0
28,16.0
29,24.0
30,24.0
31,24.0
32,75.0
33,33.0
34,33.0
35,72.0
36,110.0
37,48.0
38,60.0
39,43.0
40,61.0
41,34.0
42,50.0
43,61.0
44,53.0
45,58.0
46,36.0
47,44.0
48,42.0
49,64.0
50,67.0
51,52.0
52,39.0
53,42.0
54,40.0
55,33.0
56,200.0
57,199.0
58,149.0
59,185.0
60,134.0
61,174.0
62,162.0
63,200.0
64,93.0
65,72.0
66,69.0
67,51.0
68,62.0
69,98.0
70,73.0
71,73.0
72,200.0
73,200.0
74,200.0
75,200.0
76,200.0
77,200.0
78,200.0
79,133.0
80,200.0
81,200.0
82,200.0
83,200.0
84,200.0
85,200.0
86,200.0
87,200.0
88,114.0
89,151.0
90,129.0
91,156.0
92,112.0
93,172.0
94,171.0
95,141.0
96,200.0
97,200.0
98,200.0
99,200.0
100,200.0
101,200.0
102,200.0
103,200.0
104,188.0
105,199.0
106,138.0
107,200.0
108,200.0
109,181.0
110,145.0
111,200.0
112,135.0
113,119.0
114,112.0
115,122.0
116,118.0
117,119.0
118,131.0
119,119.0
120,109.0
121,96.0
122,105.0
123,29.0
124,110.0
125,113.0
126,18.0
127,90.0
128,145.0
129,152.0
130,151.0
131,109.0
132,141.0
133,109.0
134,136.0
135,143.0
136,200.0
137,200.0
138,200.0
139,200.0
140,200.0
141,200.0
142,200.0
143,200.0
144,192.0
145,173.0
146,180.0
147,182.0
148,186.0
149,175.0
150,176.0
151,191.0
152,200.0
153,200.0
154,200.0
155,200.0
156,200.0
157,200.0
158,200.0
159,200.0
160,200.0
161,200.0
162,200.0
163,200.0
164,200.0
165,200.0
166,200.0
167,200.0
168,200.0
169,200.0
170,200.0
171,200.0
172,200.0
173,200.0
174,200.0
175,200.0
176,200.0
177,200.0
178,200.0
179,200.0
180,200.0
181,200.0
182,200.0
183,200.0
184,200.0
185,200.0
186,200.0
187,200.0
188,200.0
189,200.0
190,200.0
191,200.0
192,200.0
193,200.0
194,200.0
195,200.0
196,200.0
197,200.0
198,200.0
199,200.0


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-11-22 23:27:44
 LastEditor: John
-LastEditTime: 2022-08-25 20:58:59
+LastEditTime: 2022-08-27 13:45:26
 Discription: 
 Environment: 
 '''
@@ -31,8 +31,11 @@ class PolicyGradient:
         state = torch.from_numpy(state).float()
         state = Variable(state)
         probs = self.policy_net(state)
+        print("probs")
+        print(probs)
         m = Bernoulli(probs)  # Bernoulli distribution
         action = m.sample()
         action = action.data.numpy().astype(int)[0]  # convert to a scalar
         return action
     def predict_action(self,state):
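For context, sample_action here draws the action from a Bernoulli distribution over a single sigmoid output rather than a softmax over both actions. A standalone toy sketch of that sampling path is below; the two-layer stand-in network is an assumption, not the repo's PGNet.

import torch
from torch.distributions import Bernoulli

# Toy stand-in policy: maps a 4-dim CartPole state to P(action = 1).
policy_net = torch.nn.Sequential(torch.nn.Linear(4, 1), torch.nn.Sigmoid())

state = torch.randn(4)
prob = policy_net(state)              # tensor of shape [1], value in (0, 1)
m = Bernoulli(prob)                   # two-action policy as a biased coin flip
action = int(m.sample().item())       # 0 or 1, ready for env.step(action)
print(prob.item(), action)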


@@ -5,7 +5,7 @@ Author: John
 Email: johnjim0816@gmail.com
 Date: 2020-09-11 23:03:00
 LastEditor: John
-LastEditTime: 2022-08-25 14:59:15
+LastEditTime: 2022-08-26 22:46:21
 Discription: 
 Environment: 
 '''
@@ -57,7 +57,10 @@ class Main(Launcher):
             env = CliffWalkingWapper(env)
         if cfg['seed'] !=0:  # set random seed
             all_seed(env,seed=cfg["seed"])
-        n_states = env.observation_space.n  # state dimension
+        try:  # state dimension
+            n_states = env.observation_space.n  # print(hasattr(env.observation_space, 'n'))
+        except AttributeError:
+            n_states = env.observation_space.shape[0]  # print(hasattr(env.observation_space, 'shape'))
         n_actions = env.action_space.n  # action dimension
         print(f"n_states: {n_states}, n_actions: {n_actions}")
         cfg.update({"n_states":n_states,"n_actions":n_actions})  # update to cfg paramters
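The try/except above keys off whether the observation space exposes .n (a tabular Discrete space) or only .shape (a vector Box space). An equivalent, slightly more explicit check with isinstance, shown here only as a sketch:

import gym
from gym import spaces

env = gym.make("CartPole-v0")
obs_space = env.observation_space
if isinstance(obs_space, spaces.Discrete):
    n_states = obs_space.n           # tabular state index, e.g. CliffWalking-v0
else:
    n_states = obs_space.shape[0]    # vector observation, e.g. CartPole-v0
print(n_states)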


@@ -1,21 +0,0 @@
episodes,rewards
0,-13
1,-13
2,-13
3,-13
4,-13
5,-13
6,-13
7,-13
8,-13
9,-13
10,-13
11,-13
12,-13
13,-13
14,-13
15,-13
16,-13
17,-13
18,-13
19,-13


@@ -1,401 +0,0 @@
episodes,rewards
0,-2131
1,-1086
2,-586
3,-220
4,-154
5,-122
6,-150
7,-159
8,-164
9,-88
10,-195
11,-114
12,-60
13,-179
14,-101
15,-304
16,-96
17,-119
18,-113
19,-98
20,-106
21,-105
22,-77
23,-51
24,-105
25,-136
26,-100
27,-29
28,-79
29,-114
30,-82
31,-70
32,-75
33,-51
34,-94
35,-52
36,-93
37,-71
38,-73
39,-48
40,-52
41,-96
42,-46
43,-65
44,-57
45,-41
46,-104
47,-51
48,-181
49,-229
50,-39
51,-69
52,-53
53,-59
54,-26
55,-75
56,-31
57,-60
58,-63
59,-40
60,-35
61,-79
62,-42
63,-22
64,-73
65,-71
66,-18
67,-55
68,-29
69,-43
70,-70
71,-49
72,-42
73,-29
74,-81
75,-36
76,-38
77,-36
78,-52
79,-28
80,-42
81,-52
82,-66
83,-31
84,-27
85,-49
86,-28
87,-54
88,-34
89,-35
90,-50
91,-36
92,-36
93,-46
94,-34
95,-135
96,-39
97,-36
98,-26
99,-56
100,-40
101,-40
102,-26
103,-28
104,-31
105,-35
106,-26
107,-57
108,-44
109,-41
110,-31
111,-26
112,-25
113,-41
114,-32
115,-44
116,-30
117,-32
118,-30
119,-25
120,-23
121,-47
122,-24
123,-45
124,-39
125,-21
126,-43
127,-143
128,-26
129,-20
130,-32
131,-16
132,-24
133,-42
134,-25
135,-36
136,-19
137,-29
138,-43
139,-17
140,-150
141,-32
142,-34
143,-19
144,-26
145,-30
146,-31
147,-49
148,-33
149,-21
150,-17
151,-48
152,-34
153,-20
154,-20
155,-26
156,-21
157,-13
158,-40
159,-22
160,-26
161,-30
162,-29
163,-25
164,-26
165,-27
166,-21
167,-29
168,-24
169,-17
170,-22
171,-35
172,-35
173,-18
174,-135
175,-15
176,-23
177,-28
178,-25
179,-24
180,-29
181,-31
182,-24
183,-129
184,-45
185,-24
186,-17
187,-20
188,-21
189,-23
190,-15
191,-32
192,-22
193,-19
194,-17
195,-45
196,-15
197,-14
198,-14
199,-37
200,-23
201,-17
202,-19
203,-21
204,-23
205,-27
206,-14
207,-18
208,-23
209,-34
210,-23
211,-13
212,-25
213,-17
214,-13
215,-21
216,-29
217,-18
218,-24
219,-15
220,-27
221,-25
222,-21
223,-19
224,-17
225,-18
226,-13
227,-22
228,-14
229,-13
230,-29
231,-23
232,-15
233,-15
234,-14
235,-28
236,-25
237,-17
238,-23
239,-29
240,-15
241,-14
242,-15
243,-23
244,-15
245,-16
246,-19
247,-13
248,-16
249,-17
250,-25
251,-30
252,-13
253,-14
254,-15
255,-22
256,-14
257,-17
258,-126
259,-15
260,-21
261,-16
262,-23
263,-14
264,-13
265,-13
266,-19
267,-13
268,-19
269,-17
270,-17
271,-13
272,-19
273,-13
274,-13
275,-16
276,-22
277,-14
278,-15
279,-19
280,-34
281,-13
282,-15
283,-32
284,-13
285,-13
286,-13
287,-14
288,-16
289,-13
290,-13
291,-17
292,-13
293,-13
294,-22
295,-14
296,-15
297,-13
298,-13
299,-13
300,-16
301,-13
302,-14
303,-13
304,-13
305,-13
306,-24
307,-13
308,-13
309,-15
310,-13
311,-13
312,-13
313,-15
314,-13
315,-19
316,-15
317,-17
318,-13
319,-13
320,-13
321,-13
322,-13
323,-15
324,-13
325,-13
326,-13
327,-123
328,-13
329,-13
330,-13
331,-13
332,-13
333,-13
334,-13
335,-13
336,-16
337,-13
338,-23
339,-13
340,-13
341,-13
342,-13
343,-13
344,-13
345,-13
346,-13
347,-13
348,-13
349,-13
350,-134
351,-13
352,-13
353,-13
354,-13
355,-13
356,-13
357,-13
358,-13
359,-13
360,-15
361,-13
362,-13
363,-13
364,-13
365,-13
366,-13
367,-13
368,-13
369,-14
370,-13
371,-13
372,-13
373,-13
374,-13
375,-13
376,-13
377,-124
378,-13
379,-13
380,-13
381,-13
382,-13
383,-13
384,-13
385,-13
386,-13
387,-13
388,-13
389,-121
390,-13
391,-13
392,-13
393,-13
394,-13
395,-13
396,-13
397,-13
398,-17
399,-13


@@ -0,0 +1 @@
{"algo_name": "Q-learning", "env_name": "CliffWalking-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/CliffWalking-v0/20220826-224730/models/", "n_states": 48, "n_actions": 4}


@@ -0,0 +1,21 @@
episodes,rewards,steps
0,-13,13
1,-13,13
2,-13,13
3,-13,13
4,-13,13
5,-13,13
6,-13,13
7,-13,13
8,-13,13
9,-13,13
10,-13,13
11,-13,13
12,-13,13
13,-13,13
14,-13,13
15,-13,13
16,-13,13
17,-13,13
18,-13,13
19,-13,13


@@ -0,0 +1,401 @@
episodes,rewards,steps
0,-2131,448
1,-1086,492
2,-586,388
3,-220,220
4,-154,154
5,-122,122
6,-150,150
7,-159,159
8,-164,164
9,-88,88
10,-195,195
11,-114,114
12,-60,60
13,-179,179
14,-101,101
15,-304,205
16,-96,96
17,-119,119
18,-113,113
19,-98,98
20,-106,106
21,-105,105
22,-77,77
23,-51,51
24,-105,105
25,-136,136
26,-100,100
27,-29,29
28,-79,79
29,-114,114
30,-82,82
31,-70,70
32,-75,75
33,-51,51
34,-94,94
35,-52,52
36,-93,93
37,-71,71
38,-73,73
39,-48,48
40,-52,52
41,-96,96
42,-46,46
43,-65,65
44,-57,57
45,-41,41
46,-104,104
47,-51,51
48,-181,82
49,-229,130
50,-39,39
51,-69,69
52,-53,53
53,-59,59
54,-26,26
55,-75,75
56,-31,31
57,-60,60
58,-63,63
59,-40,40
60,-35,35
61,-79,79
62,-42,42
63,-22,22
64,-73,73
65,-71,71
66,-18,18
67,-55,55
68,-29,29
69,-43,43
70,-70,70
71,-49,49
72,-42,42
73,-29,29
74,-81,81
75,-36,36
76,-38,38
77,-36,36
78,-52,52
79,-28,28
80,-42,42
81,-52,52
82,-66,66
83,-31,31
84,-27,27
85,-49,49
86,-28,28
87,-54,54
88,-34,34
89,-35,35
90,-50,50
91,-36,36
92,-36,36
93,-46,46
94,-34,34
95,-135,36
96,-39,39
97,-36,36
98,-26,26
99,-56,56
100,-40,40
101,-40,40
102,-26,26
103,-28,28
104,-31,31
105,-35,35
106,-26,26
107,-57,57
108,-44,44
109,-41,41
110,-31,31
111,-26,26
112,-25,25
113,-41,41
114,-32,32
115,-44,44
116,-30,30
117,-32,32
118,-30,30
119,-25,25
120,-23,23
121,-47,47
122,-24,24
123,-45,45
124,-39,39
125,-21,21
126,-43,43
127,-143,44
128,-26,26
129,-20,20
130,-32,32
131,-16,16
132,-24,24
133,-42,42
134,-25,25
135,-36,36
136,-19,19
137,-29,29
138,-43,43
139,-17,17
140,-150,51
141,-32,32
142,-34,34
143,-19,19
144,-26,26
145,-30,30
146,-31,31
147,-49,49
148,-33,33
149,-21,21
150,-17,17
151,-48,48
152,-34,34
153,-20,20
154,-20,20
155,-26,26
156,-21,21
157,-13,13
158,-40,40
159,-22,22
160,-26,26
161,-30,30
162,-29,29
163,-25,25
164,-26,26
165,-27,27
166,-21,21
167,-29,29
168,-24,24
169,-17,17
170,-22,22
171,-35,35
172,-35,35
173,-18,18
174,-135,36
175,-15,15
176,-23,23
177,-28,28
178,-25,25
179,-24,24
180,-29,29
181,-31,31
182,-24,24
183,-129,30
184,-45,45
185,-24,24
186,-17,17
187,-20,20
188,-21,21
189,-23,23
190,-15,15
191,-32,32
192,-22,22
193,-19,19
194,-17,17
195,-45,45
196,-15,15
197,-14,14
198,-14,14
199,-37,37
200,-23,23
201,-17,17
202,-19,19
203,-21,21
204,-23,23
205,-27,27
206,-14,14
207,-18,18
208,-23,23
209,-34,34
210,-23,23
211,-13,13
212,-25,25
213,-17,17
214,-13,13
215,-21,21
216,-29,29
217,-18,18
218,-24,24
219,-15,15
220,-27,27
221,-25,25
222,-21,21
223,-19,19
224,-17,17
225,-18,18
226,-13,13
227,-22,22
228,-14,14
229,-13,13
230,-29,29
231,-23,23
232,-15,15
233,-15,15
234,-14,14
235,-28,28
236,-25,25
237,-17,17
238,-23,23
239,-29,29
240,-15,15
241,-14,14
242,-15,15
243,-23,23
244,-15,15
245,-16,16
246,-19,19
247,-13,13
248,-16,16
249,-17,17
250,-25,25
251,-30,30
252,-13,13
253,-14,14
254,-15,15
255,-22,22
256,-14,14
257,-17,17
258,-126,27
259,-15,15
260,-21,21
261,-16,16
262,-23,23
263,-14,14
264,-13,13
265,-13,13
266,-19,19
267,-13,13
268,-19,19
269,-17,17
270,-17,17
271,-13,13
272,-19,19
273,-13,13
274,-13,13
275,-16,16
276,-22,22
277,-14,14
278,-15,15
279,-19,19
280,-34,34
281,-13,13
282,-15,15
283,-32,32
284,-13,13
285,-13,13
286,-13,13
287,-14,14
288,-16,16
289,-13,13
290,-13,13
291,-17,17
292,-13,13
293,-13,13
294,-22,22
295,-14,14
296,-15,15
297,-13,13
298,-13,13
299,-13,13
300,-16,16
301,-13,13
302,-14,14
303,-13,13
304,-13,13
305,-13,13
306,-24,24
307,-13,13
308,-13,13
309,-15,15
310,-13,13
311,-13,13
312,-13,13
313,-15,15
314,-13,13
315,-19,19
316,-15,15
317,-17,17
318,-13,13
319,-13,13
320,-13,13
321,-13,13
322,-13,13
323,-15,15
324,-13,13
325,-13,13
326,-13,13
327,-123,24
328,-13,13
329,-13,13
330,-13,13
331,-13,13
332,-13,13
333,-13,13
334,-13,13
335,-13,13
336,-16,16
337,-13,13
338,-23,23
339,-13,13
340,-13,13
341,-13,13
342,-13,13
343,-13,13
344,-13,13
345,-13,13
346,-13,13
347,-13,13
348,-13,13
349,-13,13
350,-134,35
351,-13,13
352,-13,13
353,-13,13
354,-13,13
355,-13,13
356,-13,13
357,-13,13
358,-13,13
359,-13,13
360,-15,15
361,-13,13
362,-13,13
363,-13,13
364,-13,13
365,-13,13
366,-13,13
367,-13,13
368,-13,13
369,-14,14
370,-13,13
371,-13,13
372,-13,13
373,-13,13
374,-13,13
375,-13,13
376,-13,13
377,-124,25
378,-13,13
379,-13,13
380,-13,13
381,-13,13
382,-13,13
383,-13,13
384,-13,13
385,-13,13
386,-13,13
387,-13,13
388,-13,13
389,-121,22
390,-13,13
391,-13,13
392,-13,13
393,-13,13
394,-13,13
395,-13,13
396,-13,13
397,-13,13
398,-17,17
399,-13,13


@@ -0,0 +1 @@
{"algo_name": "Q-learning", "env_name": "Racetrack-v0", "train_eps": 400, "test_eps": 20, "gamma": 0.9, "epsilon_start": 0.95, "epsilon_end": 0.01, "epsilon_decay": 300, "lr": 0.1, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/results/", "model_path": "C:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\QLearning/outputs/Racetrack-v0/20220826-224626/models/", "n_states": 4, "n_actions": 9}


@@ -0,0 +1,21 @@
episodes,rewards,steps
0,-1000,1000
1,2,8
2,4,6
3,3,7
4,2,8
5,3,7
6,4,6
7,-1000,1000
8,3,7
9,-11,11
10,-19,19
11,-18,18
12,1,9
13,1,9
14,4,6
15,-16,16
16,-17,17
17,4,6
18,-16,16
19,4,6


@@ -0,0 +1,401 @@
episodes,rewards,steps
0,-3580,1000
1,-2960,1000
2,-2670,1000
3,-2720,1000
4,-2670,1000
5,-2570,1000
6,-2407,977
7,-2012,852
8,-2500,1000
9,-2530,1000
10,-2550,1000
11,-437,187
12,-80,40
13,-2450,1000
14,-338,148
15,-1175,525
16,-755,325
17,-411,181
18,-1068,448
19,-785,325
20,-149,79
21,-628,268
22,-423,183
23,-282,122
24,-2198,938
25,-13,13
26,-253,113
27,-48,28
28,-72,42
29,-123,63
30,-305,145
31,-72,32
32,-142,72
33,-13,13
34,4,6
35,-1285,545
36,-174,94
37,-436,196
38,-759,339
39,-11,11
40,-17,17
41,-283,123
42,-181,81
43,-44,24
44,-55,35
45,-135,65
46,-577,277
47,-234,114
48,-54,34
49,4,6
50,-29,19
51,-100,50
52,-32,22
53,-23,23
54,4,6
55,-17,17
56,-18,18
57,-48,28
58,-34,24
59,-45,25
60,-29,19
61,1,9
62,-77,37
63,3,7
64,-25,15
65,-3,13
66,-78,48
67,-69,39
68,-105,45
69,-48,28
70,3,7
71,4,6
72,-100,50
73,-130,60
74,-20,20
75,4,6
76,4,6
77,4,6
78,4,6
79,-47,27
80,4,6
81,4,6
82,-174,94
83,-12,12
84,-26,16
85,3,7
86,3,7
87,-42,32
88,-48,28
89,-97,57
90,-11,11
91,-16,16
92,-15,15
93,4,6
94,-147,67
95,-52,32
96,-97,47
97,3,7
98,-17,17
99,3,7
100,4,6
101,3,7
102,3,7
103,3,7
104,1,9
105,4,6
106,4,6
107,3,7
108,4,6
109,-68,38
110,3,7
111,4,6
112,-14,14
113,4,6
114,-57,37
115,3,7
116,4,6
117,-12,12
118,3,7
119,3,7
120,-64,34
121,-13,13
122,3,7
123,-13,13
124,4,6
125,3,7
126,-32,22
127,-41,31
128,3,7
129,3,7
130,3,7
131,4,6
132,4,6
133,3,7
134,-12,12
135,-31,21
136,4,6
137,3,7
138,-51,31
139,-48,28
140,4,6
141,-85,45
142,-14,14
143,4,6
144,3,7
145,-6,16
146,4,6
147,4,6
148,-15,15
149,4,6
150,-24,24
151,3,7
152,-14,14
153,-18,18
154,3,7
155,4,6
156,-85,45
157,-51,31
158,3,7
159,2,8
160,3,7
161,-79,39
162,-14,14
163,-13,13
164,4,6
165,3,7
166,4,6
167,3,7
168,-74,34
169,-15,15
170,4,6
171,-14,14
172,4,6
173,-31,21
174,-8,18
175,4,6
176,4,6
177,4,6
178,4,6
179,-29,19
180,4,6
181,3,7
182,4,6
183,-82,42
184,3,7
185,4,6
186,4,6
187,-11,11
188,-23,23
189,-33,23
190,3,7
191,-12,12
192,-44,24
193,-62,42
194,-16,16
195,4,6
196,-12,12
197,3,7
198,-13,13
199,3,7
200,3,7
201,4,6
202,4,6
203,4,6
204,-28,18
205,-16,16
206,3,7
207,4,6
208,-12,12
209,-13,13
210,-66,36
211,-14,14
212,4,6
213,4,6
214,-15,15
215,-60,30
216,4,6
217,3,7
218,4,6
219,-33,23
220,-12,12
221,-14,14
222,4,6
223,3,7
224,-97,47
225,4,6
226,2,8
227,4,6
228,4,6
229,3,7
230,-11,11
231,4,6
232,3,7
233,3,7
234,4,6
235,3,7
236,3,7
237,-32,22
238,-13,13
239,3,7
240,-22,22
241,4,6
242,2,8
243,-31,21
244,4,6
245,-4,14
246,-30,20
247,4,6
248,3,7
249,-26,16
250,4,6
251,-12,12
252,2,8
253,1,9
254,4,6
255,2,8
256,2,8
257,-12,12
258,3,7
259,-48,28
260,4,6
261,4,6
262,-51,31
263,-12,12
264,4,6
265,2,8
266,2,8
267,2,8
268,3,7
269,4,6
270,4,6
271,-17,17
272,4,6
273,-13,13
274,-16,16
275,-97,57
276,3,7
277,-1,11
278,-32,22
279,3,7
280,4,6
281,3,7
282,3,7
283,3,7
284,3,7
285,2,8
286,3,7
287,-15,15
288,2,8
289,-18,18
290,4,6
291,-36,26
292,4,6
293,4,6
294,4,6
295,4,6
296,-77,47
297,-14,14
298,3,7
299,3,7
300,3,7
301,4,6
302,3,7
303,4,6
304,-12,12
305,-45,35
306,-63,43
307,2,8
308,4,6
309,4,6
310,-13,13
311,4,6
312,-13,13
313,4,6
314,3,7
315,-30,20
316,-13,13
317,3,7
318,4,6
319,4,6
320,-12,12
321,-13,13
322,3,7
323,3,7
324,3,7
325,3,7
326,-36,26
327,4,6
328,3,7
329,3,7
330,3,7
331,3,7
332,-14,14
333,-16,16
334,3,7
335,3,7
336,-14,14
337,1,9
338,2,8
339,3,7
340,4,6
341,-36,26
342,-14,14
343,-78,48
344,2,8
345,-37,27
346,3,7
347,3,7
348,-37,27
349,-16,16
350,4,6
351,-15,15
352,4,6
353,2,8
354,-44,24
355,-13,13
356,-14,14
357,-17,17
358,-13,13
359,3,7
360,2,8
361,4,6
362,3,7
363,-5,15
364,-14,14
365,2,8
366,-12,12
367,3,7
368,4,6
369,2,8
370,2,8
371,1,9
372,-16,16
373,1,9
374,4,6
375,-16,16
376,3,7
377,2,8
378,-13,13
379,-44,34
380,-16,16
381,-30,20
382,4,6
383,4,6
384,2,8
385,-15,15
386,4,6
387,3,7
388,2,8
389,4,6
390,2,8
391,3,7
392,3,7
393,-14,14
394,-15,15
395,3,7
396,-13,13
397,3,7
398,4,6
399,3,7


@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-11 17:59:16 Date: 2021-03-11 17:59:16
LastEditor: John LastEditor: John
LastEditTime: 2022-08-25 14:26:36 LastEditTime: 2022-08-26 23:03:39
Discription: Discription:
Environment: Environment:
''' '''
@@ -20,117 +20,105 @@ import argparse
from envs.register import register_env from envs.register import register_env
from envs.wrappers import CliffWalkingWapper from envs.wrappers import CliffWalkingWapper
from Sarsa.sarsa import Sarsa from Sarsa.sarsa import Sarsa
from common.utils import save_results,make_dir,plot_rewards,save_args,all_seed from common.utils import all_seed
from common.launcher import Launcher
def get_args(): class Main(Launcher):
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time def get_args(self):
parser = argparse.ArgumentParser(description="hyperparameters") curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # obtain current time
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm") parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--env_name',default='Racetrack-v0',type=str,help="name of environment") parser.add_argument('--algo_name',default = 'Sarsa',type=str,help="name of algorithm")
parser.add_argument('--train_eps',default=300,type=int,help="episodes of training") parser.add_argument('--env_name',default = 'Racetrack-v0',type=str,help="name of environment")
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") parser.add_argument('--train_eps',default = 300,type=int,help="episodes of training")
parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor") parser.add_argument('--test_eps',default = 20,type=int,help="episodes of testing")
parser.add_argument('--epsilon_start',default=0.90,type=float,help="initial value of epsilon") parser.add_argument('--ep_max_steps',default = 100000,type=int,help="steps per episode, much larger value can simulate infinite steps")
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") parser.add_argument('--gamma',default=0.99,type=float,help="discounted factor")
parser.add_argument('--epsilon_decay',default=200,type=int,help="decay rate of epsilon") parser.add_argument('--epsilon_start',default=0.90,type=float,help="initial value of epsilon")
parser.add_argument('--lr',default=0.2,type=float,help="learning rate") parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda") parser.add_argument('--epsilon_decay',default=200,type=int,help="decay rate of epsilon")
parser.add_argument('--seed',default=10,type=int,help="seed") parser.add_argument('--lr',default=0.2,type=float,help="learning rate")
parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not") parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not") parser.add_argument('--seed',default=10,type=int,help="seed")
args = parser.parse_args() parser.add_argument('--show_fig',default=False,type=bool,help="if show figure or not")
default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/", parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/", args = parser.parse_args()
} default_args = {'result_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
args = {**vars(args),**default_args} # type(dict) 'model_path':f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
return args }
args = {**vars(args),**default_args} # type(dict)
return args
def env_agent_config(cfg): def env_agent_config(self,cfg):
register_env(cfg['env_name']) register_env(cfg['env_name'])
env = gym.make(cfg['env_name']) env = gym.make(cfg['env_name'])
if cfg['seed'] !=0: # set random seed if cfg['seed'] !=0: # set random seed
all_seed(env,seed= cfg['seed']) all_seed(env,seed= cfg['seed'])
if cfg['env_name'] == 'CliffWalking-v0': if cfg['env_name'] == 'CliffWalking-v0':
env = CliffWalkingWapper(env) env = CliffWalkingWapper(env)
try: # state dimension try: # state dimension
n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n')) n_states = env.observation_space.n # print(hasattr(env.observation_space, 'n'))
except AttributeError: except AttributeError:
n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape')) n_states = env.observation_space.shape[0] # print(hasattr(env.observation_space, 'shape'))
n_actions = env.action_space.n # action dimension n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}") print(f"n_states: {n_states}, n_actions: {n_actions}")
cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters cfg.update({"n_states":n_states,"n_actions":n_actions}) # update to cfg paramters
agent = Sarsa(cfg) agent = Sarsa(cfg)
return env,agent return env,agent
def train(cfg,env,agent): def train(self,cfg,env,agent):
print("Start training!") print("Start training!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes steps = [] # record steps for all episodes
for i_ep in range(cfg['train_eps']): for i_ep in range(cfg['train_eps']):
ep_reward = 0 # reward per episode ep_reward = 0 # reward per episode
ep_step = 0 # step per episode ep_step = 0 # step per episode
state = env.reset() # reset and obtain initial state state = env.reset() # reset and obtain initial state
action = agent.sample_action(state) action = agent.sample_action(state)
while True: # while True:
# for _ in range(cfg.ep_max_steps): for _ in range(cfg['ep_max_steps']):
next_state, reward, done, _ = env.step(action) # update env and return transitions next_state, reward, done, _ = env.step(action) # update env and return transitions
next_action = agent.sample_action(next_state) next_action = agent.sample_action(next_state)
agent.update(state, action, reward, next_state, next_action,done) # update agent agent.update(state, action, reward, next_state, next_action,done) # update agent
state = next_state # update state state = next_state # update state
action = next_action action = next_action
ep_reward += reward ep_reward += reward
ep_step += 1 ep_step += 1
if done: if done:
break break
rewards.append(ep_reward) rewards.append(ep_reward)
steps.append(ep_step) steps.append(ep_step)
if (i_ep+1)%10==0: if (i_ep+1)%10==0:
print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}, Epsilon: {agent.epsilon:.3f}') print(f'Episode: {i_ep+1}/{cfg["train_eps"]}, Reward: {ep_reward:.2f}, Steps: {ep_step}, Epsilon: {agent.epsilon:.3f}')
print("Finish training!") print("Finish training!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
def test(cfg,env,agent): def test(self,cfg,env,agent):
print("Start testing!") print("Start testing!")
print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}") print(f"Env: {cfg['env_name']}, Algorithm: {cfg['algo_name']}, Device: {cfg['device']}")
rewards = [] # record rewards for all episodes rewards = [] # record rewards for all episodes
steps = [] # record steps for all episodes steps = [] # record steps for all episodes
for i_ep in range(cfg['test_eps']): for i_ep in range(cfg['test_eps']):
ep_reward = 0 # reward per episode ep_reward = 0 # reward per episode
ep_step = 0 ep_step = 0
while True: state = env.reset() # reset and obtain initial state
# for _ in range(cfg.ep_max_steps): for _ in range(cfg['ep_max_steps']):
action = agent.predict_action(state) action = agent.predict_action(state)
next_state, reward, done = env.step(action) next_state, reward, done, _ = env.step(action)
state = next_state state = next_state
ep_reward+=reward ep_reward+=reward
ep_step+=1 ep_step+=1
if done: if done:
break break
rewards.append(ep_reward) rewards.append(ep_reward)
steps.append(ep_step) steps.append(ep_step)
print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}") print(f"Episode: {i_ep+1}/{cfg['test_eps']}, Steps: {ep_step}, Reward: {ep_reward:.2f}")
print("Finish testing!") print("Finish testing!")
return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps} return {'episodes':range(len(rewards)),'rewards':rewards,'steps':steps}
if __name__ == "__main__": if __name__ == "__main__":
cfg = get_args() main = Main()
# training    main.run()
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# testing
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results
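Note: the refactor above replaces the old module-level __main__ block with main = Main() followed by main.run(), and turns get_args/env_agent_config/train/test into methods. The shared Launcher base class from common/launcher.py is only partially visible later in this diff, so the following sketch of what run() roughly does is an assumption for illustration, not the repository's exact code:

from common.utils import save_results, plot_rewards  # assumed helpers from this repo

class Launcher:  # assumed base class (a hunk of the real one appears further below)
    def run(self):
        cfg = self.get_args()                     # parse hyperparameters and output paths
        env, agent = self.env_agent_config(cfg)   # build environment and agent
        res_dic = self.train(cfg, env, agent)     # training loop supplied by the subclass
        save_results(res_dic, tag='train', path=cfg['result_path'])
        plot_rewards(res_dic['rewards'], cfg, path=cfg['result_path'], tag='train')
        agent.load_model(path=cfg['model_path'])  # reload the saved model before evaluation
        res_dic = self.test(cfg, env, agent)      # evaluation loop supplied by the subclass
        save_results(res_dic, tag='test', path=cfg['result_path'])

class Main(Launcher):
    ...  # get_args, env_agent_config, train and test are defined as in the new code above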

View File

@@ -1 +0,0 @@
{"algo_name": "Sarsa", "env_name": "CliffWalking-v0", "train_eps": 300, "test_eps": 20, "ep_max_steps": 200, "gamma": 0.99, "epsilon_start": 0.9, "epsilon_end": 0.01, "epsilon_decay": 200, "lr": 0.2, "device": "cpu", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220803-142740/models/", "save_fig": true}

Binary image file removed (34 KiB), not shown.
Binary image file removed (54 KiB), not shown.

View File

@@ -1,15 +0,0 @@
{
"algo_name": "Sarsa",
"env_name": "CliffWalking-v0",
"train_eps": 400,
"test_eps": 20,
"gamma": 0.9,
"epsilon_start": 0.95,
"epsilon_end": 0.01,
"epsilon_decay": 300,
"lr": 0.1,
"device": "cpu",
"result_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/results/",
"model_path": "c:\\Users\\24438\\Desktop\\rl-tutorials\\codes\\Sarsa/outputs/CliffWalking-v0/20220804-223029/models/",
"save_fig": true
}

Binary image file removed (34 KiB), not shown.

View File

@@ -1,5 +1,5 @@
{ {
"algo_name": "Q-learning", "algo_name": "Sarsa",
"env_name": "CliffWalking-v0", "env_name": "CliffWalking-v0",
"train_eps": 400, "train_eps": 400,
"test_eps": 20, "test_eps": 20,
@@ -12,8 +12,8 @@
"seed": 10, "seed": 10,
"show_fig": false, "show_fig": false,
"save_fig": true, "save_fig": true,
"result_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220824-103255/results/", "result_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220825-213316/results/",
"model_path": "/Users/jj/Desktop/rl-tutorials/codes/QLearning/outputs/CliffWalking-v0/20220824-103255/models/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/CliffWalking-v0/20220825-213316/models/",
"n_states": 48, "n_states": 48,
"n_actions": 4 "n_actions": 4
} }

View File

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,-15,15
1,-15,15
2,-15,15
3,-15,15
4,-15,15
5,-15,15
6,-15,15
7,-15,15
8,-15,15
9,-15,15
10,-15,15
11,-15,15
12,-15,15
13,-15,15
14,-15,15
15,-15,15
16,-15,15
17,-15,15
18,-15,15
19,-15,15

Binary image file added (33 KiB), not shown.

View File

@@ -0,0 +1,401 @@
episodes,rewards,steps
0,-649,154
1,-2822,842
2,-176,176
3,-139,139
4,-221,221
5,-51,51
6,-219,219
7,-247,148
8,-90,90
9,-145,145
10,-104,104
11,-162,162
12,-49,49
13,-129,129
14,-140,140
15,-19,19
16,-131,131
17,-115,115
18,-43,43
19,-133,133
20,-73,73
21,-89,89
22,-131,131
23,-61,61
24,-113,113
25,-119,119
26,-119,119
27,-71,71
28,-132,132
29,-47,47
30,-79,79
31,-57,57
32,-125,125
33,-77,77
34,-87,87
35,-49,49
36,-57,57
37,-81,81
38,-81,81
39,-97,97
40,-61,61
41,-85,85
42,-217,118
43,-39,39
44,-117,117
45,-41,41
46,-71,71
47,-105,105
48,-73,73
49,-68,68
50,-95,95
51,-41,41
52,-41,41
53,-67,67
54,-71,71
55,-65,65
56,-41,41
57,-61,61
58,-81,81
59,-21,21
60,-76,76
61,-80,80
62,-23,23
63,-53,53
64,-67,67
65,-33,33
66,-41,41
67,-59,59
68,-33,33
69,-64,64
70,-188,89
71,-47,47
72,-57,57
73,-45,45
74,-33,33
75,-79,79
76,-45,45
77,-23,23
78,-47,47
79,-57,57
80,-47,47
81,-45,45
82,-53,53
83,-29,29
84,-33,33
85,-69,69
86,-61,61
87,-35,35
88,-59,59
89,-43,43
90,-17,17
91,-39,39
92,-59,59
93,-29,29
94,-31,31
95,-55,55
96,-35,35
97,-45,45
98,-29,29
99,-59,59
100,-25,25
101,-29,29
102,-33,33
103,-39,39
104,-19,19
105,-47,47
106,-57,57
107,-19,19
108,-47,47
109,-25,25
110,-23,23
111,-53,53
112,-39,39
113,-34,34
114,-27,27
115,-27,27
116,-63,63
117,-33,33
118,-17,17
119,-21,21
120,-19,19
121,-49,49
122,-25,25
123,-39,39
124,-25,25
125,-167,68
126,-35,35
127,-29,29
128,-31,31
129,-44,44
130,-33,33
131,-23,23
132,-37,37
133,-134,35
134,-31,31
135,-19,19
136,-29,29
137,-37,37
138,-25,25
139,-39,39
140,-47,47
141,-29,29
142,-27,27
143,-21,21
144,-41,41
145,-29,29
146,-25,25
147,-25,25
148,-21,21
149,-29,29
150,-39,39
151,-35,35
152,-35,35
153,-32,32
154,-31,31
155,-19,19
156,-21,21
157,-35,35
158,-33,33
159,-37,37
160,-25,25
161,-41,41
162,-25,25
163,-23,23
164,-27,27
165,-25,25
166,-39,39
167,-28,28
168,-24,24
169,-23,23
170,-41,41
171,-17,17
172,-35,35
173,-23,23
174,-29,29
175,-17,17
176,-39,39
177,-33,33
178,-29,29
179,-24,24
180,-23,23
181,-19,19
182,-15,15
183,-23,23
184,-39,39
185,-25,25
186,-35,35
187,-33,33
188,-19,19
189,-35,35
190,-21,21
191,-131,32
192,-15,15
193,-23,23
194,-21,21
195,-17,17
196,-23,23
197,-31,31
198,-21,21
199,-31,31
200,-35,35
201,-27,27
202,-19,19
203,-21,21
204,-23,23
205,-23,23
206,-21,21
207,-31,31
208,-25,25
209,-23,23
210,-17,17
211,-19,19
212,-25,25
213,-23,23
214,-19,19
215,-19,19
216,-25,25
217,-25,25
218,-25,25
219,-25,25
220,-23,23
221,-19,19
222,-19,19
223,-149,50
224,-41,41
225,-19,19
226,-29,29
227,-37,37
228,-17,17
229,-17,17
230,-19,19
231,-27,27
232,-19,19
233,-33,33
234,-23,23
235,-23,23
236,-34,34
237,-15,15
238,-33,33
239,-29,29
240,-17,17
241,-23,23
242,-17,17
243,-19,19
244,-21,21
245,-23,23
246,-17,17
247,-15,15
248,-39,39
249,-21,21
250,-23,23
251,-29,29
252,-15,15
253,-17,17
254,-29,29
255,-15,15
256,-21,21
257,-19,19
258,-19,19
259,-21,21
260,-17,17
261,-21,21
262,-27,27
263,-27,27
264,-21,21
265,-19,19
266,-17,17
267,-23,23
268,-19,19
269,-17,17
270,-19,19
271,-19,19
272,-17,17
273,-23,23
274,-17,17
275,-22,22
276,-31,31
277,-19,19
278,-17,17
279,-33,33
280,-19,19
281,-17,17
282,-31,31
283,-15,15
284,-15,15
285,-15,15
286,-29,29
287,-19,19
288,-17,17
289,-26,26
290,-17,17
291,-19,19
292,-15,15
293,-21,21
294,-21,21
295,-15,15
296,-19,19
297,-15,15
298,-17,17
299,-19,19
300,-17,17
301,-21,21
302,-17,17
303,-27,27
304,-17,17
305,-19,19
306,-15,15
307,-19,19
308,-33,33
309,-17,17
310,-20,20
311,-19,19
312,-17,17
313,-15,15
314,-23,23
315,-15,15
316,-15,15
317,-17,17
318,-25,25
319,-15,15
320,-17,17
321,-19,19
322,-17,17
323,-15,15
324,-23,23
325,-19,19
326,-17,17
327,-23,23
328,-15,15
329,-19,19
330,-15,15
331,-17,17
332,-19,19
333,-15,15
334,-17,17
335,-17,17
336,-19,19
337,-15,15
338,-19,19
339,-19,19
340,-17,17
341,-15,15
342,-21,21
343,-19,19
344,-17,17
345,-17,17
346,-15,15
347,-21,21
348,-20,20
349,-15,15
350,-15,15
351,-15,15
352,-19,19
353,-17,17
354,-15,15
355,-27,27
356,-15,15
357,-15,15
358,-23,23
359,-125,26
360,-132,33
361,-17,17
362,-15,15
363,-17,17
364,-23,23
365,-17,17
366,-15,15
367,-15,15
368,-17,17
369,-15,15
370,-17,17
371,-15,15
372,-15,15
373,-15,15
374,-15,15
375,-15,15
376,-15,15
377,-15,15
378,-15,15
379,-15,15
380,-17,17
381,-15,15
382,-15,15
383,-19,19
384,-15,15
385,-17,17
386,-27,27
387,-15,15
388,-21,21
389,-125,26
390,-15,15
391,-15,15
392,-15,15
393,-27,27
394,-15,15
395,-15,15
396,-17,17
397,-15,15
398,-15,15
399,-15,15

View File

@@ -0,0 +1 @@
{"algo_name": "Sarsa", "env_name": "Racetrack-v0", "train_eps": 300, "test_eps": 20, "gamma": 0.99, "epsilon_start": 0.9, "epsilon_end": 0.01, "epsilon_decay": 200, "lr": 0.2, "device": "cpu", "seed": 10, "show_fig": false, "save_fig": true, "result_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/Racetrack-v0/20220825-212738/results/", "model_path": "/Users/jj/Desktop/rl-tutorials/codes/Sarsa/outputs/Racetrack-v0/20220825-212738/models/", "n_states": 4, "n_actions": 9}

Binary image file added (39 KiB), not shown.

View File

@@ -0,0 +1,21 @@
episodes,rewards,steps
0,4,6
1,4,6
2,-1010,1000
3,-14,14
4,4,6
5,4,6
6,4,6
7,-1060,1000
8,2,8
9,-12,12
10,3,7
11,-15,15
12,3,7
13,4,6
14,-14,14
15,3,7
16,-18,18
17,4,6
18,4,6
19,-1020,1000

Binary image file added (41 KiB), not shown.

View File

@@ -0,0 +1,301 @@
episodes,rewards,steps
0,-3460,1000
1,-2800,1000
2,-2910,1000
3,-2620,1000
4,-2620,1000
5,-2590,1000
6,-2390,1000
7,-2510,1000
8,-2470,1000
9,-611,251
10,-891,371
11,-265,125
12,-2281,911
13,-1203,523
14,-616,266
15,-213,113
16,-633,273
17,-1112,482
18,-350,160
19,-852,342
20,-87,47
21,-11,11
22,-27,17
23,-117,57
24,-15,15
25,4,6
26,-27,17
27,-94,44
28,-184,84
29,-44,24
30,-150,80
31,-14,14
32,-219,89
33,-50,30
34,-111,61
35,-10,10
36,-28,18
37,-34,24
38,-12,12
39,-19,19
40,-136,66
41,-171,71
42,-51,31
43,4,6
44,-117,57
45,4,6
46,4,6
47,-127,67
48,-78,48
49,-311,131
50,-25,15
51,4,6
52,-49,29
53,-25,15
54,-78,48
55,-238,108
56,4,6
57,-17,17
58,-29,19
59,-218,98
60,4,6
61,-129,59
62,-344,144
63,-25,15
64,-15,15
65,-77,37
66,2,8
67,0,10
68,4,6
69,4,6
70,-242,102
71,3,7
72,4,6
73,-53,33
74,-14,14
75,4,6
76,4,6
77,-30,20
78,-12,12
79,2,8
80,-12,12
81,-150,70
82,-48,28
83,-102,52
84,4,6
85,-97,47
86,-10,10
87,-125,55
88,-28,18
89,-26,16
90,-107,57
91,4,6
92,-16,16
93,-84,44
94,-13,13
95,-43,23
96,-14,14
97,-12,12
98,-13,13
99,-2,12
100,-14,14
101,-47,27
102,4,6
103,4,6
104,-91,51
105,-65,35
106,4,6
107,-12,12
108,-14,14
109,-13,13
110,4,6
111,-41,31
112,-13,13
113,4,6
114,-4,14
115,-74,34
116,4,6
117,-60,30
118,4,6
119,-15,15
120,3,7
121,4,6
122,4,6
123,-19,19
124,4,6
125,-49,29
126,-13,13
127,-30,20
128,2,8
129,-21,21
130,-45,25
131,-32,22
132,-67,37
133,-46,26
134,0,10
135,-12,12
136,-9,9
137,-10,10
138,-14,14
139,4,6
140,-11,11
141,-12,12
142,2,8
143,-35,25
144,4,6
145,-73,43
146,4,6
147,-20,20
148,4,6
149,2,8
150,-29,19
151,-20,20
152,4,6
153,-28,18
154,4,6
155,4,6
156,4,6
157,4,6
158,-34,24
159,4,6
160,4,6
161,4,6
162,-25,15
163,4,6
164,3,7
165,-48,28
166,4,6
167,-58,38
168,-20,20
169,-9,9
170,3,7
171,4,6
172,3,7
173,-33,23
174,-50,30
175,-16,16
176,-32,22
177,-65,35
178,4,6
179,-13,13
180,-11,11
181,3,7
182,4,6
183,-16,16
184,-12,12
185,4,6
186,-48,28
187,-13,13
188,2,8
189,3,7
190,-27,17
191,3,7
192,4,6
193,4,6
194,4,6
195,4,6
196,4,6
197,-13,13
198,-14,14
199,4,6
200,4,6
201,-13,13
202,-33,23
203,4,6
204,-32,22
205,4,6
206,-48,28
207,4,6
208,4,6
209,3,7
210,4,6
211,-34,24
212,3,7
213,4,6
214,4,6
215,4,6
216,3,7
217,-12,12
218,3,7
219,-8,8
220,3,7
221,4,6
222,-46,26
223,-33,23
224,4,6
225,1,9
226,3,7
227,2,8
228,-34,24
229,4,6
230,4,6
231,4,6
232,4,6
233,-55,35
234,-37,27
235,4,6
236,-14,14
237,-65,35
238,4,6
239,-13,13
240,4,6
241,4,6
242,-13,13
243,-30,20
244,3,7
245,-13,13
246,4,6
247,4,6
248,-13,13
249,-32,22
250,4,6
251,-55,35
252,-12,12
253,3,7
254,3,7
255,3,7
256,4,6
257,2,8
258,-12,12
259,3,7
260,-10,10
261,-12,12
262,4,6
263,3,7
264,3,7
265,-16,16
266,3,7
267,-47,27
268,-13,13
269,4,6
270,3,7
271,-13,13
272,4,6
273,4,6
274,-17,17
275,4,6
276,3,7
277,3,7
278,4,6
279,-41,31
280,3,7
281,-47,27
282,-32,22
283,4,6
284,3,7
285,-17,17
286,3,7
287,3,7
288,3,7
289,-12,12
290,4,6
291,3,7
292,3,7
293,-24,14
294,3,7
295,4,6
296,3,7
297,3,7
298,3,7
299,-13,13

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-12 16:58:16 Date: 2021-03-12 16:58:16
LastEditor: John LastEditor: John
LastEditTime: 2022-08-25 00:23:22 LastEditTime: 2022-08-25 21:26:08
Discription: Discription:
Environment: Environment:
''' '''
@@ -30,7 +30,7 @@ class Sarsa(object):
self.sample_count += 1 self.sample_count += 1
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \ self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.sample_count / self.epsilon_decay) # The probability to select a random action; it is exponentially decayed math.exp(-1. * self.sample_count / self.epsilon_decay) # The probability to select a random action; it is exponentially decayed
best_action = np.argmax(self.Q_table[state]) best_action = np.argmax(self.Q_table[str(state)]) # an array is not hashable, thus convert it to str
action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
action_probs[best_action] += (1.0 - self.epsilon) action_probs[best_action] += (1.0 - self.epsilon)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs) action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
@@ -38,27 +38,27 @@ class Sarsa(object):
def predict_action(self,state): def predict_action(self,state):
''' predict action while testing ''' predict action while testing
''' '''
action = np.argmax(self.Q_table[state]) action = np.argmax(self.Q_table[str(state)])
return action return action
def update(self, state, action, reward, next_state, next_action,done): def update(self, state, action, reward, next_state, next_action,done):
Q_predict = self.Q_table[state][action] Q_predict = self.Q_table[str(state)][action]
if done: if done:
Q_target = reward # terminal state Q_target = reward # terminal state
else: else:
Q_target = reward + self.gamma * self.Q_table[next_state][next_action] # the only difference from Q learning Q_target = reward + self.gamma * self.Q_table[str(next_state)][next_action] # the only difference from Q learning
self.Q_table[state][action] += self.lr * (Q_target - Q_predict) self.Q_table[str(state)][action] += self.lr * (Q_target - Q_predict)
def save_model(self,path): def save_model(self,path):
import dill import dill
from pathlib import Path from pathlib import Path
# create path # create path
Path(path).mkdir(parents=True, exist_ok=True) Path(path).mkdir(parents=True, exist_ok=True)
torch.save( torch.save(
obj=self.Q_table_table, obj=self.Q_table,
f=path+"checkpoint.pkl", f=path+"checkpoint.pkl",
pickle_module=dill pickle_module=dill
) )
print("Model saved!") print("Model saved!")
def load_model(self, path): def load_model(self, path):
import dill import dill
self.Q_table_table =torch.load(f=path+'checkpoint.pkl',pickle_module=dill) self.Q_table=torch.load(f=path+'checkpoint.pkl',pickle_module=dill)
print("Mode loaded!") print("Mode loaded!")

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
'''
Author: John
Email: johnjim0816@gmail.com
Date: 2020-09-11 23:03:00
LastEditor: John
LastEditTime: 2022-08-04 22:44:00
Discription:
Environment:
'''
import sys
import os
curr_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the current file
parent_path = os.path.dirname(curr_path) # parent path
sys.path.append(parent_path) # add the path to the system path
import gym
import torch
import datetime
import argparse
from envs.gridworld_env import CliffWalkingWapper
from Sarsa.sarsa import Sarsa
from common.utils import plot_rewards,save_args
from common.utils import save_results,make_dir
def get_args():
"""
"""
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
parser = argparse.ArgumentParser(description="hyperparameters")
parser.add_argument('--algo_name',default='Sarsa',type=str,help="name of algorithm")
parser.add_argument('--env_name',default='CliffWalking-v0',type=str,help="name of environment")
parser.add_argument('--train_eps',default=400,type=int,help="episodes of training") # number of training episodes
parser.add_argument('--test_eps',default=20,type=int,help="episodes of testing") # number of testing episodes
parser.add_argument('--gamma',default=0.90,type=float,help="discounted factor") # discount factor
parser.add_argument('--epsilon_start',default=0.95,type=float,help="initial value of epsilon") # initial epsilon of the e-greedy strategy
parser.add_argument('--epsilon_end',default=0.01,type=float,help="final value of epsilon") # final epsilon of the e-greedy strategy
parser.add_argument('--epsilon_decay',default=300,type=int,help="decay rate of epsilon") # decay rate of epsilon in the e-greedy strategy
parser.add_argument('--lr',default=0.1,type=float,help="learning rate")
parser.add_argument('--device',default='cpu',type=str,help="cpu or cuda")
parser.add_argument('--result_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/results/' )
parser.add_argument('--model_path',default=curr_path + "/outputs/" + parser.parse_args().env_name + \
'/' + curr_time + '/models/' ) # path to save models
parser.add_argument('--save_fig',default=True,type=bool,help="if save figure or not")
args = parser.parse_args([])
return args
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") # get current time
def train(cfg,env,agent):
print('Start training!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # record rewards
for i_ep in range(cfg.train_eps):
ep_reward = 0 # reward per episode
state = env.reset() # reset the environment, i.e. start a new episode
action = agent.sample(state)
while True:
action = agent.sample(state) # sample an action according to the algorithm
next_state, reward, done, _ = env.step(action) # interact with the environment once
next_action = agent.sample(next_state)
agent.update(state, action, reward, next_state, next_action,done) # update the algorithm
state = next_state # update state
action = next_action
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"回合:{i_ep+1}/{cfg.train_eps},奖励:{ep_reward:.1f}Epsilon{agent.epsilon}")
print('完成训练!')
return {"rewards":rewards}
def test(cfg,env,agent):
print('Start testing!')
print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
rewards = [] # record rewards for all episodes
for i_ep in range(cfg.test_eps):
ep_reward = 0 # reward per episode
state = env.reset() # reset the environment, i.e. start a new episode
while True:
action = agent.predict(state) # choose an action according to the algorithm
next_state, reward, done, _ = env.step(action) # interact with the environment once
state = next_state # update state
ep_reward += reward
if done:
break
rewards.append(ep_reward)
print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
print('Finish testing!')
return {"rewards":rewards}
def env_agent_config(cfg,seed=1):
'''create environment and agent
Args:
cfg ([type]): [description]
seed (int, optional): random seed. Defaults to 1.
Returns:
env [type]: environment
agent : agent
'''
env = gym.make(cfg.env_name)
env = CliffWalkingWapper(env)
env.seed(seed) # set random seed
n_states = env.observation_space.n # state dimension
n_actions = env.action_space.n # action dimension
print(f"n_states: {n_states}, n_actions: {n_actions}")
agent = Sarsa(n_actions,cfg)
return env,agent
if __name__ == "__main__":
cfg = get_args()
# training
env, agent = env_agent_config(cfg)
res_dic = train(cfg, env, agent)
make_dir(cfg.result_path, cfg.model_path)
save_args(cfg) # save parameters
agent.save(path=cfg.model_path) # save model
save_results(res_dic, tag='train',
path=cfg.result_path)
plot_rewards(res_dic['rewards'], cfg, tag="train")
# testing
env, agent = env_agent_config(cfg)
agent.load(path=cfg.model_path) # load model
res_dic = test(cfg, env, agent)
save_results(res_dic, tag='test',
path=cfg.result_path) # save results
plot_rewards(res_dic['rewards'], cfg, tag="test") # plot results

View File

@@ -24,7 +24,7 @@ class Launcher:
save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results save_results(res_dic, tag = 'train', path = cfg['result_path']) # save results
plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results plot_rewards(res_dic['rewards'], cfg, path = cfg['result_path'],tag = "train") # plot results
# testing # testing
env, agent = self.env_agent_config(cfg) # create new env for testing, sometimes can ignore this step # env, agent = self.env_agent_config(cfg) # create new env for testing, sometimes can ignore this step
agent.load_model(path = cfg['model_path']) # load model agent.load_model(path = cfg['model_path']) # load model
res_dic = self.test(cfg, env, agent) res_dic = self.test(cfg, env, agent)
save_results(res_dic, tag='test', save_results(res_dic, tag='test',

View File

@@ -5,7 +5,7 @@
@Email: johnjim0816@gmail.com @Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16 @Date: 2020-06-10 15:27:16
@LastEditor: John @LastEditor: John
LastEditTime: 2022-08-22 17:23:21 LastEditTime: 2022-08-28 23:44:06
@Discription: @Discription:
@Environment: python 3.7.7 @Environment: python 3.7.7
''' '''
@@ -39,12 +39,12 @@ class ReplayBufferQue:
def __init__(self, capacity: int) -> None: def __init__(self, capacity: int) -> None:
self.capacity = capacity self.capacity = capacity
self.buffer = deque(maxlen=self.capacity) self.buffer = deque(maxlen=self.capacity)
def push(self,trainsitions): def push(self,transitions):
'''_summary_ '''_summary_
Args: Args:
trainsitions (tuple): _description_ trainsitions (tuple): _description_
''' '''
self.buffer.append(trainsitions) self.buffer.append(transitions)
def sample(self, batch_size: int, sequential: bool = False): def sample(self, batch_size: int, sequential: bool = False):
if batch_size > len(self.buffer): if batch_size > len(self.buffer):
batch_size = len(self.buffer) batch_size = len(self.buffer)
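The rename above (trainsitions to transitions) only touches the argument name, so usage is unchanged. A toy usage sketch, assuming the deque-backed buffer shown here:

buffer = ReplayBufferQue(capacity=10000)
for step in range(100):
    # dummy transition just to illustrate the expected tuple layout (state, action, reward, next_state, done)
    buffer.push((step, 0, -1.0, step + 1, False))
batch = buffer.sample(batch_size=64)  # sample() caps batch_size at len(self.buffer)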

View File

@@ -5,7 +5,7 @@ Author: John
Email: johnjim0816@gmail.com Email: johnjim0816@gmail.com
Date: 2021-03-12 21:14:12 Date: 2021-03-12 21:14:12
LastEditor: John LastEditor: John
LastEditTime: 2021-09-15 13:21:03 LastEditTime: 2022-08-29 14:24:44
Discription: Discription:
Environment: Environment:
''' '''
@@ -31,40 +31,45 @@ class MLP(nn.Module):
x = F.relu(self.fc2(x)) x = F.relu(self.fc2(x))
return self.fc3(x) return self.fc3(x)
class ActorSoftmax(nn.Module):
def __init__(self, input_dim, output_dim, hidden_dim=256):
super(ActorSoftmax, self).__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, output_dim)
def forward(self,state):
dist = F.relu(self.fc1(state))
dist = F.softmax(self.fc2(dist),dim=1)
return dist
class Critic(nn.Module): class Critic(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): def __init__(self,input_dim,output_dim,hidden_dim=256):
super(Critic, self).__init__() super(Critic,self).__init__()
assert output_dim == 1 # critic must output a single value
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size) self.fc1 = nn.Linear(input_dim, hidden_dim)
self.linear2 = nn.Linear(hidden_size, hidden_size) self.fc2 = nn.Linear(hidden_dim, output_dim)
self.linear3 = nn.Linear(hidden_size, 1) def forward(self,state):
# 随机初始化为较小的值 value = F.relu(self.fc1(state))
self.linear3.weight.data.uniform_(-init_w, init_w) value = self.fc2(value)
self.linear3.bias.data.uniform_(-init_w, init_w) return value
def forward(self, state, action):
# 按维数1拼接
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class Actor(nn.Module): class ActorCriticSoftmax(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3): def __init__(self, input_dim, output_dim, actor_hidden_dim=256,critic_hidden_dim=256):
super(Actor, self).__init__() super(ActorCriticSoftmax, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size) self.critic_fc1 = nn.Linear(input_dim, critic_hidden_dim)
self.linear3 = nn.Linear(hidden_size, n_actions) self.critic_fc2 = nn.Linear(critic_hidden_dim, 1)
self.actor_fc1 = nn.Linear(input_dim, actor_hidden_dim)
self.actor_fc2 = nn.Linear(actor_hidden_dim, output_dim)
def forward(self, state):
# state = Variable(torch.from_numpy(state).float().unsqueeze(0))
value = F.relu(self.critic_fc1(state))
value = self.critic_fc2(value)
self.linear3.weight.data.uniform_(-init_w, init_w) policy_dist = F.relu(self.actor_fc1(state))
self.linear3.bias.data.uniform_(-init_w, init_w) policy_dist = F.softmax(self.actor_fc2(policy_dist), dim=1)
def forward(self, x): return value, policy_dist
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = torch.tanh(self.linear3(x))
return x
class ActorCritic(nn.Module): class ActorCritic(nn.Module):
def __init__(self, n_states, n_actions, hidden_dim=256): def __init__(self, n_states, n_actions, hidden_dim=256):

View File

@@ -5,7 +5,7 @@ def register_env(env_name):
if env_name == 'Racetrack-v0': if env_name == 'Racetrack-v0':
register( register(
id='Racetrack-v0', id='Racetrack-v0',
entry_point='racetrack:RacetrackEnv', entry_point='envs.racetrack:RacetrackEnv',
max_episode_steps=1000, max_episode_steps=1000,
kwargs={} kwargs={}
) )
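The entry_point fix above matters because gym resolves the string as "module.path:ClassName" when gym.make is called, relative to what is importable from the launch directory; since main.py runs from outside the envs package, the module has to be addressed as envs.racetrack. A minimal usage sketch with the same values:

from gym.envs.registration import register
import gym

register(id='Racetrack-v0', entry_point='envs.racetrack:RacetrackEnv', max_episode_steps=1000)
env = gym.make('Racetrack-v0')  # imports envs.racetrack and instantiates RacetrackEnv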

View File

@@ -0,0 +1,15 @@
# run A2C on CartPole-v0
# source conda; if you are already in a proper conda environment, comment out the lines up to "conda activate easyrl"
if [ -f "$HOME/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/anaconda3/etc/profile.d/conda.sh"
source ~/anaconda3/etc/profile.d/conda.sh
elif [ -f "$HOME/opt/anaconda3/etc/profile.d/conda.sh" ]; then
echo "source file at ~/opt/anaconda3/etc/profile.d/conda.sh"
source ~/opt/anaconda3/etc/profile.d/conda.sh
else
echo 'please manually config the conda source path'
fi
conda activate easyrl # easyrl here can be changed to another name of conda env that you have created
codes_dir=$(dirname $(dirname $(readlink -f "$0"))) # "codes" path
python $codes_dir/A2C/main.py
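With a conda environment already active, the script above effectively reduces to locating the repo's codes directory and launching the A2C entry point, e.g. (assuming the repository root as the working directory):

python codes/A2C/main.py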

Some files were not shown because too many files have changed in this diff.