commit 889b89a859
parent e1e0d5a878
Author: johnjim0816
Date: 2021-06-02 10:36:20 +08:00


@@ -28,45 +28,32 @@
 ```python
 '''Initialize the environment'''
 env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
 env = CliffWalkingWapper(env)
 agent = QLearning(
-    obs_dim=env.observation_space.n,
+    state_dim=env.observation_space.n,
     action_dim=env.action_space.n,
     learning_rate=cfg.policy_lr,
     gamma=cfg.gamma,
-    epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay)
-render = False # whether to render the GUI
-rewards = [] # record the reward of every episode
-MA_rewards = [] # record the moving-average reward
-steps = [] # record the number of steps of every episode
-for i_episode in range(1, cfg.max_episodes+1):
-    ep_reward = 0 # reward accumulated within this episode
-    ep_steps = 0 # number of steps taken in this episode
-    obs = env.reset() # reset the environment, i.e. start a new episode
-    while True:
-        action = agent.sample(obs) # choose an action according to the algorithm
-        next_obs, reward, done, _ = env.step(action) # take one step in the environment
-        # train the Q-learning algorithm
-        agent.learn(obs, action, reward, next_obs, done) # the next action is not needed
-        obs = next_obs # move to the next observation
-        ep_reward += reward
-        ep_steps += 1 # count the steps
-        if render:
-            env.render() # render a new frame
-        if done:
-            break
-    steps.append(ep_steps)
-    rewards.append(ep_reward)
-    # compute the moving-average reward
-    if i_episode == 1:
-        MA_rewards.append(ep_reward)
-    else:
-        MA_rewards.append(
-            0.9*MA_rewards[-1] + 0.1*ep_reward)
-    print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (i_episode, ep_steps,
-          ep_reward, agent.epsilon))
-agent.save() # training finished, save the model
+rewards = []
+ma_rewards = [] # moving-average reward
+for i_ep in range(cfg.train_eps): # train_eps: maximum number of training episodes
+    ep_reward = 0 # reward accumulated within this episode
+    state = env.reset() # reset the environment, i.e. start a new episode
+    while True:
+        action = agent.choose_action(state) # choose an action according to the algorithm
+        next_state, reward, done, _ = env.step(action) # take one step in the environment
+        agent.update(state, action, reward, next_state, done) # Q-learning update
+        state = next_state # move to the next state
+        ep_reward += reward
+        if done:
+            break
+    rewards.append(ep_reward)
+    if ma_rewards:
+        ma_rewards.append(ma_rewards[-1]*0.9 + ep_reward*0.1)
+    else:
+        ma_rewards.append(ep_reward)
+    print("Episode:{}/{}: reward:{:.1f}".format(i_ep+1, cfg.train_eps, ep_reward))
 ```
 ## Task requirements
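Neither side of the diff includes the QLearning class itself. For context, below is a minimal sketch of a tabular Q-learning agent that matches the calls used above (`choose_action`, `update`); the constructor arguments, default values, and the exponential ε-decay schedule are assumptions for illustration, not the repository's actual implementation.

```python
import math
from collections import defaultdict

import numpy as np

class QLearning:
    '''Minimal tabular Q-learning agent with an ε-greedy policy (sketch).'''
    def __init__(self, state_dim, action_dim, learning_rate=0.1, gamma=0.9,
                 epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=300):
        self.action_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.sample_count = 0
        self.Q_table = defaultdict(lambda: np.zeros(action_dim))  # Q(s, a), lazily created per state

    def choose_action(self, state):
        '''ε-greedy: explore with probability ε, otherwise act greedily on Q(s, ·).'''
        self.sample_count += 1
        # ε decays exponentially from epsilon_start toward epsilon_end (assumed schedule)
        epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1.0 * self.sample_count / self.epsilon_decay)
        if np.random.uniform(0, 1) < epsilon:
            return np.random.choice(self.action_dim)   # explore: random action
        return int(np.argmax(self.Q_table[state]))     # exploit: greedy action

    def update(self, state, action, reward, next_state, done):
        '''One-step Q-learning (off-policy TD) update.'''
        q_predict = self.Q_table[state][action]
        # bootstrap with max_a' Q(s', a') unless the episode has terminated
        q_target = reward if done else reward + self.gamma * np.max(self.Q_table[next_state])
        self.Q_table[state][action] += self.lr * (q_target - q_predict)
```

The key line is the TD target `reward + gamma * max_a Q(next_state, a)`: bootstrapping with the maximum over next actions, rather than the action actually taken next, is what makes this Q-learning (off-policy) instead of Sarsa.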