add some codes

qiwang067
2020-07-20 23:56:20 +08:00
parent aae36f5bb8
commit f4ac39625a
41 changed files with 1799 additions and 7 deletions


@@ -31,6 +31,7 @@
## Main Contributors
- [@qiwang067](https://github.com/qiwang067)
- [@JohnJim0816](https://github.com/JohnJim0816)
## Follow Us

codes/Q-learning/agent.py (new file, 75 lines)

@@ -0,0 +1,75 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
class QLearningAgent(object):
def __init__(self,
obs_n,
act_n,
learning_rate=0.01,
gamma=0.9,
e_greed=0.1):
self.act_n = act_n # number of available actions
self.lr = learning_rate # learning rate
self.gamma = gamma # discount factor for the reward
self.epsilon = e_greed # probability of taking a random action
self.Q = np.zeros((obs_n, act_n))
# Sample an action for the given observation, with exploration
def sample(self, obs):
if np.random.uniform(0, 1) < (1.0 - self.epsilon): # choose the action according to the Q-table
action = self.predict(obs)
else:
action = np.random.choice(self.act_n) # with some probability, explore by picking a random action
return action
# Predict the action for the given observation (greedy, no exploration)
def predict(self, obs):
Q_list = self.Q[obs, :]
maxQ = np.max(Q_list)
action_list = np.where(Q_list == maxQ)[0] # maxQ may correspond to several actions
action = np.random.choice(action_list)
return action
# Learning method, i.e. how the Q-table is updated
def learn(self, obs, action, reward, next_obs, done):
""" off-policy
obs: observation before the interaction, s_t
action: action chosen in this interaction, a_t
reward: reward r obtained for this action
next_obs: observation after the interaction, s_t+1
done: whether the episode has ended
"""
predict_Q = self.Q[obs, action]
if done:
target_Q = reward # there is no next state
else:
target_Q = reward + self.gamma * np.max(
self.Q[next_obs, :]) # Q-learning
self.Q[obs, action] += self.lr * (target_Q - predict_Q) # correct Q
# Save the Q-table to a file
def save(self):
npy_file = './q_table.npy'
np.save(npy_file, self.Q)
print(npy_file + ' saved.')
# Load the Q-table from a file
def restore(self, npy_file='./q_table.npy'):
self.Q = np.load(npy_file)
print(npy_file + ' loaded.')


@@ -0,0 +1,195 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
env = FrozenLakeWapper(env)
return env
class FrozenLakeWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.max_y = env.desc.shape[0]
self.max_x = env.desc.shape[1]
self.t = None
self.unit = 50
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for _ in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for i in range(self.desc.shape[0]):
for j in range(self.desc.shape[1]):
x = j
y = self.max_y - 1 - i
if self.desc[i][j] == b'S': # Start
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'F': # Frozen ice
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'G': # Goal
self.draw_box(x, y, 'yellow')
elif self.desc[i][j] == b'H': # Hole
self.draw_box(x, y, 'black')
else:
self.draw_box(x, y, 'white')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
# Environment 1: FrozenLake, where you can configure whether the ice is slippery
# 0 left, 1 down, 2 right, 3 up
env = gym.make("FrozenLake-v0", is_slippery=False)
env = FrozenLakeWapper(env)
# Environment 2: CliffWalking, the cliff environment
# env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
# env = CliffWalkingWapper(env)
# Environment 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
# gridmap = [
# 'SFFF',
# 'FHFF',
# 'FFFF',
# 'HFGF' ]
# env = GridWorld(gridmap)
env.reset()
for step in range(10):
action = np.random.randint(0, 4)
obs, reward, done, info = env.step(action)
print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
step, action, obs, reward, done, info))
# env.render() # render one frame

codes/Q-learning/train.py (new file, 90 lines)

@@ -0,0 +1,90 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import QLearningAgent
import time
def run_episode(env, agent, render=False):
total_steps = 0 # count how many steps this episode takes
total_reward = 0
obs = env.reset() # reset the environment, i.e. start a new episode
while True:
action = agent.sample(obs) # choose an action according to the algorithm
next_obs, reward, done, _ = env.step(action) # one interaction with the environment
# train the Q-learning algorithm
agent.learn(obs, action, reward, next_obs, done) # the next action is not needed
obs = next_obs # store the new observation
total_reward += reward
total_steps += 1 # count the steps
if render:
env.render() # render a new frame
if done:
break
return total_reward, total_steps
def test_episode(env, agent):
total_reward = 0
obs = env.reset()
while True:
action = agent.predict(obs) # greedy
next_obs, reward, done, _ = env.step(action)
total_reward += reward
obs = next_obs
time.sleep(0.5)
env.render()
if done:
print('test reward = %.1f' % (total_reward))
break
def main():
# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)
agent = QLearningAgent(
obs_n=env.observation_space.n,
act_n=env.action_space.n,
learning_rate=0.1,
gamma=0.9,
e_greed=0.1)
is_render = False
for episode in range(500):
ep_reward, ep_steps = run_episode(env, agent, is_render)
print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
ep_reward))
# render every 20 episodes to check the result
if episode % 20 == 0:
is_render = True
else:
is_render = False
# training finished; evaluate the learned policy
test_episode(env, agent)
if __name__ == "__main__":
main()

codes/Sarsa/agent.py (new file, 74 lines)

@@ -0,0 +1,74 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
# Choose actions according to the Q-table
class SarsaAgent(object):
def __init__(self,
obs_n,
act_n,
learning_rate=0.01,
gamma=0.9,
e_greed=0.1):
self.act_n = act_n # number of available actions
self.lr = learning_rate # learning rate
self.gamma = gamma # discount factor for the reward
self.epsilon = e_greed # probability of taking a random action
self.Q = np.zeros((obs_n, act_n)) # initialize the Q-table
# Sample an action for the given observation, with exploration (epsilon-greedy; used during training)
def sample(self, obs):
if np.random.uniform(0, 1) < (1.0 - self.epsilon): # choose the action according to the Q-table
action = self.predict(obs)
else:
action = np.random.choice(self.act_n) # with some probability, explore by picking a random action
return action
# Predict the action for the given observation (pick the largest existing Q value; greedy, exploitation only, no exploration)
def predict(self, obs):
Q_list = self.Q[obs, :]
maxQ = np.max(Q_list) # find the maximum Q value
action_list = np.where(Q_list == maxQ)[0] # maxQ may correspond to several actions
action = np.random.choice(action_list) # randomly pick one of these actions
return action
# Learning method, i.e. how the Q-table is updated
def learn(self, obs, action, reward, next_obs, next_action, done):
""" on-policy
obs: observation before the interaction, s_t
action: action chosen in this interaction, a_t
reward: reward r obtained for this action
next_obs: observation after the interaction, s_t+1
next_action: action that will be chosen for next_obs according to the current Q-table, a_t+1
done: whether the episode has ended
"""
predict_Q = self.Q[obs, action]
if done: # done == True means this is the last state of the episode
target_Q = reward # there is no next state
else:
target_Q = reward + self.gamma * self.Q[next_obs,
next_action] # Sarsa
self.Q[obs, action] += self.lr * (target_Q - predict_Q) # correct Q
def save(self):
npy_file = './q_table.npy'
np.save(npy_file, self.Q)
print(npy_file + ' saved.')
def restore(self, npy_file='./q_table.npy'):
self.Q = np.load(npy_file)
print(npy_file + ' loaded.')

codes/Sarsa/gridworld.py (new file, 195 lines)

@@ -0,0 +1,195 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
if gridmap is None:
gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False)
env = FrozenLakeWapper(env)
return env
class FrozenLakeWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.max_y = env.desc.shape[0]
self.max_x = env.desc.shape[1]
self.t = None
self.unit = 50
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for _ in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for i in range(self.desc.shape[0]):
for j in range(self.desc.shape[1]):
x = j
y = self.max_y - 1 - i
if self.desc[i][j] == b'S': # Start
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'F': # Frozen ice
self.draw_box(x, y, 'white')
elif self.desc[i][j] == b'G': # Goal
self.draw_box(x, y, 'yellow')
elif self.desc[i][j] == b'H': # Hole
self.draw_box(x, y, 'black')
else:
self.draw_box(x, y, 'white')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
def __init__(self, env):
gym.Wrapper.__init__(self, env)
self.t = None
self.unit = 50
self.max_x = 12
self.max_y = 4
def draw_x_line(self, y, x0, x1, color='gray'):
assert x1 > x0
self.t.color(color)
self.t.setheading(0)
self.t.up()
self.t.goto(x0, y)
self.t.down()
self.t.forward(x1 - x0)
def draw_y_line(self, x, y0, y1, color='gray'):
assert y1 > y0
self.t.color(color)
self.t.setheading(90)
self.t.up()
self.t.goto(x, y0)
self.t.down()
self.t.forward(y1 - y0)
def draw_box(self, x, y, fillcolor='', line_color='gray'):
self.t.up()
self.t.goto(x * self.unit, y * self.unit)
self.t.color(line_color)
self.t.fillcolor(fillcolor)
self.t.setheading(90)
self.t.down()
self.t.begin_fill()
for i in range(4):
self.t.forward(self.unit)
self.t.right(90)
self.t.end_fill()
def move_player(self, x, y):
self.t.up()
self.t.setheading(90)
self.t.fillcolor('red')
self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)
def render(self):
if self.t == None:
self.t = turtle.Turtle()
self.wn = turtle.Screen()
self.wn.setup(self.unit * self.max_x + 100,
self.unit * self.max_y + 100)
self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
self.unit * self.max_y)
self.t.shape('circle')
self.t.width(2)
self.t.speed(0)
self.t.color('gray')
for _ in range(2):
self.t.forward(self.max_x * self.unit)
self.t.left(90)
self.t.forward(self.max_y * self.unit)
self.t.left(90)
for i in range(1, self.max_y):
self.draw_x_line(
y=i * self.unit, x0=0, x1=self.max_x * self.unit)
for i in range(1, self.max_x):
self.draw_y_line(
x=i * self.unit, y0=0, y1=self.max_y * self.unit)
for i in range(1, self.max_x - 1):
self.draw_box(i, 0, 'black')
self.draw_box(self.max_x - 1, 0, 'yellow')
self.t.shape('turtle')
x_pos = self.s % self.max_x
y_pos = self.max_y - 1 - int(self.s / self.max_x)
self.move_player(x_pos, y_pos)
if __name__ == '__main__':
# Environment 1: FrozenLake, where you can configure whether the ice is slippery
# 0 left, 1 down, 2 right, 3 up
env = gym.make("FrozenLake-v0", is_slippery=False)
env = FrozenLakeWapper(env)
# Environment 2: CliffWalking, the cliff environment
# env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
# env = CliffWalkingWapper(env)
# Environment 3: custom grid world with a configurable map; S = Start, F = Floor, H = Hole, G = Goal
# gridmap = [
# 'SFFF',
# 'FHFF',
# 'FFFF',
# 'HFGF' ]
# env = GridWorld(gridmap)
env.reset()
for step in range(10):
action = np.random.randint(0, 4)
obs, reward, done, info = env.step(action)
print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\
step, action, obs, reward, done, info))
# env.render() # render one frame

codes/Sarsa/train.py (new file, 92 lines)

@@ -0,0 +1,92 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import SarsaAgent
import time
def run_episode(env, agent, render=False):
total_steps = 0 # count how many steps this episode takes
total_reward = 0
obs = env.reset() # reset the environment, i.e. start a new episode
action = agent.sample(obs) # choose an action according to the algorithm
while True:
next_obs, reward, done, _ = env.step(action) # one interaction with the environment
next_action = agent.sample(next_obs) # choose the next action according to the algorithm
# train the Sarsa algorithm
agent.learn(obs, action, reward, next_obs, next_action, done)
action = next_action
obs = next_obs # store the new observation
total_reward += reward
total_steps += 1 # count the steps
if render:
env.render() # render a new frame
if done:
break
return total_reward, total_steps
def test_episode(env, agent):
total_reward = 0
obs = env.reset()
while True:
action = agent.predict(obs) # greedy: always take the best action
next_obs, reward, done, _ = env.step(action)
total_reward += reward
obs = next_obs
time.sleep(0.5) # pause 0.5 s per step so the behavior is visible
env.render()
if done:
print('test reward = %.1f' % (total_reward))
break
def main():
# env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up
# env = FrozenLakeWapper(env)
env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env) # optional; only makes the rendering nicer
agent = SarsaAgent(
obs_n=env.observation_space.n,
act_n=env.action_space.n,
learning_rate=0.1,
gamma=0.9,
e_greed=0.1)
is_render = False
for episode in range(500):
ep_reward, ep_steps = run_episode(env, agent, is_render)
print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
ep_reward))
# render every 20 episodes to check the result (rendering every episode would take too long)
if episode % 20 == 0:
is_render = True
else:
is_render = False
# training finished; evaluate the learned policy
test_episode(env, agent)
if __name__ == "__main__":
main()

codes/ddpg/ddpg.py (new file, 87 lines)

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-09 20:25:52
@LastEditor: John
@LastEditTime: 2020-06-14 11:43:17
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from model import Actor, Critic
from memory import ReplayBuffer
class DDPG:
def __init__(self, n_states, n_actions, hidden_dim=30, device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128):
self.device = device
self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(param.data)
for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
target_param.data.copy_(param.data)
self.critic_optimizer = optim.Adam(
self.critic.parameters(), lr=critic_lr)
self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
self.critic_criterion = nn.MSELoss()
self.memory = ReplayBuffer(memory_capacity)
self.batch_size = batch_size
self.soft_tau = soft_tau
self.gamma = gamma
def select_action(self, state):
return self.actor.select_action(state)
def update(self):
if len(self.memory) < self.batch_size:
return
state, action, reward, next_state, done = self.memory.sample(
self.batch_size)
# convert all variables to tensors
state = torch.FloatTensor(state).to(self.device)
next_state = torch.FloatTensor(next_state).to(self.device)
action = torch.FloatTensor(action).to(self.device)
reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)
policy_loss = self.critic(state, self.actor(state))
policy_loss = -policy_loss.mean()
next_action = self.target_actor(next_state)
target_value = self.target_critic(next_state, next_action.detach())
expected_value = reward + (1.0 - done) * self.gamma * target_value
expected_value = torch.clamp(expected_value, -np.inf, np.inf)
value = self.critic(state, action)
value_loss = self.critic_criterion(value, expected_value.detach())
self.actor_optimizer.zero_grad()
policy_loss.backward()
self.actor_optimizer.step()
self.critic_optimizer.zero_grad()
value_loss.backward()
self.critic_optimizer.step()
for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - self.soft_tau) +
param.data * self.soft_tau
)
for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
target_param.data.copy_(
target_param.data * (1.0 - self.soft_tau) +
param.data * self.soft_tau
)

codes/ddpg/env.py (new file, 31 lines)

@@ -0,0 +1,31 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:28:30
@LastEditor: John
@LastEditTime: 2020-06-12 22:49:18
@Discription:
@Environment: python 3.7.7
'''
import gym
import numpy as np
class NormalizedActions(gym.ActionWrapper):
def action(self, action):
low_bound = self.action_space.low
upper_bound = self.action_space.high
action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound)
action = np.clip(action, low_bound, upper_bound)
return action
def reverse_action(self, action):
low_bound = self.action_space.low
upper_bound = self.action_space.high
action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1
action = np.clip(action, low_bound, upper_bound)
return action

codes/ddpg/main.py (new file, 89 lines)

@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:21
@LastEditor: John
@LastEditTime: 2020-07-20 23:01:02
@Discription:
@Environment: python 3.7.7
'''
import torch
import gym
from ddpg import DDPG
from env import NormalizedActions
from noise import OUNoise
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.99, type=float) # discount factor gamma
parser.add_argument("--critic_lr", default=1e-3, type=float) # critic learning rate
parser.add_argument("--actor_lr", default=1e-4, type=float)
parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=200, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=4, type=int,help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
env = NormalizedActions(gym.make("Pendulum-v0"))
# add noise to the actions
ou_noise = OUNoise(env.action_space)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent=DDPG(n_states,n_actions,device="cpu", critic_lr=1e-3,
actor_lr=1e-4, gamma=0.99, soft_tau=1e-2, memory_capacity=100000, batch_size=128)
rewards = []
moving_average_rewards = []
for i_episode in range(1,cfg.max_episodes+1):
state=env.reset()
ou_noise.reset()
ep_reward = 0
for i_step in range(1,cfg.max_steps+1):
action = agent.select_action(state)
action = ou_noise.get_action(action, i_step) # the random process from the paper
next_state, reward, done, _ = env.step(action)
ep_reward += reward
agent.memory.push(state, action, reward, next_state, done)
agent.update()
state = next_state
if done:
break
print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),)
rewards.append(ep_reward)
#
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
print('Complete')
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
plot(rewards)
plot(moving_average_rewards,ylabel="moving_average_rewards")

codes/ddpg/memory.py (new file, 34 lines)

@@ -0,0 +1,34 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-13 00:29:45
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state_batch, action_batch, reward_batch, next_state_batch, done_batch = map(np.stack, zip(*batch))
return state_batch, action_batch, reward_batch, next_state_batch, done_batch
def __len__(self):
return len(self.buffer)

codes/ddpg/model.py (new file, 56 lines)

@@ -0,0 +1,56 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:03:59
@LastEditor: John
@LastEditTime: 2020-06-14 11:42:45
@Discription:
@Environment: python 3.7.7
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
class Critic(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Critic, self).__init__()
self.linear1 = nn.Linear(n_obs + n_actions, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, 1)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, state, action):
x = torch.cat([state, action], 1)
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)
return x
class Actor(nn.Module):
def __init__(self, n_obs, n_actions, hidden_size, init_w=3e-3):
super(Actor, self).__init__()
self.linear1 = nn.Linear(n_obs, hidden_size)
self.linear2 = nn.Linear(hidden_size, hidden_size)
self.linear3 = nn.Linear(hidden_size, n_actions)
self.linear3.weight.data.uniform_(-init_w, init_w)
self.linear3.bias.data.uniform_(-init_w, init_w)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = F.tanh(self.linear3(x))
return x
def select_action(self, state):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state = torch.FloatTensor(state).unsqueeze(0).to(device)
# print(state)
action = self.forward(state)
return action.detach().cpu().numpy()[0, 0]

codes/ddpg/noise.py (new file, 39 lines)

@@ -0,0 +1,39 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 20:58:59
@LastEditor: John
@LastEditTime: 2020-06-11 20:59:20
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
class OUNoise(object):
def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
self.mu = mu
self.theta = theta
self.sigma = max_sigma
self.max_sigma = max_sigma
self.min_sigma = min_sigma
self.decay_period = decay_period
self.n_actions = action_space.shape[0]
self.low = action_space.low
self.high = action_space.high
self.reset()
def reset(self):
self.obs = np.ones(self.n_actions) * self.mu
def evolve_obs(self):
x = self.obs
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
self.obs = x + dx
return self.obs
def get_action(self, action, t=0):
ou_obs = self.evolve_obs()
self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
return np.clip(action + ou_obs, self.low, self.high)

codes/ddpg/plot.py (new file, 47 lines)

@@ -0,0 +1,47 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-12 11:34:52
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()
import numpy as np
import os
# def plot(item,ylabel='rewards'):
# plt.figure()
# plt.plot(np.arange(len(item)), item)
# plt.title(ylabel+' of DDPG')
# plt.ylabel(ylabel)
# plt.xlabel('episodes')
# plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
# plt.show()
def plot(item,ylabel='rewards'):
# plot the passed-in series
df = pd.DataFrame(dict(time=np.arange(len(item)), value=item))
g = sns.relplot(x="time", y="value", kind="line", data=df)
g.set(xlabel='episodes', ylabel=ylabel)
# time = range(len(item))
# sns.set(style="darkgrid", font_scale=1.5)
# sns.lineplot(time=time, data=item, color="r", condition="behavior_cloning")
# # sns.tsplot(time=time, data=x2, color="b", condition="dagger")
# plt.ylabel("Reward")
# plt.xlabel("Iteration Number")
# plt.title("Imitation Learning")
plt.show()
if __name__ == "__main__":
output_path = os.path.dirname(__file__)+"/result/"
rewards=np.load(output_path+"rewards.npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards')

Two binary image files added (32 KiB and 47 KiB); contents not shown.

codes/dqn/dqn.py (new file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:50:49
@LastEditor: John
@LastEditTime: 2020-06-14 13:56:45
@Discription:
@Environment: python 3.7.7
'''
'''off-policy
'''
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import math
import numpy as np
from memory import ReplayBuffer
from model import FCN
class DQN:
def __init__(self, n_states, n_actions, gamma=0.99, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, policy_lr=0.01,batch_size=128, device="cpu"):
self.actions_count = 0
self.n_actions = n_actions
self.device = device
self.gamma = gamma
self.epsilon = 0
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.policy_net = FCN(n_states,n_actions).to(self.device)
self.target_net = FCN(n_states,n_actions).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # evaluation mode: disables BatchNormalization and Dropout
self.optimizer = optim.Adam(self.policy_net.parameters(),lr=policy_lr)
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
def select_action(self,state):
'''Select an action (epsilon-greedy)
Args:
state [array]: current state
Returns:
[int]: chosen action
'''
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if random.random() > self.epsilon:
with torch.no_grad():
state = torch.tensor([state],device=self.device,dtype=torch.float32) # convert to a tensor first so it can be fed to the network; the state elements are originally float64. Note that state=torch.tensor(state).unsqueeze(0) is equivalent to state=torch.tensor([state])
q_value = self.policy_net(state) # e.g. tensor([[-0.0798, -0.0079]], grad_fn=<AddmmBackward>)
action = q_value.max(1)[1].item()
else:
action = random.randrange(self.n_actions)
return action
def update(self):
if len(self.memory) < self.batch_size:
return
state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
state_batch = torch.tensor(state_batch,device=self.device,dtype=torch.float) # e.g. tensor([[-4.5543e-02, -2.3910e-01, 1.8344e-02, 2.3158e-01],...,[-1.8615e-02, -2.3921e-01, -1.1791e-02, 2.3400e-01]])
action_batch = torch.tensor(action_batch,device=self.device).unsqueeze(1) # e.g. tensor([[1],...,[0]])
reward_batch = torch.tensor(reward_batch,device=self.device,dtype=torch.float) # e.g. tensor([1., 1.,...,1])
next_state_batch = torch.tensor(next_state_batch,device=self.device,dtype=torch.float)
done_batch = torch.tensor(np.float32(done_batch),device=self.device).unsqueeze(1) # convert bool to float, then to a tensor
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
q_values = self.policy_net(state_batch).gather(1, action_batch) # calling the module is equivalent to self.forward
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = self.target_net(
next_state_batch).max(1)[0].detach() # tensor([ 0.0060, -0.0171,...,])
# Compute the expected Q values
expected_q_values = reward_batch + self.gamma * next_state_values * (1 - done_batch.squeeze(1)) # zero the bootstrap term for terminal transitions
# Compute Huber loss
# self.loss = nn.MSELoss(q_values, expected_q_values.unsqueeze(1))
self.loss = nn.MSELoss()(q_values,expected_q_values.unsqueeze(1))
# Optimize the model
self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise youd just accumulate the gradients from all loss.backward() calls).
self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters.

codes/dqn/main.py (new file, 98 lines)

@@ -0,0 +1,98 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:48:57
@LastEditor: John
@LastEditTime: 2020-07-20 23:02:16
@Discription:
@Environment: python 3.7.7
'''
'''Work in progress
'''
import gym
import torch
from dqn import DQN
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.99,
type=float) # discount factor gamma
parser.add_argument("--epsilon_start", default=0.95,
type=float) # epsilon for epsilon-greedy action selection
parser.add_argument("--epsilon_end", default=0.01, type=float)
parser.add_argument("--epsilon_decay", default=200, type=float)
parser.add_argument("--policy_lr", default=0.01, type=float)
parser.add_argument("--memory_capacity", default=1000,
type=int, help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=32, type=int,
help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=200, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=2, type=int,
help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make('CartPole-v0').unwrapped
env.seed(1)
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
agent = DQN(n_states=n_states, n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay,policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity, batch_size=cfg.batch_size)
rewards = []
moving_average_rewards = []
for i_episode in range(1, cfg.max_episodes+1):
# Initialize the environment and state
state = env.reset()
ep_reward = 0
for t in range(1, cfg.max_steps+1):
# Select and perform an action
action = agent.select_action(state)
next_state, reward, done, _ = env.step(action)
ep_reward += reward
# Store the transition in memory
agent.memory.push(state,action,reward,next_state,done)
# Move to the next state
state = next_state
# Perform one step of the optimization (on the target network)
agent.update()
if done:
break
# Update the target network, copying all weights and biases in DQN
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:', i_episode, ' Reward: %i' %
int(ep_reward), 'Explore: %.2f' % agent.epsilon)
rewards.append(ep_reward)
# compute the moving-average reward
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
print('Complete')
plot(rewards)
plot(moving_average_rewards, ylabel="moving_average_rewards")

codes/dqn/memory.py (new file, 35 lines)

@@ -0,0 +1,35 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-10 15:27:16
@LastEditor: John
@LastEditTime: 2020-06-14 11:36:24
@Discription:
@Environment: python 3.7.7
'''
import random
import numpy as np
class ReplayBuffer:
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
def push(self, state, action, reward, next_state, done):
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = (state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = zip(*batch)
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)

codes/dqn/model.py (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-12 00:47:02
@LastEditor: John
@LastEditTime: 2020-06-14 11:23:04
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class FCN(nn.Module):
def __init__(self, n_states=4, n_actions=18):
"""
Initialize a deep Q-learning network for testing algorithm
n_states: number of features of input.
n_actions: number of action-value to output, one-to-one correspondence to action in game.
"""
super(FCN, self).__init__()
self.fc1 = nn.Linear(n_states, 128)
self.fc2 = nn.Linear(128, 128)
self.fc3 = nn.Linear(128, n_actions)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return self.fc3(x)

codes/dqn/plot.py (new file, 30 lines)

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-14 11:38:42
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import numpy as np
import os
def plot(item,ylabel='rewards'):
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of DQN')
plt.ylabel(ylabel)
plt.xlabel('episodes')
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.show()
if __name__ == "__main__":
output_path = os.path.dirname(__file__)+"/result/"
rewards=np.load(output_path+"rewards.npy", )
moving_average_rewards=np.load(output_path+"moving_average_rewards.npy",)
plot(rewards)
plot(moving_average_rewards,ylabel='moving_average_rewards')

Two binary image files added (31 KiB and 49 KiB); contents not shown.

codes/dqn_cnn/dqn.py (new file, 107 lines)

@@ -0,0 +1,107 @@
import random
import math
import torch
import torch.optim as optim
import torch.nn.functional as F
from memory import ReplayBuffer
from model import CNN
class DQN:
def __init__(self, screen_height=0, screen_width=0, n_actions=0, gamma=0.999, epsilon_start=0.9, epsilon_end=0.05, epsilon_decay=200, memory_capacity=10000, batch_size=128, device="cpu"):
self.actions_count = 0
self.n_actions = n_actions
self.device = device
self.gamma = gamma
self.epsilon = 0
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.policy_net = CNN(screen_height, screen_width,
n_actions).to(self.device)
self.target_net = CNN(screen_height, screen_width,
n_actions).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval() # evaluation mode: disables BatchNormalization and Dropout
self.optimizer = optim.RMSprop(self.policy_net.parameters())
self.loss = 0
self.memory = ReplayBuffer(memory_capacity)
def select_action(self, state):
'''Select an action with epsilon-greedy exploration
Args:
state [torch tensor]: current screen-difference state
Returns:
action [torch tensor]: chosen action index of shape [1, 1]
'''
sample = random.random()
self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
math.exp(-1. * self.actions_count / self.epsilon_decay)
self.actions_count += 1
if sample > self.epsilon:
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
q_value = self.policy_net(state) # e.g. tensor([[-0.2522, 0.3887]])
action = q_value.max(1)[1].view(1, 1) # index of the largest q_value; note the action is a tensor, e.g. tensor([[1]])
return action
else:
return torch.tensor([[random.randrange(self.n_actions)]], device=self.device, dtype=torch.long)
def update(self):
if len(self.memory) < self.batch_size:
return
transitions = self.memory.sample(self.batch_size)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = self.memory.Transition(*zip(*transitions))
# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
batch.next_state)), device=self.device, dtype=torch.bool)
non_final_next_states = torch.cat([s for s in batch.next_state
if s is not None])
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward) # tensor([1., 1.,...,])
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
state_action_values = self.policy_net(
state_batch).gather(1, action_batch) #tensor([[ 1.1217],...,[ 0.8314]])
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(self.batch_size, device=self.device)
next_state_values[non_final_mask] = self.target_net(
non_final_next_states).max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * self.gamma) + reward_batch # tensor([0.9685, 0.9683,...,])
# Compute Huber loss
self.loss = F.smooth_l1_loss(
state_action_values, expected_state_action_values.unsqueeze(1)) # .unsqueeze adds a dimension
# Optimize the model
self.optimizer.zero_grad() # zero_grad clears old gradients from the last step (otherwise youd just accumulate the gradients from all loss.backward() calls).
self.loss.backward() # loss.backward() computes the derivative of the loss w.r.t. the parameters (or anything requiring gradients) using backpropagation.
for param in self.policy_net.parameters(): # clip gradients to prevent explosion
param.grad.data.clamp_(-1, 1)
self.optimizer.step() # causes the optimizer to take a step based on the gradients of the parameters.
if __name__ == "__main__":
dqn = DQN()

codes/dqn_cnn/main.py (new file, 115 lines)

@@ -0,0 +1,115 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:01:09
@LastEditor: John
@LastEditTime: 2020-06-13 00:24:31
@Discription:
@Environment: python 3.7.7
'''
'''
Probably has not converged, but the result is about the same as the official PyTorch tutorial
'''
import gym
import torch
from screen_state import get_screen
from dqn import DQN
from plot import plot
import argparse
def get_args():
'''Once the model is built, tune the hyperparameters here
'''
parser = argparse.ArgumentParser()
parser.add_argument("--gamma", default=0.999, type=float) # q-learning中的gamma
parser.add_argument("--epsilon_start", default=0.9, type=float) # 基于贪心选择action对应的参数epsilon
parser.add_argument("--epsilon_end", default=0.05, type=float)
parser.add_argument("--epsilon_decay", default=200, type=float)
parser.add_argument("--memory_capacity", default=10000, type=int,help="capacity of Replay Memory")
parser.add_argument("--batch_size", default=128, type=int,help="batch size of memory sampling")
parser.add_argument("--max_episodes", default=100, type=int)
parser.add_argument("--max_steps", default=200, type=int)
parser.add_argument("--target_update", default=4, type=int,help="when(every default 10 eisodes) to update target net ")
config = parser.parse_args()
return config
if __name__ == "__main__":
cfg = get_args()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen(env,device)
env = gym.make('CartPole-v0').unwrapped
env.reset()
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
# Get number of actions from gym action space
n_actions = env.action_space.n
agent = DQN(screen_height=screen_height, screen_width=screen_width,
n_actions=n_actions, device=device, gamma=cfg.gamma, epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay, memory_capacity=cfg.memory_capacity,batch_size=cfg.batch_size)
rewards = []
moving_average_rewards = []
for i_episode in range(1,cfg.max_episodes+1):
# Initialize the environment and state
env.reset()
last_screen = get_screen(env, device)
current_screen = get_screen(env, device)
state = current_screen - last_screen
ep_reward = 0
for t in range(1,cfg.max_steps+1):
# Select and perform an action
action = agent.select_action(state)
_, reward, done, _ = env.step(action.item())
ep_reward += reward
reward = torch.tensor([reward], device=device)
# Observe new state
last_screen = current_screen
current_screen = get_screen(env, device)
if done: break
next_state = current_screen - last_screen
# Store the transition in memory
agent.memory.push(state, action, next_state, reward)
# Move to the next state
state = next_state
# Perform one step of the optimization (on the target network)
agent.update()
# Update the target network, copying all weights and biases in DQN
if i_episode % cfg.target_update == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
print('Episode:', i_episode, ' Reward: %i' %int(ep_reward), 'Explore: %.2f' % agent.epsilon)
rewards.append(ep_reward)
if i_episode == 1:
moving_average_rewards.append(ep_reward)
else:
moving_average_rewards.append(
0.9*moving_average_rewards[-1]+0.1*ep_reward)
import os
import numpy as np
output_path = os.path.dirname(__file__)+"/result/"
if not os.path.exists(output_path):
os.mkdir(output_path)
np.save(output_path+"rewards.npy", rewards)
np.save(output_path+"moving_average_rewards.npy", moving_average_rewards)
print('Complete')
plot(rewards)
plot(moving_average_rewards,ylabel="moving_average_rewards")

codes/dqn_cnn/memory.py (new file, 37 lines)

@@ -0,0 +1,37 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 09:42:44
@LastEditor: John
@LastEditTime: 2020-06-11 15:50:33
@Discription:
@Environment: python 3.7.7
'''
from collections import namedtuple
import random
class ReplayBuffer(object):
def __init__(self, capacity):
self.capacity = capacity
self.buffer = []
self.position = 0
self.Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))
def push(self, *args):
"""Saves a transition."""
if len(self.buffer) < self.capacity:
self.buffer.append(None)
self.buffer[self.position] = self.Transition(*args)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
return random.sample(self.buffer, batch_size)
def __len__(self):
return len(self.buffer)

codes/dqn_cnn/model.py (new file, 41 lines)

@@ -0,0 +1,41 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 12:18:12
@LastEditor: John
@LastEditTime: 2020-06-11 17:23:45
@Discription:
@Environment: python 3.7.7
'''
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
def __init__(self, h, w, n_outputs):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
self.bn1 = nn.BatchNorm2d(16)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
self.bn2 = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
self.bn3 = nn.BatchNorm2d(32)
# Number of Linear input connections depends on output of conv2d layers
# and therefore the input image size, so compute it.
def conv2d_size_out(size, kernel_size = 5, stride = 2):
return (size - (kernel_size - 1) - 1) // stride + 1
convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
linear_input_size = convw * convh * 32
self.head = nn.Linear(linear_input_size, n_outputs)
# Called with either one element to determine next action, or a batch
# during optimization. Returns tensor([[left0exp,right0exp]...]).
def forward(self, x):
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
return self.head(x.view(x.size(0), -1))

codes/dqn_cnn/plot.py (new file, 24 lines)

@@ -0,0 +1,24 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 16:30:09
@LastEditor: John
@LastEditTime: 2020-06-11 22:27:24
@Discription:
@Environment: python 3.7.7
'''
import matplotlib.pyplot as plt
import numpy as np
import os
def plot(item,ylabel='rewards'):
plt.figure()
plt.plot(np.arange(len(item)), item)
plt.title(ylabel+' of CnnDQN')
plt.ylabel(ylabel)
plt.xlabel('episodes')
plt.savefig(os.path.dirname(__file__)+"/result/"+ylabel+".png")
plt.show()

Two binary image files added (34 KiB and 44 KiB); contents not shown.


@@ -0,0 +1,66 @@
#!/usr/bin/env python
# coding=utf-8
'''
@Author: John
@Email: johnjim0816@gmail.com
@Date: 2020-06-11 10:02:35
@LastEditor: John
@LastEditTime: 2020-06-11 16:57:34
@Discription:
@Environment: python 3.7.7
'''
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
resize = T.Compose([T.ToPILImage(),
T.Resize(40, interpolation=Image.CUBIC),
T.ToTensor()])
def get_cart_location(env,screen_width):
world_width = env.x_threshold * 2
scale = screen_width / world_width
return int(env.state[0] * scale + screen_width / 2.0) # MIDDLE OF CART
def get_screen(env,device):
# Returned screen requested by gym is 400x600x3, but is sometimes larger
# such as 800x1200x3. Transpose it into torch order (CHW).
screen = env.render(mode='rgb_array').transpose((2, 0, 1))
# Cart is in the lower half, so strip off the top and bottom of the screen
_, screen_height, screen_width = screen.shape
screen = screen[:, int(screen_height*0.4):int(screen_height * 0.8)]
view_width = int(screen_width * 0.6)
cart_location = get_cart_location(env,screen_width)
if cart_location < view_width // 2:
slice_range = slice(view_width)
elif cart_location > (screen_width - view_width // 2):
slice_range = slice(-view_width, None)
else:
slice_range = slice(cart_location - view_width // 2,
cart_location + view_width // 2)
# Strip off the edges, so that we have a square image centered on a cart
screen = screen[:, :, slice_range]
# Convert to float, rescale, convert to torch tensor
# (this doesn't require a copy)
screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
screen = torch.from_numpy(screen)
# Resize, and add a batch dimension (BCHW)
return resize(screen).unsqueeze(0).to(device)
if __name__ == "__main__":
import gym
env = gym.make('CartPole-v0').unwrapped
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env.reset()
import matplotlib.pyplot as plt
plt.figure()
plt.imshow(get_screen(env,device).cpu().squeeze(0).permute(1, 2, 0).numpy(),
interpolation='none')
plt.title('Example extracted screen')
plt.show()


@@ -29,6 +29,7 @@
## Main Contributors
- [@qiwang067](https://github.com/qiwang067)
- [@JohnJim0816](https://github.com/JohnJim0816)
## Follow Us


@@ -10,7 +10,7 @@
The three key elements of reinforcement learning are states, actions, and rewards. A reinforcement learning agent interacts with the environment step by step: I first observe the state and then output an action, observe the state again, output another action, and collect the rewards. It is a sequential decision-making problem over time.
For example, at time $t-1$ I see a bear waving at me, and my instinctive action is to run. Seeing someone run, the bear may decide it has found prey and start to attack. At time $t$, if I instead choose to play dead, the bear may bite me and toss me around a few times, find it boring, and walk away; at that point, if I run again, I may actually escape. That is the kind of sequential decision process involved.
Of course, before outputting each action you can actually choose among different actions. For example, if at time $t$ the bear has already caught up and I choose to run rather than play dead, there are two ways the situation can transition to different states: with some probability I escape successfully, and with a rather large probability I fail. We use the state transition probability $p\left[s_{t+1}, r_{t} \mid s_{t}, a_{t}\right]$ to express the probability of transitioning to $s_{t+1}$ and receiving $r_t$ when action $a_t$ is taken in state $s_t$.
@@ -103,7 +103,7 @@ $$
![](img/2.14.png)
Mathematically, this way of reinforcement can be expressed in a single formula. This kind of update is called `temporal difference (TD)`. The formula says that the Q value of the next step, $Q(S_{t+1},A_{t+1})$, can be used to update the Q value of the current step, $Q(S_t,A_t)$.
To understand this formula, as shown in the figure, we first treat $R_{t+1}+\gamma Q\left(S_{t+1}, A_{t+1}\right)$ as a target value, the value that $Q(S_t,A_t)$ is supposed to approach. What we want to compute is $Q(S_t,A_t)$. Since the Q values are initialized randomly (or to zero) at the start, they need to keep approaching the ideal, true Q value, which we call the target. The target is roughly the discounted sum of future returns.
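Written out as an update rule, the TD idea above takes the following form (a restatement for reference; $\alpha$ denotes the learning rate, a symbol the surrounding excerpt does not name explicitly):

$$
Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t) \right]
$$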
@@ -176,11 +176,14 @@ $$
![](img/2.19.png)
The update formulas of Sarsa and Q-learning are the same; they differ only in how the target is computed:
* Sarsa uses $R_{t+1}+\gamma Q(S_{t+1}, A_{t+1})$.
* Q-learning uses $R_{t+1}+\gamma \underset{a}{\max} Q\left(S_{t+1}, a\right)$.
Sarsa uses its own policy to generate the trajectory S, A, R, S', A' and then uses $Q(S_{t+1},A_{t+1})$ to update the original Q value $Q(S_t,A_t)$. Q-learning, however, does not need to know which action is actually selected next; it simply assumes the next action is the one with the largest Q value. Q-learning knows that the behavior policy may choose other actions with, say, 10% probability, but it is not worried about being affected by exploration: it optimizes the target policy as if acting optimally, so it can search for the optimal path more boldly and, in practice, behaves far more boldly than Sarsa.
When Q-learning is unrolled step by step, the only difference from Sarsa is that we do not need to know $A_2$ in advance in order to update $Q(S_1,A_1)$. In the flowchart for training one episode, Q-learning does not need the next action A' before calling learn; the tuple $(S,A,R,S')$ is enough. This is a clear difference from Sarsa. A [Python implementation of Q-learning](https://github.com/datawhalechina/leedeeprl-notes/tree/master/docs/code/Q-learning) is given here.
### Q-function Bellman Equation
@@ -195,7 +198,7 @@ $$
>The Bellman Equation expresses the iterative relationship between the current state and future states: the value function of the current state can be computed from the value function of the next state. The Bellman Equation is named after its proposer, Richard Bellman, the founder of dynamic programming, and is also called the "dynamic programming equation".
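In symbols, the statement above is commonly written for the state-value function as (a standard form added here for reference, not quoted from the chapter):

$$
V(s)=\mathbb{E}\left[R_{t+1}+\gamma V\left(S_{t+1}\right) \mid S_{t}=s\right]
$$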
From another perspective, computing the action value $Q^{\pi}(s_t,a_t)$ at time $t$ requires knowing the rewards at times $t$, $t+1$, $t+2 \cdots\cdots$; this means we would need to know all possible successor states of a given state together with their rewards, and perform a full-width backup to update the state's value. Such a method cannot be used when the state transition function is unknown or for large-scale problems. Therefore, Q-learning uses shallow temporal-difference sampling: when computing the cumulative reward, it predicts the next $n$ steps of actions under the current policy $\pi$ ($n$ can range from 1 to $+\infty$) and computes their reward.
Specifically, suppose that in state $s_t$ action $a_t$ is chosen and reward $r_t$ is obtained, and the state transitions to $s_{t+1}$; if action $a_{t+1}$ is then chosen in this state under the same policy, $Q^{\pi}(s_t,a_t)$ can be written as
$$


@@ -235,6 +235,8 @@ $$
The meaning of the advantage function is: suppose we take some action $a_t$ in some state $s_t$; how good is it compared with the other possible actions? What it cares about is not absolute goodness but `relative advantage`, i.e. how good the action is relatively. Because a baseline $b$ is subtracted, the quantity is relative rather than absolute. $A^{\theta}\left(s_{t}, a_{t}\right)$ can usually be estimated by a network, and this network is called the critic.
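One common choice of baseline is the state value, in which case the advantage can be written as (a standard identity added here for reference):

$$
A^{\theta}\left(s_{t}, a_{t}\right)=Q^{\theta}\left(s_{t}, a_{t}\right)-V^{\theta}\left(s_{t}\right)
$$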
## References
* [Intro to Reinforcement Learning (强化学习纲要)](https://github.com/zhoubolei/introRL)


@@ -168,7 +168,7 @@ What exactly is the KL divergence? Here I just treat the KL divergence as
![](img/4.9.png)
The PPO paper also has an `adaptive KL divergence`. The problem here is how large to set $\beta$. It is just like regularization: the regularization term is multiplied by a weight, so this KL divergence term is also multiplied by a weight, but what should $\beta$ be? Hence there is a method for adjusting $\beta$ dynamically. In this method, you first set a maximum KL divergence that you can accept. If you find that, after optimizing the objective, the KL divergence term is too large, it means the penalty term is not doing its job, so you increase $\beta$. You also set a minimum KL divergence. If, after optimizing the objective, the KL divergence is smaller than the minimum, it means the penalty term is too strong; you worry that the optimizer only minimizes that term, making $\theta$ identical to $\theta^k$, which is not what you want, so in that case you decrease $\beta$. In this way $\beta$ can be adjusted dynamically, which is called the adaptive KL penalty.
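A rough sketch of the adjustment rule described above (the threshold names `kl_min`/`kl_max` and the multiplicative factor of 2 are illustrative assumptions, not taken from the paper or the text):

```python
def adapt_beta(beta, kl, kl_min, kl_max, factor=2.0):
    # Strengthen the penalty when the measured KL exceeds the acceptable maximum,
    # weaken it when the KL falls below the acceptable minimum, otherwise keep it.
    if kl > kl_max:
        beta *= factor
    elif kl < kl_min:
        beta /= factor
    return beta
```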
![](img/4.10.png)