update codes
This commit is contained in:
@@ -90,15 +90,15 @@ class OUNoise(object):
|
||||
self.max_sigma = max_sigma
|
||||
self.min_sigma = min_sigma
|
||||
self.decay_period = decay_period
|
||||
self.action_dim = action_space.shape[0]
|
||||
self.n_actions = action_space.shape[0]
|
||||
self.low = action_space.low
|
||||
self.high = action_space.high
|
||||
self.reset()
|
||||
def reset(self):
|
||||
self.obs = np.ones(self.action_dim) * self.mu
|
||||
self.obs = np.ones(self.n_actions) * self.mu
|
||||
def evolve_obs(self):
|
||||
x = self.obs
|
||||
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
|
||||
dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
|
||||
self.obs = x + dx
|
||||
return self.obs
|
||||
def get_action(self, action, t=0):
|
||||
|
||||
@@ -14,10 +14,10 @@ CartPole-v0是一个经典的入门环境,如下图,它通过向左(动作=0
|
||||
import gym
|
||||
env = gym.make('CartPole-v0') # 建立环境
|
||||
env.seed(1) # 随机种子
|
||||
state_dim = env.observation_space.shape[0] # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
n_states = env.observation_space.shape[0] # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
state = env.reset() # 初始化环境
|
||||
print(f"状态数:{state_dim},动作数:{action_dim}")
|
||||
print(f"状态数:{n_states},动作数:{n_actions}")
|
||||
print(f"初始状态:{state}")
|
||||
```
|
||||
|
||||
@@ -157,7 +157,7 @@ def choose_action(self, state):
|
||||
q_values = self.policy_net(state)
|
||||
action = q_values.max(1)[1].item() # 选择Q值最大的动作
|
||||
else:
|
||||
action = random.randrange(self.action_dim)
|
||||
action = random.randrange(self.n_actions)
|
||||
```
|
||||
|
||||
可以看到这跟Q学习算法其实是一样的,都是用的$\epsilon$-greedy策略,只是使用神经网络的话我们需要通过PyTorch或者TensorFlow等工具来处理相应的数据。
|
||||
|
||||
@@ -30,9 +30,9 @@ env = CliffWalkingWapper(env) # 装饰环境
|
||||
这里我们在程序中使用了一个装饰器重新定义环境,但不影响对环境的理解,感兴趣的同学可以具体看相关代码。由于gym环境封装得比较好,所以我们想要使用这个环境只需要使用gym.make命令输入环境名即可,然后我们可以查看环境的状态和动作数目:
|
||||
|
||||
```python
|
||||
state_dim = env.observation_space.n # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
print(f"状态数:{state_dim},动作数:{action_dim}")
|
||||
n_states = env.observation_space.n # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
print(f"状态数:{n_states},动作数:{n_actions}")
|
||||
```
|
||||
|
||||
打印出来的结果如下:
|
||||
@@ -72,9 +72,9 @@ print(state)
|
||||
env = gym.make('CliffWalking-v0') # 定义环境
|
||||
env = CliffWalkingWapper(env) # 装饰环境
|
||||
env.seed(1) # 设置随机种子
|
||||
state_dim = env.observation_space.n # 状态数
|
||||
action_dim = env.action_space.n # 动作数
|
||||
agent = QLearning(state_dim,action_dim,cfg) # cfg存储算法相关参数
|
||||
n_states = env.observation_space.n # 状态数
|
||||
n_actions = env.action_space.n # 动作数
|
||||
agent = QLearning(n_states,n_actions,cfg) # cfg存储算法相关参数
|
||||
for i_ep in range(cfg.train_eps): # cfg.train_eps表示最大训练的回合数
|
||||
ep_reward = 0 # 记录每个回合的奖励
|
||||
state = env.reset() # 重置环境
|
||||
@@ -126,7 +126,7 @@ def choose_action(self, state):
|
||||
if np.random.uniform(0, 1) > self.epsilon:
|
||||
action = np.argmax(self.Q_table[str(state)]) # 选择Q(s,a)最大对应的动作
|
||||
else:
|
||||
action = np.random.choice(self.action_dim) # 随机选择动作
|
||||
action = np.random.choice(self.n_actions) # 随机选择动作
|
||||
return action
|
||||
```
|
||||
|
||||
|
||||
Reference in New Issue
Block a user