在我開始之前,我知道有很多問題都有相同的錯誤,但沒有一個為我解決了這個問題。
我有一個 PPO 實作,用來在 OpenAI Gym(gym==0.26.0,tensorflow==2.10.0)的 CarRacing-v2 環境中進行訓練。為了讓它更快,我想把一部分程式碼移到一個單獨的函式中,並用 tf.function 包裝它;但光是把程式碼移到另一個函式就會產生錯誤。除了將部分程式碼移到不同的函式之外,程式碼沒有任何其他修改。
這是我之前可以正常運作的程式碼。
def learn(self):
    """Run ``self.n_epochs`` PPO update passes over the stored rollout.

    Each epoch re-reads the rollout from ``self.memory.generate_batches()``,
    recomputes the advantage estimates, and then performs one actor update
    and one critic update per mini-batch.  The memory is cleared once all
    epochs have finished.
    """
    for epoch in range(self.n_epochs):
        state_arr, action_arr, old_prob_arr, vals_arr, \
            reward_arr, dones_arr, batches = \
            self.memory.generate_batches()

        values = vals_arr
        advantage = np.zeros(len(reward_arr), dtype=np.float32)

        # advantage[t] is the sum, over k = t .. len-2, of the discounted
        # one-step residuals r_k + gamma * V_{k+1} * (1 - done_k) - V_k,
        # where the discount grows by gamma * gae_lambda per step.
        # The last entry is never written and stays 0.
        for t in range(len(reward_arr) - 1):
            discount = 1
            a_t = 0
            for k in range(t, len(reward_arr) - 1):
                a_t += discount * (
                    reward_arr[k]
                    + self.gamma * values[k + 1] * (1 - int(dones_arr[k]))
                    - values[k]
                )
                discount *= self.gamma * self.gae_lambda
            advantage[t] = a_t

        for batch in batches:
            # This per-batch body is the part intended to be moved into a
            # tf.function.
            # persistent=True because the tape is queried twice below
            # (once for the actor loss, once for the critic loss).
            with tf.GradientTape(persistent=True) as tape:
                states = tf.convert_to_tensor(state_arr[batch])
                old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                actions = tf.convert_to_tensor(action_arr[batch])

                probs = self.actor(states)
                dist = tfp.distributions.Categorical(probs)
                new_probs = dist.log_prob(actions)

                critic_value = self.critic(states)
                critic_value = tf.squeeze(critic_value, 1)

                # Ratio of new to old action probability, from log-probs.
                prob_ratio = tf.math.exp(new_probs - old_probs)
                weighted_probs = advantage[batch] * prob_ratio
                clipped_probs = tf.clip_by_value(prob_ratio,
                                                 1 - self.policy_clip,
                                                 1 + self.policy_clip)
                weighted_clipped_probs = clipped_probs * advantage[batch]
                # Clipped surrogate objective, negated to form a loss.
                actor_loss = -tf.math.minimum(weighted_probs,
                                              weighted_clipped_probs)
                actor_loss = tf.math.reduce_mean(actor_loss)

                # Critic regression target: advantage plus stored value.
                returns = advantage[batch] + values[batch]
                critic_loss = keras.losses.MSE(critic_value, returns)

            actor_params = self.actor.trainable_variables
            actor_grads = tape.gradient(actor_loss, actor_params)
            critic_params = self.critic.trainable_variables
            critic_grads = tape.gradient(critic_loss, critic_params)
            self.actor.optimizer.apply_gradients(
                zip(actor_grads, actor_params))
            self.critic.optimizer.apply_gradients(
                zip(critic_grads, critic_params))

    self.memory.clear_memory()
這是拆成 2 個函式後的程式碼,錯誤就發生在這裡:
def learn(self):
    """Run ``self.n_epochs`` PPO update passes over the stored rollout.

    Each epoch re-reads the rollout from ``self.memory.generate_batches()``,
    recomputes the advantage estimates, and hands every mini-batch of
    indices to ``self.do_batch`` together with the full arrays.  The memory
    is cleared once all epochs have finished.
    """
    for epoch in range(self.n_epochs):
        state_arr, action_arr, old_prob_arr, vals_arr, \
            reward_arr, dones_arr, batches = \
            self.memory.generate_batches()

        values = vals_arr
        advantage = np.zeros(len(reward_arr), dtype=np.float32)

        # advantage[t] is the sum, over k = t .. len-2, of the discounted
        # one-step residuals r_k + gamma * V_{k+1} * (1 - done_k) - V_k,
        # where the discount grows by gamma * gae_lambda per step.
        # The last entry is never written and stays 0.
        for t in range(len(reward_arr) - 1):
            discount = 1
            a_t = 0
            for k in range(t, len(reward_arr) - 1):
                a_t += discount * (
                    reward_arr[k]
                    + self.gamma * values[k + 1] * (1 - int(dones_arr[k]))
                    - values[k]
                )
                discount *= self.gamma * self.gae_lambda
            advantage[t] = a_t

        for batch in batches:
            self.do_batch(state_arr, old_prob_arr, action_arr,
                          batch, advantage, values)

    self.memory.clear_memory()
@tf.function
def do_batch(self, state_arr, old_prob_arr, action_arr, batch, advantage, values):
    """Apply one actor update and one critic update for a single mini-batch.

    Args:
        state_arr: Stored states for the whole rollout.
        old_prob_arr: Log-probabilities recorded when the actions were taken.
        action_arr: Stored actions for the whole rollout.
        batch: Indices selecting this mini-batch from the arrays above.
        advantage: Advantage estimate for every stored step.
        values: Stored critic values for every stored step.
    """
    # NOTE: this is the version that raises the reported error.  Inside a
    # tf.function the array arguments, including ``batch``, arrive as
    # tensors, so the ``x[batch]`` indexing below is not supported; each
    # such lookup needs ``tf.gather(x, batch, axis=0)`` instead.
    # persistent=True because the tape is queried twice below
    # (once for the actor loss, once for the critic loss).
    with tf.GradientTape(persistent=True) as tape:
        states = tf.convert_to_tensor(state_arr[batch])
        old_probs = tf.convert_to_tensor(old_prob_arr[batch])
        actions = tf.convert_to_tensor(action_arr[batch])

        probs = self.actor(states)
        dist = tfp.distributions.Categorical(probs)
        new_probs = dist.log_prob(actions)

        critic_value = self.critic(states)
        critic_value = tf.squeeze(critic_value, 1)

        # Ratio of new to old action probability, from log-probs.
        prob_ratio = tf.math.exp(new_probs - old_probs)
        weighted_probs = advantage[batch] * prob_ratio
        clipped_probs = tf.clip_by_value(prob_ratio,
                                         1 - self.policy_clip,
                                         1 + self.policy_clip)
        weighted_clipped_probs = clipped_probs * advantage[batch]
        # Clipped surrogate objective, negated to form a loss.
        actor_loss = -tf.math.minimum(weighted_probs,
                                      weighted_clipped_probs)
        actor_loss = tf.math.reduce_mean(actor_loss)

        # Critic regression target: advantage plus stored value.
        returns = advantage[batch] + values[batch]
        critic_loss = keras.losses.MSE(critic_value, returns)

    actor_params = self.actor.trainable_variables
    actor_grads = tape.gradient(actor_loss, actor_params)
    critic_params = self.critic.trainable_variables
    critic_grads = tape.gradient(critic_loss, critic_params)
    self.actor.optimizer.apply_gradients(
        zip(actor_grads, actor_params))
    self.critic.optimizer.apply_gradients(
        zip(critic_grads, critic_params))

問題似乎在於 batch 變成了張量,但在我的程式碼中它應該是一個串列(list):
def generate_batches(self):
    """Return the stored rollout as arrays plus shuffled mini-batch indices.

    Returns:
        A 7-tuple ``(states, actions, probs, vals, rewards, dones, batches)``.
        The first six items are ``np.array`` copies of the corresponding
        stored lists.  ``batches`` is a list of int64 index arrays, each of
        length ``self.batch_size`` except possibly the last, which together
        cover every stored step exactly once in random order.
    """
    n_states = len(self.states)
    batch_start = np.arange(0, n_states, self.batch_size)
    indices = np.arange(n_states, dtype=np.int64)
    np.random.shuffle(indices)
    # Consecutive slices of the shuffled indices; the final slice may be
    # shorter when n_states is not a multiple of batch_size.
    batches = [indices[i:i + self.batch_size] for i in batch_start]

    return (
        np.array(self.states),
        np.array(self.actions),
        np.array(self.probs),
        np.array(self.vals),
        np.array(self.rewards),
        np.array(self.dones),
        batches,
    )
uj5u.com熱心網友回復:
以 tf.function 裝飾的函式會在內部將串列轉換為張量。若要使用另一個張量 b 沿某個特定軸對張量 a 進行索引,可以使用 tf.gather(a, b),而不是熟悉的索引語法 a[b]。
具體來說,請在您的 do_batch 函式中嘗試以下修改:
state_arr[batch] -> tf.gather(state_arr, batch, axis=0)
old_prob_arr[batch] -> tf.gather(old_prob_arr, batch, axis=0)
action_arr[batch] -> tf.gather(action_arr, batch, axis=0)
advantage[batch] -> tf.gather(advantage, batch, axis=0)
values[batch] -> tf.gather(values, batch, axis=0)
轉載請註明出處,本文鏈接:https://www.uj5u.com/caozuo/520801.html
上一篇:恢復訓練PyTorch
