TypeError：只有整數、切片(`:`)、省略號(`...`)、tf.newaxis(`None`)和標量tf.int32/tf.int64張量是有效的索引-有解無憂

在我開始之前，我知道有很多問題都有相同的錯誤，但沒有一個為我解決了這個問題。

我有一個 PPO 實作，用於在 Gym（gym==0.26.0，tensorflow==2.10.0）的 CarRacing-v2 環境中進行訓練。我想讓它更快，所以把一部分代碼移到一個單獨的函式中並用 tf.function 包裝它，但是僅僅把它移到另一個函式中就會產生錯誤。除了把部分代碼移到另一個函式之外，代碼沒有任何其他修改。

這是我之前可以正常運作的代碼。

def learn(self):
        """Run the PPO update over the stored rollout, then clear the memory.

        For each of ``self.n_epochs`` epochs this fetches the rollout arrays
        and the mini-batch index arrays from ``self.memory``, computes an
        advantage estimate for every step (discounted by
        ``self.gamma * self.gae_lambda``), and applies one actor and one
        critic gradient step per mini-batch.
        """
        for epoch in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            # Both loops stop at len - 1 because each term reads values[k+1];
            # the advantage of the final step therefore stays 0.
            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    # (1 - done) removes the next-state value when step k
                    # is flagged as done.
                    a_t += discount*(reward_arr[k] + self.gamma*values[k+1] * (
                        1-int(dones_arr[k])) - values[k])
                    discount *= self.gamma*self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                # TODO: move this per-batch update into a tf.function.
                # persistent=True: the tape is queried twice below (actor
                # and critic gradients).
                with tf.GradientTape(persistent=True) as tape:
                    states = tf.convert_to_tensor(state_arr[batch])
                    old_probs = tf.convert_to_tensor(old_prob_arr[batch])
                    actions = tf.convert_to_tensor(action_arr[batch])

                    probs = self.actor(states)
                    dist = tfp.distributions.Categorical(probs)
                    new_probs = dist.log_prob(actions)

                    critic_value = self.critic(states)

                    critic_value = tf.squeeze(critic_value, 1)

                    # old_probs / new_probs are log-probabilities, so the
                    # ratio is exp of their difference.
                    prob_ratio = tf.math.exp(new_probs - old_probs)
                    weighted_probs = advantage[batch] * prob_ratio
                    clipped_probs = tf.clip_by_value(prob_ratio,
                                                     1-self.policy_clip,
                                                     1+self.policy_clip)
                    weighted_clipped_probs = clipped_probs * advantage[batch]
                    actor_loss = -tf.math.minimum(weighted_probs,
                                                  weighted_clipped_probs)
                    actor_loss = tf.math.reduce_mean(actor_loss)

                    returns = advantage[batch] + values[batch]
                    # Same quantity as
                    # reduce_mean(pow(returns - critic_value, 2)).
                    critic_loss = keras.losses.MSE(critic_value, returns)

                actor_params = self.actor.trainable_variables
                actor_grads = tape.gradient(actor_loss, actor_params)
                critic_params = self.critic.trainable_variables
                critic_grads = tape.gradient(critic_loss, critic_params)
                self.actor.optimizer.apply_gradients(
                        zip(actor_grads, actor_params))
                self.critic.optimizer.apply_gradients(
                        zip(critic_grads, critic_params))

        self.memory.clear_memory()

這是分成2個函式的代碼，這是發生錯誤的地方

def learn(self):
        """Run the PPO update, delegating each mini-batch to ``do_batch``.

        For each of ``self.n_epochs`` epochs this fetches the rollout arrays
        and the mini-batch index arrays from ``self.memory``, computes an
        advantage estimate for every step (discounted by
        ``self.gamma * self.gae_lambda``), and calls ``self.do_batch`` once
        per mini-batch. The memory is cleared after the last epoch.
        """
        for epoch in range(self.n_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
                reward_arr, dones_arr, batches = \
                self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            # Both loops stop at len - 1 because each term reads values[k+1];
            # the advantage of the final step therefore stays 0.
            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    # (1 - done) removes the next-state value when step k
                    # is flagged as done.
                    a_t += discount*(reward_arr[k] + self.gamma*values[k+1] * (
                        1-int(dones_arr[k])) - values[k])
                    discount *= self.gamma*self.gae_lambda
                advantage[t] = a_t

            for batch in batches:
                self.do_batch(state_arr, old_prob_arr, action_arr, batch,
                              advantage, values)

        self.memory.clear_memory()

    @tf.function
    def do_batch(self, state_arr, old_prob_arr, action_arr, batch, advantage, values):
        """Apply one actor and one critic gradient step for one mini-batch.

        ``batch`` holds the indices selecting this mini-batch's rows from
        ``state_arr``, ``old_prob_arr``, ``action_arr``, ``advantage`` and
        ``values``.

        NOTE: this is the version that raises the TypeError discussed in
        this post. Inside a ``tf.function`` the arguments are tensors, and
        ``x[batch]`` with a tensor ``batch`` is not a valid index; each
        ``x[batch]`` below would need to become
        ``tf.gather(x, batch, axis=0)``.
        """
        # persistent=True: the tape is queried twice below (actor and
        # critic gradients).
        with tf.GradientTape(persistent=True) as tape:
            states = tf.convert_to_tensor(state_arr[batch])
            old_probs = tf.convert_to_tensor(old_prob_arr[batch])
            actions = tf.convert_to_tensor(action_arr[batch])

            probs = self.actor(states)
            dist = tfp.distributions.Categorical(probs)
            new_probs = dist.log_prob(actions)

            critic_value = self.critic(states)

            critic_value = tf.squeeze(critic_value, 1)

            # old_probs / new_probs are log-probabilities, so the ratio is
            # exp of their difference.
            prob_ratio = tf.math.exp(new_probs - old_probs)
            weighted_probs = advantage[batch] * prob_ratio
            clipped_probs = tf.clip_by_value(prob_ratio,
                                             1-self.policy_clip,
                                             1+self.policy_clip)
            weighted_clipped_probs = clipped_probs * advantage[batch]
            actor_loss = -tf.math.minimum(weighted_probs,
                                          weighted_clipped_probs)
            actor_loss = tf.math.reduce_mean(actor_loss)

            returns = advantage[batch] + values[batch]
            # Same quantity as reduce_mean(pow(returns - critic_value, 2)).
            critic_loss = keras.losses.MSE(critic_value, returns)

        actor_params = self.actor.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)
        critic_params = self.critic.trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.actor.optimizer.apply_gradients(
                zip(actor_grads, actor_params))
        self.critic.optimizer.apply_gradients(
                zip(critic_grads, critic_params))

TypeError：只有整數、切片 (`:`)、省略號 (`...`)、tf.newaxis (`None`) 和標量 tf.int32/tf.int64 張量是有效的索引

問題似乎在於 `batch` 在這裡變成了張量，但在我的代碼中它應該是一個串列（索引陣列）。

def generate_batches(self):
        """Return the stored rollout as arrays plus shuffled mini-batch indices.

        Returns:
            A 7-tuple ``(states, actions, probs, vals, rewards, dones,
            batches)``. The first six are ``np.array`` copies of the
            corresponding ``self`` lists. ``batches`` is a list of int64
            index arrays that together cover ``0 .. len(self.states) - 1``
            exactly once, in shuffled order; each has ``self.batch_size``
            entries except possibly the last, which may be shorter.
        """
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        # Slicing past the end is safe: the last batch is simply shorter.
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return (
            np.array(self.states),
            np.array(self.actions),
            np.array(self.probs),
            np.array(self.vals),
            np.array(self.rewards),
            np.array(self.dones),
            batches,
        )

uj5u.com熱心網友回復：

以 tf.function 裝飾的函式會在內部將串列轉換為張量。若要使用另一個張量 b 在某個特定軸上索引張量 a，可以使用 tf.gather(a, b)，而不是熟悉的索引語法 a[b]。

具體來說，請在您的 do_batch 函式中嘗試以下修改：

state_arr[batch] -> tf.gather(state_arr, batch, axis=0)
old_prob_arr[batch] -> tf.gather(old_prob_arr, batch, axis=0)
action_arr[batch] -> tf.gather(action_arr, batch, axis=0)
advantage[batch] -> tf.gather(advantage, batch, axis=0)
values[batch] -> tf.gather(values, batch, axis=0)

轉載請註明出處，本文鏈接：https://www.uj5u.com/caozuo/520801.html

標籤：Python、TensorFlow、機器學習、深度學習、OpenAI Gym

上一篇：恢復訓練PyTorch

下一篇：不同的特征在SVM中給出相同的準確度