NEON 匯編代碼在 Cortex-A72 上比在 Cortex-A53 上需要更多的周期-有解無憂

我正在 AArch32 模式下的兩個 ARMv8 處理器上對 ARMv7 NEON 代碼進行基準測驗：Cortex-A53 和 Cortex-A72。我正在使用帶有 32 位 Raspbian Buster 的 Raspberry Pi 3B 和 Raspberry Pi 4B 板。

我的基準測驗方法如下：

uint32_t x[4];
uint32_t t0 = ccnt_read();
for(int i = 0; i < 1000; i  )
    armv7_neon(x);
uint32_t t1 = ccnt_read();
printf("%u\n",(t1-t0)/1000);

其中 armv7_neon 函式由以下指令定義：

@ armv7_neon: transform four 32-bit words in place.
@ In:  r0 = address of the 16 bytes to process (loaded into and stored from q0).
@ Working registers: q0-q3 and q8-q15.
@ Each step has the form  q0 ^= (q0 shifted) & (q0 shifted) & mask.
@ The vmov mask loads are interleaved between the dependent ALU instructions.
.global armv7_neon
.func armv7_neon, armv7_neon
.type armv7_neon, %function
armv7_neon:
    vld1.32 {q0}, [r0]              @ q0 = four 32-bit words at [r0]
    vmvn.i32 q0, q0                 @ q0 = ~q0
    vmov.i32 q8, #0x11111111        @ q8  = mask 0x11111111
    vshr.u32 q1, q0, #2
    vshr.u32 q2, q0, #3
    vmov.i32 q9, #0x20202020        @ q9  = mask 0x20202020
    vand q1, q1, q2
    vmov.i32 q10, #0x40404040       @ q10 = mask 0x40404040
    vand q1, q1, q8
    vmov.i32 q11, #0x80808080       @ q11 = mask 0x80808080
    veor q0, q0, q1                 @ q0 ^= (q0 >> 2) & (q0 >> 3) & 0x11111111
    vmov.i32 q12, #0x02020202       @ q12 = mask 0x02020202
    vshl.u32 q1, q0, #5
    vshl.u32 q2, q0, #1
    vmov.i32 q13, #0x04040404       @ q13 = mask 0x04040404
    vand q1, q1, q2
    vmov.i32 q14, #0x08080808       @ q14 = mask 0x08080808
    vand q3, q1, q9                 @ q3 = (q0 << 5) & (q0 << 1) & 0x20202020
    vshl.u32 q1, q0, #5             @ still reads q0 from before the veor below
    vshl.u32 q2, q0, #4
    veor q0, q0, q3
    vand q1, q1, q2
    vmov.i32 q15, #0x32323232       @ q15 = mask 0x32323232
    vand q1, q1, q10                @ q1 = (old q0 << 5) & (old q0 << 4) & 0x40404040
    vmov.i32 q8, #0x01010101        @ q8 reused: 0x11111111 is no longer needed
    veor q0, q0, q1
    vshl.u32 q1, q0, #2
    vshl.u32 q2, q0, #1
    vand q1, q1, q2
    vand q3, q1, q11                @ q3 = (q0 << 2) & (q0 << 1) & 0x80808080
    vshr.u32 q1, q0, #2             @ still reads q0 from before the veor below
    vshl.u32 q2, q0, #1
    veor q0, q0, q3
    vand q1, q1, q2
    vand q1, q1, q12                @ q1 = (old q0 >> 2) & (old q0 << 1) & 0x02020202
    veor q0, q0, q1
    vshr.u32 q1, q0, #5
    vshl.u32 q2, q0, #1
    vand q1, q1, q2
    vand q3, q1, q13                @ q3 = (q0 >> 5) & (q0 << 1) & 0x04040404
    vshr.u32 q1, q0, #1             @ still reads q0 from before the veor below
    vshr.u32 q2, q0, #2
    veor q0, q0, q3
    vand q1, q1, q2
    vand q1, q1, q14                @ q1 = (old q0 >> 1) & (old q0 >> 2) & 0x08080808
    veor q0, q0, q1
    vmvn.i32 q0, q0                 @ q0 = ~q0
    @ Final stage: isolate bit groups of q0 with the masks ...
    vand q1,  q0, q14               @ & 0x08080808
    vand q2,  q0, q15               @ & 0x32323232
    vand q3,  q0, q8                @ & 0x01010101
    vand q8,  q0, q11               @ & 0x80808080 (q8 overwritten after its last use as a mask)
    vand q9,  q0, q10               @ & 0x40404040 (q9 overwritten; 0x20202020 no longer needed)
    vand q10, q0, q13               @ & 0x04040404 (q10 overwritten after its last use as a mask)
    @ ... move each group to its new bit position ...
    vshl.u32 q1,  q1,  #1
    vshl.u32 q2,  q2,  #2
    vshl.u32 q3,  q3,  #5
    vshr.u32 q8,  q8,  #6
    vshr.u32 q9,  q9,  #4
    vshr.u32 q10, q10, #2
    @ ... and OR the six pieces together into q0.
    vorr q0, q1, q2
    vorr q1, q3, q8
    vorr q2, q9, q10
    vorr q3, q0, q1
    vorr q0, q3, q2
    vst1.32 {q0}, [r0]              @ write the result back over the input
    bx lr                           @ return to caller
.endfunc

代碼只是使用以下選項編譯：

gcc -O3 -mfpu=neon-fp-armv8 -mcpu=cortex-a53
gcc -O3 -mfpu=neon-fp-armv8 -mcpu=cortex-a72

我在 Cortex-A53 和 Cortex-A72 上分別獲得了 74 和 99 個周期。我看到這篇博文討論了 Cortex-A72 上 tbl 指令的一些性能問題，但我正在運行的代碼不包含任何 tbl 指令。

這個差距從何而來？

uj5u.com熱心網友回復：

我比較了 A72 和 A55 的指令周期時序（A53 的資料查不到）：

vshl和vshr：

A72：吞吐量（IPC）1，延遲 3，僅在 F1 流水線上執行
A55：吞吐量（IPC）2，延遲 2，在兩個流水線上執行（但有限制）

這基本上就能解釋這個差距，因為您的代碼中有很多這樣的移位指令（vshl 和 vshr）。

你的匯編代碼也有一些缺點：

vadd 與 vshl：你應該把所有立即數為 1 的 vshl 替換為 vadd。在 SIMD 上，桶形移位器的成本比算術運算更高。
您不應不必要地重複相同的指令（<< 5）。
第二個 vmvn 是不必要的。您可以把其後所有的 vand 替換為 vbic。
只要不涉及排列（permutation），編譯器就會生成可接受的機器代碼。因此，在這種情況下，我會用 NEON 內建函式（intrinsics）撰寫代碼。

#include <arm_neon.h>

/**
 * Intrinsics version of the hand-written NEON assembly routine of the same
 * name: transforms four 32-bit words in place, lane by lane.
 *
 * @param pData  Pointer to four uint32_t values; they are loaded, transformed
 *               and written back to the same location.
 *
 * Shifts, adds and bitwise operators are applied directly to uint32x4_t
 * values (GCC/Clang vector extensions). `in + in` is used in place of
 * `in << 1`.
 */
void armv7_neon(uint32_t * pData) {
    const uint32x4_t cx11 = vdupq_n_u32(0x11111111);
    const uint32x4_t cx20 = vdupq_n_u32(0x20202020);
    const uint32x4_t cx40 = vdupq_n_u32(0x40404040);
    const uint32x4_t cx80 = vdupq_n_u32(0x80808080);
    const uint32x4_t cx02 = vdupq_n_u32(0x02020202);
    const uint32x4_t cx04 = vdupq_n_u32(0x04040404);
    const uint32x4_t cx08 = vdupq_n_u32(0x08080808);
    const uint32x4_t cx32 = vdupq_n_u32(0x32323232);
    const uint32x4_t cx01 = vdupq_n_u32(0x01010101);

    uint32x4_t temp1, temp2, temp3, temp4, temp5, temp6;
    uint32x4_t in = vld1q_u32(pData);

    in = vmvnq_u32(in);

    temp1 = (in >> 2) & (in >> 3);
    temp1 &= cx11;
    in ^= temp1;

    /* Both terms of each pair below are computed from the same `in`
     * before either is XORed back. */
    temp1 = (in << 5) & (in + in);
    temp1 &= cx20;
    temp2 = (in << 5) & (in << 4);
    temp2 &= cx40;
    in ^= temp1;
    in ^= temp2;

    temp1 = (in << 2) & (in + in);
    temp1 &= cx80;
    /* Second operand is a LEFT shift by one, matching the
     * `vshl.u32 q2, q0, #1` at this step of the assembly version. */
    temp2 = (in >> 2) & (in + in);
    temp2 &= cx02;
    in ^= temp1;
    in ^= temp2;

    temp1 = (in >> 5) & (in + in);
    temp1 &= cx04;
    temp2 = (in >> 1) & (in >> 2);
    temp2 &= cx08;
    in ^= temp1;
    in ^= temp2;

    /* vbicq_u32(mask, in) == mask & ~in, which folds the final inversion
     * of `in` into the masking step. */
    temp1 = vbicq_u32(cx08, in);
    temp2 = vbicq_u32(cx32, in);
    temp3 = vbicq_u32(cx01, in);
    temp4 = vbicq_u32(cx80, in);
    temp5 = vbicq_u32(cx40, in);
    temp6 = vbicq_u32(cx04, in);

    /* Move each masked bit group to its output position. */
    temp1 += temp1; /* << 1 */
    temp2 <<= 2;
    temp3 <<= 5;
    temp4 >>= 6;
    temp5 >>= 4;
    temp6 >>= 2;

    temp1 |= temp2 | temp3 | temp4 | temp5 | temp6;

    vst1q_u32(pData, temp1);
}

Godbolt 鏈接

您可以看到該-mcpu選項在這里產生了明顯的不同。

但 GCC 一如既往地令人失望：即使我明確要求，它也拒絕使用 vbic（Clang 也是如此。我討厭他們兩個）

我會取編譯器輸出的反匯編結果，刪除第二個 vmvn，並把所有 vand 替換為 vbic，以獲得最佳性能。

請記住，用匯編撰寫并不會自動使代碼運行得更快，并且較新的架構不一定會帶來更有利的指令周期時序（ICT）：在 ICT 方面，A72 在很大程度上不如 A53。

PS: With -mcpu=cortex-a53 option the generated code is identical to a55's. We can assume A55 is just an extension to A53 by armv8.2 ISA.

轉載請註明出處，本文鏈接：https://www.uj5u.com/ruanti/339806.html

標籤：assembly raspberry-pi arm neon cortex-a

上一篇：保持輸入LMC的數字總數

下一篇：是否可以在瀏覽器的不同控制臺上分離輸出？