更快的HLSL代碼？想知道在3空間中渲染四邊形時CPU開銷是否會降低-有解無憂

上次我編碼時，我才剛剛開始學習 Direct3D9c。目前，我正在以大約 450fps 的速度使用 15 盞燈點亮大約 30K 個單紋理四邊形。我還沒有學習過實例化（instancing）或幾何著色，我正在嘗試根據我的需要優先考慮學習的順序，所以我只看了一眼。

我的第一個想法是減少分流到 GPU 的頂點資料量，因此我將頂點結構更改為 FLOAT2（用于紋理坐標）和 UINT（用于索引），依靠頂點著色器中的 4x float3 常量來定義四邊形的角落。

我想我可以進一步減小頂點資料的大小，并將每個頂點單元減少到一個包含 2 位索引（以參考四邊形的實際頂點）和 2 個 15 位定點數（是的，我是顯示我的年齡，但定點仍然有它的價值）表示偏移到 atlas 紋理中。

到目前為止，一切都很好，但我對 Direct3D11 和 HLSL 幾乎一無所知，所以我一直想知道是否有更快的方法。

這是我的頂點著色器的當前狀態：

// Transform constants read by the vertex shader below.
cbuffer CB_PROJ
{
    matrix model;          // applied to the quad corner and to the quad normal
    matrix modelViewProj;  // applied to the model-transformed position to produce SV_Position
};

// Vertex shader output. The POSITION, NORMAL and TEXCOORD fields are the
// inputs declared by the pixel shader's main().
struct VOut
{
    float3 position : POSITION;  // quad corner after the `model` transform
    float3 n : NORMAL;           // quad normal after the `model` rotation
    float2 texcoord : TEXCOORD;  // atlas coordinate decoded from the packed vertex word
    float4 pos : SV_Position;    // position after `modelViewProj`
};

// Corners of a unit quad lying in the XZ plane (y == 0), centred on the origin.
static const float3 position[4] = {
    -0.5f, 0.0f, -0.5f,
    -0.5f, 0.0f,  0.5f,
     0.5f, 0.0f, -0.5f,
     0.5f, 0.0f,  0.5f
};

// Packed vertex word, most significant bit first:
//   YYYYYYYYYYYYYYYXXXXXXXXXXXXXXXVV
//
//   bits 00-01 : corner index (0-3) into position[]
//   bits 02-16 : X offset into the atlas texture(s), 15-bit fixed point
//   bits 17-31 : Y offset into the atlas texture(s), 15-bit fixed point
//
// Vertex shader: unpacks the word, looks up the quad corner and transforms it.
VOut main(uint bitField : BLENDINDICES) {
    // One fixed-point step is 1/16384 of the texture.
    const float kStepToUV = 0.00006103515625f;

    const uint corner = bitField & 0x03u;
    const uint stepX  = (bitField >> 2) & 0x7FFFu;
    const uint stepY  = bitField >> 17;

    VOut output;

    output.texcoord = float2(float(stepX) * kStepToUV, float(stepY) * kStepToUV);
    output.position = (float3) mul(float4(position[corner], 1.0f), model);
    // The untransformed quad faces +Y; only the upper 3x3 of `model` is applied to the normal.
    output.n = mul(float3(0.0f, 1.0f, 0.0f), (float3x3) model);
    // NOTE(review): the input here has already been multiplied by `model` —
    // confirm that `modelViewProj` is meant to be applied on top of that.
    output.pos = mul(float4(output.position, 1.0f), modelViewProj);

    return output;
}

為了完整起見，這是我的像素著色器：

// Texture sampled by the pixel shader, bound to slot t0.
Texture2D Texture : register(t0);

// Sampler used for that texture lookup, bound to slot s0.
SamplerState Sampler : register(s0);

// One light as read by the pixel shader: two float4 values.
struct LIGHT {
    float4 lightPos; // .xyz == position the surface-to-light vector is measured to, .w == range
    float4 lightCol; // .rgb == colour, .a == flags (tested against 0xFFFFFFFF to end the light list)
};

// Light list consumed by the pixel shader.
// NOTE(review): the register binding sits on the member rather than on the
// cbuffer declaration — confirm this binds to b0 as intended.
cbuffer cbLight {
    LIGHT l[16] : register(b0); // 16 lights x 2 float4 (32 bytes each) = 512 bytes
}

// Constant ambient term applied to every texel that survives the alpha test.
static const float3 ambient = { 0.15f, 0.15f, 0.15f };

// Pixel shader: alpha-tests the sampled texel, then accumulates diffuse
// lighting from up to 16 lights with a range-based falloff.
//   position : value written to VOut.position by the vertex shader
//   n        : surface normal
//   TexCoord : atlas coordinate
// Returns ambient + accumulated light in rgb and the texel's alpha in a.
float4 main(float3 position : POSITION, float3 n : NORMAL, float2 TexCoord : TEXCOORD) : SV_Target
{
    const float4 Texel = Texture.Sample(Sampler, TexCoord);

    if (Texel.a < 0.707106f) discard; // My source images have their alpha values inverted.

    float3 result = { 0.0f, 0.0f, 0.0f };

    // Walk the light list until it is exhausted or the end-of-list flag is hit.
    // NOTE(review): lightCol.a is a float compared against an integer literal —
    // confirm how the CPU side encodes the terminator.
    for (uint xx = 0; xx < 16 && l[xx].lightCol.a != 0xFFFFFFFF; xx++)
    {
        const float3 lCol    = l[xx].lightCol.rgb;
        const float  range   = l[xx].lightPos.w;
        const float3 vToL    = l[xx].lightPos.xyz - position;
        const float  distToL = length(vToL);

        // Lights further away than twice their range contribute nothing.
        if (distToL < range * 2.0f)
        {
            // Falloff is the average of a linear and a range-squared term, capped at 1.
            const float  att = min(1.0f, (distToL / range + distToL / (range * range)) * 0.5f);
            const float3 lum = Texel.rgb * saturate(dot(vToL / distToL, n)) * lCol;
            result += lum * (1.0f - att);
        }
    }
    return float4(ambient * Texel.rgb + result, Texel.a);
}

以及用于生成頂點資料的看起來相當忙碌的 C++ 函式（刪除了所有不相關的函式）：

al16 struct CLASS_PRIMITIVES {
    ID3D11Buffer* pVB = { NULL, NULL }, * pIB = { NULL, NULL };
    const UINT strideV1 = sizeof(VERTEX1);

    // Builds the immutable vertex buffer holding one 4-vertex strip per atlas
    // cell (xSegs * ySegs cells). Does nothing if pVB is already set.
    //
    // Each vertex is one packed 32-bit word:
    //   bits  0-1  : corner index (0-3)
    //   bits  2-16 : X offset into the atlas, fixed point with 16384 == 1.0
    //   bits 17-31 : Y offset into the atlas, fixed point with 16384 == 1.0
    //
    // Rows are emitted from yy == ySegs down to 1, columns left to right.
    // Calls ThrowError() if the staging allocation or CreateBuffer fails.
    void CreateQuadSet1(ui32 xSegs, ui32 ySegs) {
        al16 VERTEX1* vBuf;
        al16 D3D11_BUFFER_DESC bd = {};
             D3D11_SUBRESOURCE_DATA srd = {};
             ui32 index = 0, totalVerts = xSegs * ySegs * 4;

        if (pVB) return;
        vBuf = (VERTEX1*)_aligned_malloc(strideV1 * totalVerts, 16);
        if (!vBuf) { ThrowError(); return; }

        // Fixed-point steps per atlas cell; invariant across the loops.
        const double xScale = 16384.0 / double(xSegs);
        const double yScale = 16384.0 / double(ySegs);

        for (ui32 yy = ySegs; yy; yy--) {
            // The two Y edges of this row, already shifted into bits 17-31.
            const ui32 yStep1 = ui32(yScale * double(yy)) << 17;
            const ui32 yStep2 = ui32(yScale * double(yy - 1)) << 17;

            for (ui32 xx = 0; xx < xSegs; xx++) {
                // The two X edges of this cell, already shifted into bits 2-16.
                const ui32 xStep1 = ui32(xScale * double(xx)) << 2;
                const ui32 xStep2 = ui32(xScale * double(xx + 1)) << 2;

                vBuf[index++].b = 0 + xStep1 + yStep1;
                vBuf[index++].b = 1 + xStep1 + yStep2;
                vBuf[index++].b = 2 + xStep2 + yStep1;
                vBuf[index++].b = 3 + xStep2 + yStep2;
            }
        }

        bd.Usage = D3D11_USAGE_IMMUTABLE;
        bd.BindFlags = D3D11_BIND_VERTEX_BUFFER;
        bd.CPUAccessFlags = 0;
        bd.ByteWidth = strideV1 * totalVerts;
        bd.StructureByteStride = strideV1;
        srd.pSysMem = vBuf;
        hr = dev->CreateBuffer(&bd, &srd, &pVB);
        // The staging array is no longer needed once CreateBuffer has returned;
        // free it before the error check so a failing path cannot leak it.
        _aligned_free(vBuf);
        if (hr != S_OK) ThrowError();
    };

    // Draws a single quad from the shared vertex buffer as a 4-vertex
    // triangle strip. `offset` is the quad's index; it is converted to a
    // byte offset of four VERTEX1 entries per quad.
    void DrawQuadFromSet1(ui32 offset) {
        ui32 byteOffset = offset * sizeof(VERTEX1) * 4;
        devcon->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
        devcon->IASetVertexBuffers(0, 1, &pVB, &strideV1, &byteOffset);
        devcon->Draw(4, 0);
    };

    void DestroyQuadSet() {
        if (pVB) pVB->Release();
    };

它一切正常，但似乎我正在訴諸黑客來實作我的目標。肯定有更快的方法嗎？使用 DrawIndexed() 始終將幀速率降低 1%，因此我切換回非索引 Draw 呼叫。

uj5u.com熱心網友回復：

在 GPU 允許的范圍內，將頂點資料減少到每個頂點 32 位

您似乎認為頂點緩沖區大小是阻礙您前進的原因。別搞錯了，他們不是。你有很多 VRAM 可以使用，如果它能讓你的代碼更快，就使用它們。具體來說，您在著色器中解包的任何內容可能會顯式存盤在您的頂點緩沖區中，這些內容可能應該存盤在您的頂點緩沖區中。

我想知道是否有人有使用幾何著色器自動生成四邊形的經驗

我會在此阻止您，即使在今天，幾何著色器在大多數驅動程式實作中都非常低效。它們只是沒有被使用那么多，所以沒有人費心去優化它們。

一件讓我大吃一驚的事情是，您每幀都在分配和釋放系統端的頂點陣列。構建它很好，但是請快取該陣列，C++ 記憶體分配幾乎和任何東西一樣慢。快速分析應該已經向您展示了這一點。

您的下一個最大問題是您的像素著色器中有很多分支。使用標準函式（如 clamp 或 mix）或混合，讓數學運算自行抵消，而不是檢查范圍或完全透明的值。分支絕對會扼殺性能。

最后，確保您的緩沖區有正確的提示和用法。您沒有顯示它們，但它們應該設定為等效的GL_STREAM_DRAW值，并且您需要確保不會破壞頂點緩沖區的飛行部分。只要您不通過覆寫其頂點緩沖區來使它們的資料無效，未來的幀將與當前幀同時渲染，因此請使用回圈方案來允許盡可能多的頂點存活（再次，使用記憶體性能）。就我個人而言，我分配了一個非常大的頂點緩沖區（一幀需要的資料的 5 倍）并按順序寫入它直到我到達末尾，此時我將整個事物孤立并重新分配并重新從頭開始。

uj5u.com熱心網友回復：

我認為您的代碼受 CPU 限制。雖然您的方法具有非常小的頂點，但您的 API 開銷并不小。

更好的方法是使用一次繪制呼叫渲染所有四邊形。我可能會為此使用實例化（instancing）。

假設您想要 3D 空間中的任意四邊形大小、位置和方向，這是一種可能的方法。未經測驗。

頂點緩沖區元素：

// Per-instance vertex buffer element: one entry describes one quad.
// Field order matches the four input layout elements (QuadCenter,
// SpriteIndex, QuadPlusX, QuadPlusY).
struct sInstanceData
{
    // Center of the quad in 3D space
    XMFLOAT3 center;
    // XY coordinates of the sprite in the atlas
    uint16_t spriteX, spriteY;
    // Local XY vectors of the quad in 3D space
    // length of the vectors = half width/height of the quad
    XMFLOAT3 plusX, plusY;
};

輸入布局：

// Input layout for sInstanceData. All four elements are per-instance data in
// slot 0, packed back to back. InstanceDataStepRate is 1 so the buffer
// advances by one element for every instance drawn; with a step rate of 0
// the per-instance data would not advance from quad to quad.
D3D11_INPUT_ELEMENT_DESC desc[ 4 ];
desc[ 0 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadCenter", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 1 ] = D3D11_INPUT_ELEMENT_DESC{ "SpriteIndex", 0, DXGI_FORMAT_R16G16_UINT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 2 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusX", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };
desc[ 3 ] = D3D11_INPUT_ELEMENT_DESC{ "QuadPlusY", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_INSTANCE_DATA, 1 };

頂點著色器：

// Constants for the instanced quad vertex shader.
cbuffer Constants
{
    // Applied to each corner position to produce SV_Position
    matrix viewProj;
    // Pass [ 1.0 / xSegs, 1.0 / ySegs ] in that field
    float2 texcoordMul;
};

// Output of the instanced quad vertex shader.
struct VOut
{
    float3 position : POSITION;  // corner position before the viewProj transform
    float3 n : NORMAL;           // normalized cross product of the quad's plusX and plusY vectors
    float2 texcoord : TEXCOORD;  // sprite cell corner scaled by texcoordMul
    float4 pos : SV_Position;    // corner position after viewProj
};

// Instanced quad vertex shader. The vertex ID (0-3) selects one of the four
// corners: bit 0 picks the +X or -X side, bit 1 picks the +Y or -Y side.
//   index     : vertex ID within the instance
//   center    : quad center
//   texcoords : sprite cell coordinates in the atlas
//   plusX/Y   : half-extent vectors of the quad along its local X and Y
VOut main( uint index: SV_VertexID,
    float3 center : QuadCenter, uint2 texcoords : SpriteIndex,
    float3 plusX : QuadPlusX, float3 plusY : QuadPlusY )
{
    VOut result;
    float3 pos = center;
    int2 uv = ( int2 )texcoords;

    // No branches are generated in release builds;
    // only conditional moves are there
    if( index & 1 )
    {
        // Far X edge: step out along +X and on to the next atlas column.
        pos += plusX;
        uv.x++;
    }
    else
        pos -= plusX;

    if( index & 2 )
    {
        // Far Y edge: step out along +Y and on to the next atlas row.
        pos += plusY;
        uv.y++;
    }
    else
        pos -= plusY;

    result.position = pos;
    // The quad's normal is perpendicular to both of its local axes.
    result.n = normalize( cross( plusX, plusY ) );
    result.texcoord = ( ( float2 )uv ) * texcoordMul;
    result.pos = mul( float4( pos, 1.0f ), viewProj );
    return result;
}

渲染：

// Draw every quad with one instanced call: each instance is a 4-vertex
// triangle strip, and countQuads instances are drawn.
UINT instanceStride = sizeof( sInstanceData );
UINT startOffset = 0;
context->IASetPrimitiveTopology( D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP );
context->IASetVertexBuffers( 0, 1, &vb, &instanceStride, &startOffset );
context->DrawInstanced( 4, countQuads, 0, 0 );

轉載請註明出處，本文鏈接：https://www.uj5u.com/houduan/520280.html

標籤：C精灵hlsl直接3d11图集精灵

上一篇：MisraC和原型C宏生成

下一篇：為什么這個scanf的第三個引數不是指標？[復制]