Unity中通過CPU和GPU著色器移動物體的速度差異-有解無憂

我一直在測試分別通過普通 C# 代碼和 HLSL 計算著色器在 Unity 中移動大量物件。但是，兩者的速度沒有區別，FPS 保持不變。兩者使用不同的柏林噪聲實作來改變位置：C# 代碼使用標準的 Mathf.PerlinNoise，而 HLSL 使用自定義噪聲函式。

場景 1 - 僅通過 C# 代碼更新

物件生成：

// Prefab that is cloned once for every cell of the grid.
[SerializeField]
private GameObject prefab;

/// <summary>
/// Spawns a 50 x 50 grid of prefab instances, one unit apart on the X/Z plane,
/// each with a random initial height in the range [-1, 1].
/// </summary>
private void Start()
{
    const int gridSize = 50;

    for (int i = 0; i < gridSize; i++)
    {
        for (int j = 0; j < gridSize; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
        }
    }
}

通過 C# 移動物件的代碼。這個腳本被添加到每個創建的物件中：

// Position that is recomputed and written back to the transform every frame.
private Vector3 position = new Vector3();

/// <summary>
/// Captures the spawned X/Z coordinates and seeds the height from Perlin noise.
/// </summary>
private void Start()
{
    position = new Vector3(transform.position.x, Mathf.PerlinNoise(Time.time, Time.time), transform.position.z);
}

/// <summary>
/// Recomputes the height from Perlin noise sampled at the object's X/Z
/// coordinates (scaled by 1/20) offset by the elapsed time, then applies it.
/// </summary>
private void Update()
{
    // The noise sample is scaled by 5 to produce the final height.
    position.y = Mathf.PerlinNoise(transform.position.x / 20f + Time.time, transform.position.z / 20f + Time.time) * 5f;
    transform.position = position;
}

場景 2 - 通過計算內核 (GPGPU)

第 1 部分：C# 客戶端代碼

物件生成，在著色器上運行計算并將結果值分配給物件：

/// <summary>
/// CPU-side mirror of the compute shader's Particle struct.
/// The field order and sizes must match the HLSL declaration
/// (float3 position followed by float4 color, 7 floats in total), because the
/// compute buffer is created with a stride of sizeof(float) * 7.
/// </summary>
public struct Particle
{
    public Vector3 position;
    // Mirrors the shader's float4 color so that every element occupies 28 bytes.
    public Color color;
}

// Prefab that CreateParticles instantiates once per grid cell.
[SerializeField]
private GameObject prefab;
// Compute shader whose "CSMain" kernel recomputes the particle heights.
[SerializeField]
private ComputeShader computeShader;

// Spawned objects, stored in the same order as particlesDataArray.
private List<GameObject> particlesList = new List<GameObject>();
// CPU-side copy of the particle data that is uploaded to / read back from the compute buffer.
private Particle[] particlesDataArray;

/// <summary>Builds the particle grid and the compute buffer.</summary>
private void Start() => CreateParticles();

/// <summary>Runs the compute kernel and applies its result once per frame.</summary>
private void Update() => UpdateParticlePosition();

/// <summary>
/// Spawns a 50 x 50 grid of prefab instances, records each start position, and
/// uploads those positions to a compute buffer bound as "particles" on kernel 0.
/// </summary>
private void CreateParticles()
{
    const int gridSize = 50;
    List<Particle> particlesDataList = new List<Particle>(gridSize * gridSize);

    for (int i = 0; i < gridSize; i++)
    {
        for (int j = 0; j < gridSize; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
            particlesList.Add(createdParticle);

            Particle particle = new Particle();
            particle.position = createdParticle.transform.position;
            particlesDataList.Add(particle);
        }
    }

    particlesDataArray = particlesDataList.ToArray();
    particlesDataList.Clear();

    // Stride of 7 floats corresponds to the shader-side Particle layout
    // (float3 position + float4 color); the C# Particle struct must have the same size.
    computeBuffer = new ComputeBuffer(particlesDataArray.Length, sizeof(float) * 7);
    computeBuffer.SetData(particlesDataArray);
    computeShader.SetBuffer(0, "particles", computeBuffer);
}

// GPU buffer holding one Particle per spawned object.
private ComputeBuffer computeBuffer;

/// <summary>
/// Dispatches the "CSMain" kernel with the current time, reads the results back
/// to the CPU, and copies each computed height onto the matching GameObject.
/// </summary>
private void UpdateParticlePosition()
{
    computeShader.SetFloat("time", Time.time);

    // The kernel declares numthreads(10, 1, 1), hence Length / 10 thread groups.
    // Integer division: a particle count that is not a multiple of 10 would leave
    // the trailing particles unprocessed.
    computeShader.Dispatch(computeShader.FindKernel("CSMain"), particlesDataArray.Length / 10, 1, 1);

    // NOTE(review): GetData is a synchronous GPU read-back; it is the likely
    // reason this path is no faster than the per-object C# version.
    computeBuffer.GetData(particlesDataArray);

    for (int i = 0; i < particlesDataArray.Length; i++)
    {
        Vector3 pos = particlesList[i].transform.position;
        pos.y = particlesDataArray[i].position.y;
        particlesList[i].transform.position = pos;
    }
}

/// <summary>
/// Releases the GPU buffer; compute buffers are not freed automatically.
/// </summary>
private void OnDestroy()
{
    computeBuffer?.Release();
    computeBuffer = null;
}

第 2 部分：計算內核 (GPGPU)

// Declares CSMain as the compute kernel entry point.
#pragma kernel CSMain

// Per-particle record stored in the structured buffer: 7 floats per element.
// CSMain only rewrites position; color is copied through unchanged.
struct Particle {
    float3 position;
    float4 color;
};

// Read/write particle buffer, bound from C# under the name "particles".
RWStructuredBuffer<Particle> particles;
// Time value set from C# each frame under the name "time".
float time;

// Floored modulo: x minus the largest multiple of y (by floor) not exceeding x.
float mod(float x, float y)
{
    float wholeParts = floor(x / y);
    return x - y * wholeParts;
}

// Permutation polynomial ((34x + 1) * x) mod 289.
// NOTE(review): none of the permute / taylorInvSqrt helpers are called by
// _snoise or CSMain in this file.
float permute(float x) { return floor(mod(((x * 34.0) + 1.0) * x, 289.0)); }

// Vector overloads compute the floored modulo inline, per component, because
// mod() above is declared for scalars only and would truncate a vector argument.
float3 permute(float3 x)
{
    float3 v = ((x * 34.0) + 1.0) * x;
    return v - 289.0 * floor(v / 289.0);
}

float4 permute(float4 x)
{
    float4 v = ((x * 34.0) + 1.0) * x;
    return v - 289.0 * floor(v / 289.0);
}

// Linear expression in r; the name suggests a Taylor approximation of an
// inverse square root.
float taylorInvSqrt(float r) { return 1.79284291400159 - 0.85373472095314 * r; }

// Component-wise application of the scalar taylorInvSqrt.
float4 taylorInvSqrt(float4 r) { return float4(taylorInvSqrt(r.x), taylorInvSqrt(r.y), taylorInvSqrt(r.z), taylorInvSqrt(r.w)); }

// Hashes a 3D coordinate into a float3 whose components lie in [-0.5, 0.5).
// The hash is sin() of a dot product with fixed constants, scaled by 4096;
// each component takes the fractional part of 512 * j, with j divided by 8
// between components. Statement order matters because j is mutated in place.
float3 rand3(float3 c) {
    float j = 4096.0 * sin(dot(c, float3(17.0, 59.4, 15.0)));
    float3 r;
    r.z = frac(512.0 * j);
    j *= .125;
    r.x = frac(512.0 * j);
    j *= .125;
    r.y = frac(512.0 * j);
    // frac() yields [0, 1); shift to centre the range on zero.
    return r - 0.5;
}

// 3D gradient noise evaluated from four lattice corners around p.
// NOTE(review): structure and constants (F3 = 1/3, G3 = 1/6) look like a
// simplex-noise implementation -- confirm against the original source.
float _snoise(float3 p) {
    const float F3 = 0.3333333;
    const float G3 = 0.1666667;

    // Skew the input to find the containing lattice cell, then unskew to get
    // the offset x from the cell origin.
    float3 s = floor(p + dot(p, float3(F3, F3, F3)));
    float3 x = p - s + dot(s, float3(G3, G3, G3));

    // Component ordering of x selects the second and third corners.
    float3 e = step(float3(0.0, 0.0, 0.0), x - x.yzx);
    float3 i1 = e * (1.0 - e.zxy);
    float3 i2 = 1.0 - e.zxy * (1.0 - e);

    // Offsets from p to the remaining three corners.
    float3 x1 = x - i1 + G3;
    float3 x2 = x - i2 + 2.0 * G3;
    float3 x3 = x - 1.0 + 3.0 * G3;

    float4 w, d;

    // Squared distance to each corner.
    w.x = dot(x, x);
    w.y = dot(x1, x1);
    w.z = dot(x2, x2);
    w.w = dot(x3, x3);

    // Falloff weight: zero once the squared distance reaches 0.6.
    w = max(0.6 - w, 0.0);

    // Projection of each corner's hashed vector onto the offset to that corner.
    d.x = dot(rand3(s), x);
    d.y = dot(rand3(s + i1), x1);
    d.z = dot(rand3(s + i2), x2);
    d.w = dot(rand3(s + 1.0), x3);

    // Raise the weights to the fourth power before applying them.
    w *= w;
    w *= w;
    d *= w;

    // Sum the four contributions with a fixed scale factor of 52.
    return dot(d, float4(52.0, 52.0, 52.0, 52.0));
}

// Kernel: one thread per particle, 10 threads per group. Recomputes the
// particle's height from noise sampled at its X/Z coordinates (scaled by 1/20)
// offset by time / 5, and writes the particle back to the buffer.
// No bounds check is performed, so the dispatch must not cover more threads
// than the buffer has elements.
[numthreads(10, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    Particle particle = particles[id.x];
    float modifyTime = time / 5.0;
    float positionY = _snoise(float3(particle.position.x / 20.0 + modifyTime, 0.0, particle.position.z / 20.0 + modifyTime)) * 5.0;

    particle.position = float3(particle.position.x, positionY, particle.position.z);
    particles[id.x] = particle;
}

我做錯了什么，為什么計算速度沒有提高？:)

提前致謝！

uj5u.com熱心網友回復：

TL;DR：您的 GPGPU（計算著色器）場景未經優化，因此會扭曲您的測試結果。考慮將材質綁定到 computeBuffer，並通過 Graphics.DrawProcedural 進行渲染。這樣，一切都保留在 GPU 上。

操作：

我做錯了什么，為什么計算速度沒有提高？

本質上，您的問題有兩個部分。

(1) 從 GPU 讀取速度慢

對于大多數與 GPU 相關的事情，您通常希望避免從 GPU 讀取，因為它會阻塞 CPU。對于 GPGPU 場景也是如此。

如果讓我冒昧猜測，問題出在 GPGPU（計算著色器）場景中的 computeBuffer.GetData() 呼叫，如下所示：

private void Update()
{
    UpdateParticlePosition();
}

private void UpdateParticlePosition()
{
.
.
.
    computeBuffer.GetData(particlesDataArray); // <----- OUCH!

Unity 文件（強調為我所加）：

ComputeBuffer.GetData

將緩沖區中的資料值讀取到陣列中...
請注意，此函式從 GPU 讀回資料，這可能很慢...如果已提交任何寫入此緩沖區的 GPU 作業，Unity 會等待這些作業完成之後，才取回所請求的資料。

我想每個人都同意微軟的 GPGPU 文件非常稀少，所以最好的辦法是查看散布在互聯網上的示例。我想到的一個是 Three Eyed Games 上出色的“Unity 中的 GPU 光線追蹤”系列。請參閱下面的鏈接。

也可以看看：

MickyD，“通過 Unity 3D 在 GPGPU 上使用計算著色器進行 n 體銀河模擬”，2014

Kuri, D，“Unity 中的 GPU 光線追蹤 - 第 1 部分”，2018 年

uj5u.com熱心網友回復：

ComputeBuffer.GetData 非常耗時：CPU 需要從 GPU 復制資料，這會阻塞主執行緒。然后你再回圈遍歷所有變換來改變它們的位置，這肯定比數千個 MonoBehaviour 更快，但同樣耗時。有兩種方法可以優化您的代碼。

中央處理器

C# Job System + Burst 詳細教程：https://github.com/stella3d/job-system-cookbook

圖形處理器

使用在計算著色器中計算的結構化緩沖區，而不將其復制回 CPU。這是有關如何執行此操作的詳細教程：https://catlikecoding.com/unity/tutorials/basics/compute-shaders/

轉載請註明出處，本文鏈接：https://www.uj5u.com/yidong/485841.html
標籤：C＃ unity3d 着色器 hlsl 计算着色器

上一篇：如何讓相機跟隨克隆播放器而不是第一次生產？
下一篇：ScriptableObject資產隨其實體而變化