Home > Mobile >  The difference in the speed of moving objects through the CPU and GPU shader in Unity
The difference in the speed of moving objects through the CPU and GPU shader in Unity

Time:06-04

I have been testing moving a lot of objects in Unity through normal C# code and through HLSL shaders. However, there is no difference in speed. FPS remains the same. Different perlin noise is used to change the position. The C# code uses the standard Mathf.PerlinNoise, while the HLSL uses a custom noise function.

Scenario 1 - Update via C# code only

Object spawn:

[SerializeField]
private GameObject prefab;

// Number of objects spawned along each horizontal axis (GridSize * GridSize in total).
private const int GridSize = 50;

/// <summary>
/// Spawns a GridSize x GridSize grid of <c>prefab</c> instances, one unit apart on X and Z,
/// each starting at a random height in [-1, 1].
/// </summary>
private void Start()
{
    for (int i = 0; i < GridSize; i++)
    {
        for (int j = 0; j < GridSize; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
        }
    }
}

Code to move an object via C#. This script is added to each created object:

// Position written back to the transform every frame; only its Y component changes after Start.
private Vector3 position;

/// <summary>
/// Captures the object's initial X/Z and seeds Y from Perlin noise sampled at the current time.
/// </summary>
private void Start()
{
    position = new Vector3(transform.position.x, Mathf.PerlinNoise(Time.time, Time.time), transform.position.z);
}

/// <summary>
/// Recomputes the height from Perlin noise that scrolls with time and applies it to the transform.
/// </summary>
private void Update()
{
    // Read the transform once: every transform.position access crosses into native code.
    Vector3 current = transform.position;

    // The noise coordinate is the scaled world X/Z offset by time, so the surface appears to flow.
    position.y = Mathf.PerlinNoise(current.x / 20f + Time.time, current.z / 20f + Time.time) * 5f;
    transform.position = position;
}

Scenario 2 - via Compute Kernel (GPGPU)

Part 1: C# client code

Object spawn, running the calculation on the shader and assigning the resulting value to the objects:

/// <summary>
/// CPU-side mirror of the <c>Particle</c> struct declared in the compute shader.
/// The field order and sizes must match the shader's layout (float3 position, float4 color),
/// otherwise the data uploaded with SetData is misaligned against the buffer stride.
/// </summary>
public struct Particle
{
    public Vector3 position;
    public Vector4 color;
}

[SerializeField]
private GameObject prefab;
[SerializeField]
private ComputeShader computeShader;

// Number of objects spawned along each horizontal axis (GridSize * GridSize in total).
private const int GridSize = 50;

// Must equal the X value of [numthreads(...)] on the CSMain kernel.
private const int ThreadGroupSize = 10;

// Size in bytes of one Particle element: 3 floats of position + 4 floats of color.
private const int ParticleStride = sizeof(float) * 7;

private List<GameObject> particlesList = new List<GameObject>();
private Particle[] particlesDataArray;
private ComputeBuffer computeBuffer;

// Index of the CSMain kernel, looked up once instead of on every frame.
private int kernelIndex;

private void Start()
{
    CreateParticles();
}

private void Update()
{
    UpdateParticlePosition();
}

/// <summary>
/// Releases the GPU buffer; ComputeBuffers are not garbage collected automatically.
/// </summary>
private void OnDestroy()
{
    if (computeBuffer != null)
    {
        computeBuffer.Release();
        computeBuffer = null;
    }
}

/// <summary>
/// Spawns the grid of prefab instances, mirrors their positions into <c>particlesDataArray</c>,
/// and uploads that array to a compute buffer bound to the CSMain kernel as "particles".
/// </summary>
private void CreateParticles()
{
    int particleCount = GridSize * GridSize;
    particlesDataArray = new Particle[particleCount];
    particlesList.Capacity = Mathf.Max(particlesList.Capacity, particleCount);

    int index = 0;
    for (int i = 0; i < GridSize; i++)
    {
        for (int j = 0; j < GridSize; j++)
        {
            GameObject createdParticle = Instantiate(prefab);
            createdParticle.transform.position = new Vector3(i * 1f, Random.Range(-1f, 1f), j * 1f);
            particlesList.Add(createdParticle);

            particlesDataArray[index].position = createdParticle.transform.position;
            index++;
        }
    }

    kernelIndex = computeShader.FindKernel("CSMain");
    computeBuffer = new ComputeBuffer(particlesDataArray.Length, ParticleStride);
    computeBuffer.SetData(particlesDataArray);
    computeShader.SetBuffer(kernelIndex, "particles", computeBuffer);
}

/// <summary>
/// Runs the kernel for the current time, reads the results back, and copies the computed
/// heights onto the spawned objects.
/// </summary>
private void UpdateParticlePosition()
{
    computeShader.SetFloat("time", Time.time);

    // One thread per particle; the particle count is expected to be a multiple of ThreadGroupSize.
    computeShader.Dispatch(kernelIndex, particlesDataArray.Length / ThreadGroupSize, 1, 1);

    // NOTE: GetData is a synchronous GPU -> CPU readback; the CPU waits here for the dispatch to finish.
    computeBuffer.GetData(particlesDataArray);

    for (int i = 0; i < particlesDataArray.Length; i++)
    {
        Transform particleTransform = particlesList[i].transform;
        Vector3 pos = particleTransform.position;
        pos.y = particlesDataArray[i].position.y;
        particleTransform.position = pos;
    }
}

Part 2: Compute kernel (GPGPU)

#pragma kernel CSMain

// One element of the particle buffer: 7 floats (28 bytes) per element.
struct Particle {
    float3 position;
    float4 color;
};

// Read/write particle data, bound by the host script under the name "particles".
RWStructuredBuffer<Particle> particles;
// Current time supplied by the host script each frame.
float time;

// Floored modulo: unlike HLSL fmod, the result takes the sign of y.
float mod(float x, float y)
{
    return x - y * floor(x / y);
}

// Permutation polynomial ((34x + 1) * x) mod 289, scalar and vector forms.
// These helpers and taylorInvSqrt are not called by _snoise below.
float  permute(float x) { return floor(mod(((x * 34.0) + 1.0) * x, 289.0)); }
float3 permute(float3 x) { return mod(((x * 34.0) + 1.0) * x, 289.0); }
float4 permute(float4 x) { return mod(((x * 34.0) + 1.0) * x, 289.0); }
// Linear approximation of 1 / sqrt(r).
float taylorInvSqrt(float r) { return 1.79284291400159 - 0.85373472095314 * r; }
float4 taylorInvSqrt(float4 r) { return float4(taylorInvSqrt(r.x), taylorInvSqrt(r.y), taylorInvSqrt(r.z), taylorInvSqrt(r.w)); }

// Hash of a 3D lattice point to a pseudo-random vector with each component in [-0.5, 0.5).
float3 rand3(float3 c) {
    float j = 4096.0 * sin(dot(c, float3(17.0, 59.4, 15.0)));
    float3 r;
    r.z = frac(512.0 * j);
    j *= .125;
    r.x = frac(512.0 * j);
    j *= .125;
    r.y = frac(512.0 * j);
    return r - 0.5;
}

// 3D gradient noise evaluated on a skewed lattice: four corner contributions are
// weighted by a radial falloff and summed.
float _snoise(float3 p) {
    // Skew (F3) and unskew (G3) factors for the 3D lattice.
    const float F3 = 0.3333333;
    const float G3 = 0.1666667;

    // Lattice cell containing p, and p's offset from that cell's origin.
    float3 s = floor(p + dot(p, float3(F3, F3, F3)));
    float3 x = p - s + dot(s, float3(G3, G3, G3));

    // Rank the offset components to choose the second and third corners.
    float3 e = step(float3(0.0, 0.0, 0.0), x - x.yzx);
    float3 i1 = e * (1.0 - e.zxy);
    float3 i2 = 1.0 - e.zxy * (1.0 - e);

    // Offsets from p to the remaining three corners.
    float3 x1 = x - i1 + G3;
    float3 x2 = x - i2 + 2.0 * G3;
    float3 x3 = x - 1.0 + 3.0 * G3;

    float4 w, d;

    // Squared distance to each corner.
    w.x = dot(x, x);
    w.y = dot(x1, x1);
    w.z = dot(x2, x2);
    w.w = dot(x3, x3);

    // Falloff: zero beyond a squared distance of 0.6.
    w = max(0.6 - w, 0.0);

    // Gradient contribution from each corner.
    d.x = dot(rand3(s), x);
    d.y = dot(rand3(s + i1), x1);
    d.z = dot(rand3(s + i2), x2);
    d.w = dot(rand3(s + 1.0), x3);

    // Raise the falloff to the fourth power before weighting the gradients.
    w *= w;
    w *= w;
    d *= w;

    // Sum the four contributions with a fixed scale factor.
    return dot(d, float4(52.0, 52.0, 52.0, 52.0));
}

// One thread per particle: recomputes position.y from noise over (x, z) that scrolls with time.
[numthreads(10, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    // Skip threads past the end of the buffer (when the dispatch is rounded up to whole groups).
    uint particleCount, particleStride;
    particles.GetDimensions(particleCount, particleStride);
    if (id.x >= particleCount)
    {
        return;
    }

    Particle particle = particles[id.x];
    float modifyTime = time / 5.0;
    float positionY = _snoise(float3(particle.position.x / 20.0 + modifyTime, 0.0, particle.position.z / 20.0 + modifyTime)) * 5.0;

    particle.position = float3(particle.position.x, positionY, particle.position.z);
    particles[id.x] = particle;
}

What am I doing wrong, why is there no increase in calculation speed? :)

Thanks in advance!

CodePudding user response:


TL;DR: your GPGPU (compute shader) scenario is unoptimized thus skewing your results. Consider binding a material to the computeBuffer and rendering via Graphics.DrawProcedural. That way everything stays on the GPU.


OP:

What am I doing wrong, why is there no increase in calculation speed?

Essentially, there are two parts to your problem.

(1) Reading from the GPU is slow

With most things GPU-related, you generally want to avoid reading from the GPU since it will block the CPU. This is true also for GPGPU scenarios.

If I were to hazard a guess it would be the GPGPU (compute shader) call computeBuffer.GetData() shown below:

private void Update()
{
    UpdateParticlePosition();
}

private void UpdateParticlePosition()
{
.
.
.
    computeBuffer.GetData(particlesDataArray); // <----- OUCH!

Unity (my emphasis):

ComputeBuffer.GetData

Read data values from the buffer into an array...
Note that this function reads the data back from the GPU, which can be slow... If any GPU work has been submitted that writes to this buffer, Unity waits for the tasks to complete before it retrieves the requested data.

I think everyone can agree that Microsoft's GPGPU documentation is pretty sparse so your best bet is to check out examples scattered around the interwebs. One that comes to mind is the excellent "GPU Ray Tracing in Unity" series over at Three Eyed Games. See the link below.

See also:

CodePudding user response:

ComputeBuffer.GetData is very slow: the CPU copies the data back from the GPU, and this stalls the main thread. You then loop over all the transforms to change their positions; this is certainly faster than thousands of MonoBehaviours, but it is still slow. There are two ways to optimize your code.

CPU

C# Job System + Burst. Detailed tutorial: https://github.com/stella3d/job-system-cookbook

GPU

Use the structured buffer calculated in the compute shader without copying it back to the CPU. Here is a detailed tutorial on how to do it: https://catlikecoding.com/unity/tutorials/basics/compute-shaders/

  • Related