Why does the execution of ComputeShaders vary so much in speed?

Hello everyone,

I am trying to compare performance between single-threaded code, multi-threaded code and compute shaders, and something puzzles me.

With compute shaders the execution time varies a lot but I could not understand why. Here is the result of 4 different executions.

The first column is the time in milliseconds to create the array, the second is the time on the main thread (single-threaded), the third is the time for multi-threading and the last is the time for the compute shader.

As you can see, the time for the last one varies a lot compared to the rest. Sometimes it performs better than multi-threading and sometimes worse.

It seems that this is not very consistent.

Here is the C# code:

/// <summary>
/// Benchmark component that fills a float array with Perlin noise three ways
/// (single-threaded CPU, <c>Parallel.For</c>, and a compute shader) and logs
/// how long each approach takes in milliseconds.
/// </summary>
public class ArrayGenerator : MonoBehaviour
{
    // 67107840 == 65535 * 1024; the kernel declares numthreads(1024, 1, 1).
    // NOTE(review): presumably chosen so a 1-D dispatch stays within the
    // per-dimension thread-group limit - confirm against ComputeHelper.Dispatch.
    [SerializeField] int arraySize = 67107840;

    // Row width of the virtual 2D grid used to turn a linear index into (x, y).
    int resolution;

    // Destination for every benchmark variant; reallocated on each run.
    float[] array;

    // Loaded lazily on the first GPU run and reused afterwards.
    ComputeShader computeShader;

    /// <summary>
    /// Runs all benchmark variants once and logs the elapsed milliseconds of each.
    /// Exposed in the component's context menu as "Update Array".
    /// </summary>
    [ContextMenu("Update Array")]
    void UpdateArray()
    {
        // A non-positive size would make the array allocation throw (negative)
        // or leave nothing to measure (zero), so bail out with a clear message.
        if (arraySize <= 0)
        {
            Debug.LogError($"arraySize must be positive, but was {arraySize}.");
            return;
        }

        double arrayCreation = CalculateOperationMilliseconds(() =>
        {
            array = new float[arraySize];
        });

        // Smallest-ish square grid that can hold arraySize elements.
        resolution = (int)Mathf.Sqrt(arraySize) + 1;

        double mainThread = CalculateOperationMilliseconds(CPU_MainThread);
        double multiThreads = CalculateOperationMilliseconds(CPU_MultiThreads);
        double gpu = CalculateOperationMilliseconds(GPU);

        // The values are Stopwatch.Elapsed.TotalMilliseconds, not ticks.
        Debug.Log($"Elapsed ms (create / main thread / multi-thread / GPU): {arrayCreation} / {mainThread} / {multiThreads} / {gpu}");
    }

    /// <summary>Fills the array sequentially on the calling thread.</summary>
    void CPU_MainThread()
    {
        for (int i = 0; i < arraySize; i++)
        {
            int x = i % resolution;
            int y = i / resolution;
            // NOTE(review): x and y are always whole numbers here; confirm that
            // sampling the noise only at integer coordinates is intended.
            array[i] = Mathf.PerlinNoise(x, y);
        }
    }

    /// <summary>Fills the array with <c>Parallel.For</c>; each index is written exactly once.</summary>
    void CPU_MultiThreads()
    {
        Parallel.For(0, arraySize, i =>
        {
            int x = i % resolution;
            int y = i / resolution;
            array[i] = Mathf.PerlinNoise(x, y);
        });
    }

    /// <summary>
    /// Fills the array by dispatching the "CSMain" kernel and reading the result back.
    /// </summary>
    /// <remarks>
    /// Everything in this method is inside the measured interval: the (first-run)
    /// shader load, the kernel lookup, the buffer allocation, the dispatch and the
    /// read-back into managed memory. The reported figure is therefore not the
    /// kernel execution time alone.
    /// NOTE(review): GetData is presumably a synchronous read-back that waits for
    /// the GPU - confirm in the Unity documentation when interpreting the timings.
    /// </remarks>
    void GPU()
    {
        // Keep the == operator: Unity overloads it for destroyed objects.
        if (computeShader == null)
        {
            computeShader = ComputeHelper.LoadComputeShader("ArrayOperation");
        }

        int kernelIndex = computeShader.FindKernel("CSMain");

        ComputeBuffer arrayBuffer = ComputeHelper.CreateBuffer<float>(arraySize);
        try
        {
            computeShader.SetBuffer(kernelIndex, "array", arrayBuffer);

            computeShader.SetInt("arraySize", arraySize);
            computeShader.SetInt("resolution", resolution);

            ComputeHelper.Dispatch(computeShader, kernelIndex, arraySize, 1, 1);

            arrayBuffer.GetData(array);
        }
        finally
        {
            // Release the GPU buffer even if binding, dispatch or read-back throws.
            arrayBuffer.Release();
        }
    }

    /// <summary>
    /// Runs <paramref name="operation"/> once and measures how long it took.
    /// </summary>
    /// <param name="operation">The work to time.</param>
    /// <returns>Elapsed wall-clock time in milliseconds.</returns>
    public static double CalculateOperationMilliseconds(Action operation)
    {
        Stopwatch watch = Stopwatch.StartNew();

        operation.Invoke();

        // Stop before reading so the result is a fixed snapshot.
        watch.Stop();
        return watch.Elapsed.TotalMilliseconds;
    }
}

Here is the compute shader:

// Exposes CSMain as a compute kernel; the C# side looks it up by this name.
#pragma kernel CSMain

// Output buffer written by CSMain; bound from C# under the name "array".
RWStructuredBuffer<float> array;

// Number of elements to fill; threads whose id.x is at or beyond this value exit early.
uint arraySize;
// Row width used to convert between a linear index and (x, y) coordinates.
uint resolution;

// Flattens a dispatch-thread id into a linear buffer offset, treating
// `resolution` as the row width (id.y selects the row, id.x the column).
uint GetIndex(uint3 id)
{
    uint rowStart = id.y * resolution;
    return rowStart + id.x;
}

// Hashes a lattice point into a unit-length gradient direction.
// The arithmetic order is kept exactly as-is so the hash values do not change.
float2 unity_gradientNoise_dir(float2 p)
{
    float2 cell = p % 289;

    // Two rounds of the (34 * v + 1) * v mod 289 permutation, mixing in cell.y.
    float hash = (34 * cell.x + 1) * cell.x % 289 + cell.y;
    hash = (34 * hash + 1) * hash % 289;

    // Map the hash into [-1, 1).
    hash = frac(hash / 41) * 2 - 1;

    float2 gradient = float2(hash - floor(hash + 0.5), abs(hash) - 0.5);
    return normalize(gradient);
}

// 2D gradient noise: blends the contributions of the four lattice corners
// surrounding p using a quintic fade curve.
float unity_gradientNoise(float2 p)
{
    float2 cell = floor(p);
    float2 offset = frac(p);

    // Dot product of each corner's gradient with the vector from that corner to p.
    float corner00 = dot(unity_gradientNoise_dir(cell), offset);
    float corner01 = dot(unity_gradientNoise_dir(cell + float2(0, 1)), offset - float2(0, 1));
    float corner10 = dot(unity_gradientNoise_dir(cell + float2(1, 0)), offset - float2(1, 0));
    float corner11 = dot(unity_gradientNoise_dir(cell + float2(1, 1)), offset - float2(1, 1));

    // Quintic fade: 6t^5 - 15t^4 + 10t^3.
    float2 fade = offset * offset * offset * (offset * (offset * 6 - 15) + 10);

    float left = lerp(corner00, corner01, fade.y);
    float right = lerp(corner10, corner11, fade.y);
    return lerp(left, right, fade.x);
}

// Gradient noise at UV, shifted up by 0.5.
float PerlinNoise(float2 UV)
{
    float centered = unity_gradientNoise(UV);
    return centered + 0.5;
}

// Kernel entry point: each thread writes one noise sample into `array`.
// For a 1-D dispatch (id.y == 0) the written index equals id.x.
[numthreads(1024, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    uint index = GetIndex(id);

    // Guard on the element actually written, so the check and the store
    // always agree (threads past the end of the buffer do nothing).
    if (index >= arraySize)
        return;

    // Derive the sample coordinates from the same index that is written;
    // kept unsigned to avoid mixing int and uint arithmetic.
    uint x = index % resolution;
    uint y = index / resolution;

    // Note: x and y are whole numbers, so frac() inside the noise function is
    // zero and every sample evaluates at a lattice point.
    array[index] = PerlinNoise(float2(x, y));
}