How to work with multiple Kernels sequentially(!) in a ComputeShader on one ComputeBuffer (Kernel 2 requires results from Kernel 1) - and read the final result using AsyncGPUReadback.Request

Dear Unity3d experts,

I have a question regarding ComputeShaders with multiple kernels and one ComputeBuffer.

I am trying to work with multiple Kernels (sequentially) on one ComputeBuffer and want to read out the result asynchronously using AsyncGPUReadback.Request(…) after the last Kernel is done.

The use case is to create float3 position coordinates and noise information in Kernel 1 and store them in a RWStructuredBuffer “TemporaryBuffer”, which is then used as input in Kernel 2 to calculate normals and finally create a RWTexture2D “NormalMap”.

My problem is that the results of Kernel 1 don’t seem to be available in the TemporaryBuffer when I try to work with it in Kernel 2. It seems to be empty or unassigned in Kernel 2,
however I don’t understand why. Kernel 1 seems to do its job just fine, because when I read out the result with AsyncGPUReadback.Request.GetData().ToArray() the positions
and noise information from Kernel 1 are there. So somehow, even though Kernel 1 seems to work fine on the TemporaryBuffer, its output is not available to Kernel 2. Kernel 2 is definitely running too, because I can read out the normal map in grey (so it is running and fills the normal map with (0.5,0.5,0.5,1)).
It’s just that the TemporaryBuffer in Kernel 2 does not have the values written in Kernel 1.

Is there a way to assure this? Thanks a lot for every hint!!!

This is how I implemented it (leaving some parts out to focus on the main issue):

On C# (CPU):

// Output texture: bound to kernel [2] as the RWTexture2D "NormalMapTexture".
// Random write access is enabled so the compute shader can write to it.
patchData.normalMapGPU = new RenderTexture(patchConstants.nPixelsPerEdge, patchConstants.nPixelsPerEdge, 24, RenderTextureFormat.ARGBHalf);
patchData.normalMapGPU.enableRandomWrite = true;
patchData.normalMapGPU.Create();

// Intermediate buffer: written by kernel [1], read by kernel [2].
// Kernel [1] runs over a (WithSkirt x WithSkirt) grid and writes element x + y * WithSkirt,
// so the buffer needs WithSkirt * WithSkirt elements (typically 258 * 258), not WithSkirt.
// With only WithSkirt elements, every index kernel [2] reads (>= WithSkirt + 1) is out of range;
// on Direct3D such reads presumably return zero, which gives the uniform grey normal map.
// Stride: float3 = 12 bytes, float = 4 bytes.
patchData.temporaryBuffer = new ComputeBuffer(patchConstants.nPixelsPerEdgeWithSkirt * patchConstants.nPixelsPerEdgeWithSkirt, 12 + 4, ComputeBufferType.Default);

// Set buffers for kernel [1]
this.shader.SetBuffer(this.kernel[1], "PatchConstantsBuffer", patchConstantsBuffer);
this.shader.SetBuffer(this.kernel[1], "BodyConstantsBuffer", bodyConstantsBuffer);
this.shader.SetBuffer(this.kernel[1], "TemporaryBuffer", patchData.temporaryBuffer);

// Dispatch kernel [1]: one thread group per grid point including the skirt (typically 258,258,1)
this.shader.Dispatch(this.kernel[1], patchConstants.nPixelsPerEdgeWithSkirt, patchConstants.nPixelsPerEdgeWithSkirt, 1);

// Set buffers for kernel [2]; it reads the TemporaryBuffer filled by kernel [1]
this.shader.SetBuffer(this.kernel[2], "PatchConstantsBuffer", patchConstantsBuffer);
this.shader.SetBuffer(this.kernel[2], "TemporaryBuffer", patchData.temporaryBuffer);
this.shader.SetTexture(this.kernel[2], "NormalMapTexture", patchData.normalMapGPU);

// Dispatch kernel [2]: one thread group per output pixel (typically 256,256,1)
this.shader.Dispatch(this.kernel[2], patchConstants.nPixelsPerEdge, patchConstants.nPixelsPerEdge, 1);

// Request the result asynchronously; completion is polled in later frames
patchData.request = AsyncGPUReadback.Request(patchData.normalMapGPU);
//patchData.request = AsyncGPUReadback.Request(patchData.temporaryBuffer); // For test purposes

During later frames I then check if the normal map is done.

if (patchData.request.hasError == true)
        {
            Debug.Log("GPU: GPU readback error detected.");

            // The request has failed, so nothing will consume the intermediate buffer
            // any more; release it here too so the error path does not leak it.
            patchData.temporaryBuffer.Release();
            return;
        }
        else if (patchData.request.done == true)
        {
            // Read back temporary buffer (testing purpose only)
            //TemporaryStruct[] output = patchData.request.GetData<TemporaryStruct>().ToArray();

            // Read back the RenderTexture and copy it into a CPU-side texture.
            // Testing purpose only, as we assign the RenderTexture to the material.
            // ReadPixels reads from RenderTexture.active, so remember the previous
            // target and restore it afterwards instead of leaving our texture bound.
            RenderTexture previousActive = RenderTexture.active;
            RenderTexture.active = patchData.normalMapGPU;
            patchData.normalMapCPU = new Texture2D(patchConstants.nPixelsPerEdge, patchConstants.nPixelsPerEdge, TextureFormat.RGB24, false);
            patchData.normalMapCPU.ReadPixels(new Rect(0, 0, patchData.normalMapGPU.width, patchData.normalMapGPU.height), 0, 0);
            patchData.normalMapCPU.Apply();
            RenderTexture.active = previousActive;

            // Release buffer
            patchData.temporaryBuffer.Release();
        }

In the ComputeShader:

// Per-grid-point record used to move data from the position/noise kernel (CSMain2)
// to the normal-map kernel (CSMain3) through TemporaryBuffer.
// Size is float3 (12 bytes) + float (4 bytes) = 16 bytes, which is the stride the
// C# side passes when it creates the ComputeBuffer.
struct TemporaryStruct
{
    float3 position;   // height-displaced patch coordinate written by CSMain2
    float noise;       // noise value written by CSMain2 alongside the position
};


// Resources bound from the C# side by name (SetBuffer / SetTexture).

// Read-only constant inputs; the kernels below only read element [0].
StructuredBuffer<PatchConstantsStruct> PatchConstantsBuffer;
StructuredBuffer<BodyConstantsStruct> BodyConstantsBuffer;

// Read/write resources: TemporaryBuffer is filled by CSMain2 and read by CSMain3,
// NormalMapTexture receives the output of CSMain3. OutputBuffer is not referenced
// by the two kernels shown below.
RWStructuredBuffer<TemporaryStruct> TemporaryBuffer;
RWStructuredBuffer<OutputStruct> OutputBuffer;
RWTexture2D<float4> NormalMapTexture;



// Kernel CSMain2: computes one grid point per thread and stores its height-displaced
// position together with the noise sample in TemporaryBuffer.
#pragma kernel CSMain2

[numthreads(1, 1, 1)]
void CSMain2(uint3 id : SV_DispatchThreadID)
{
    // Constants live in element 0 of their buffers.
    PatchConstantsStruct patch = PatchConstantsBuffer[0];
    BodyConstantsStruct body = BodyConstantsBuffer[0];

    // Row-major slot of this grid point; the row length is the edge size including the skirt.
    int slot = id.x + id.y * patch.nPerEdgeWithSkirt;

    // Normalized patch coordinate of the grid point, then scaled to 'real world' size.
    float3 normalizedCoord = PatchNormalizedCoord(id.x, id.y, patch.nPerEdge, patch.spacing, patch.eastDirection, patch.northDirection, patch.centerVector);
    float3 worldCoord = normalizedCoord * body.radiusMeter;

    // Octave count grows with the patch level and is limited to the range 0..10.
    int octaveCount = body.octaves + patch.level;
    octaveCount = clamp(octaveCount, 0, 10);
    float noiseValue = FBM(worldCoord, octaveCount, body.frequency, body.amplitude, body.lacunarity, body.persistence);

    // Remap the noise to -1..+1, clamp against overshoot, then scale to
    // -maxHeightMeter..+maxHeightMeter.
    float elevation = clamp((noiseValue * 2) - 1, -1, +1);
    elevation *= body.maxHeightMeter;

    // Displace the point along its normalized coordinate by the elevation.
    worldCoord += normalizedCoord * elevation;

    // Publish the result for the normal-map kernel.
    TemporaryBuffer[slot].position = worldCoord;
    TemporaryBuffer[slot].noise = noiseValue;
}



// Kernel CSMain3: builds the normal map from the positions stored in TemporaryBuffer.
// Each thread handles one output pixel (id.x, id.y) and reads the matching grid point
// plus its four direct neighbours; the +1 offsets skip the one-element skirt border.
#pragma kernel CSMain3

[numthreads(1, 1, 1)]
void CSMain3(uint3 id : SV_DispatchThreadID)
{
    // Get the constants
    PatchConstantsStruct patchConstants = PatchConstantsBuffer[0];

    // Index of this pixel's grid point inside the skirted input grid
    int inBuffOffset = (id.x + 1) + (id.y + 1) * patchConstants.nPerEdgeWithSkirt;

    // Indexes of the four surrounding grid points
    int inBuffOffsetNorth = inBuffOffset + 1 * patchConstants.nPerEdgeWithSkirt;
    int inBuffOffsetEast = inBuffOffset + 1;
    int inBuffOffsetSouth = inBuffOffset - 1 * patchConstants.nPerEdgeWithSkirt;
    int inBuffOffsetWest = inBuffOffset - 1;

    // Load the centre position once instead of re-reading it for every edge
    float3 center = TemporaryBuffer[inBuffOffset].position;

    // Edges to the neighbours and the two face normals they span
    float3 sideA = TemporaryBuffer[inBuffOffsetNorth].position - center;
    float3 sideB = TemporaryBuffer[inBuffOffsetEast].position - center;
    float3 normalForward = cross(sideA, sideB);
    float3 sideC = TemporaryBuffer[inBuffOffsetSouth].position - center;
    float3 sideD = TemporaryBuffer[inBuffOffsetWest].position - center;
    float3 normalBackward = cross(sideC, sideD);
    float3 normal = normalBackward + normalForward;

    // The cross products scale with the squared grid spacing, so the sum must be
    // brought to unit length before the n/2 + 0.5 encoding; otherwise the channels
    // leave the 0..1 range. A zero-length vector is left untouched (encodes as 0.5 grey)
    // to avoid dividing by zero.
    float lengthSquared = dot(normal, normal);
    if (lengthSquared > 0)
    {
        normal *= rsqrt(lengthSquared);
    }

    // Encode into RGB; note the y/z swizzle when packing the channels
    float3 normalRGB = float3(normal.x, normal.z, normal.y) / 2 + float3(0.5f, 0.5f, 0.5f);
    NormalMapTexture[id.xy] = float4(normalRGB, 1);
}

You need to set the buffers (for all kernels) before the first dispatch call.