ComputeBuffer.CopyCount is broken

Hello everyone,

I'm trying to create a GPU-driven rendering pipeline. For this I need to use Graphics.DrawMeshInstancedIndirect together with a culling ComputeShader.

However, I'm facing a weird bug when calling ComputeBuffer.CopyCount on append ComputeBuffers (to copy the number of visible indices after frustum culling).

At first I thought the Frustum Culling Compute Shader is corrupting the Indices AppendBuffers, but even without using them in Graphics.DrawMeshInstancedIndirect the problem persists.

Version : Unity 2021.1.3f1 (64-bit)

Code Example:
Note: in this example I'm dispatching the frustum culling shader without using the resulting index append buffers, for debugging purposes.

using CustomHybridRenderer;
using CustomHybridRenderer.Components;
using CustomHybridRenderer.DataModels;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Mathematics;
using UnityEngine;
using UnityEngine.Profiling;

/// <summary>
/// Demo of GPU-driven rendering: every entry of <see cref="BatchPrefabs"/> is one batch whose
/// children are drawn with a single <c>Graphics.DrawMeshInstancedIndirect</c> call, after a
/// frustum-culling compute kernel has been dispatched (indirectly) for that batch.
/// </summary>
public class DrawMeshInstancedIndirectDemo : MonoBehaviour
{
    // ComputeShader.DispatchIndirect reads three uints per dispatch: group counts in X, Y and Z.
    private const int DispatchArgsPerBatch = 3;

    public static System.Random random;

    // Every GameObject in BatchPrefabs is one batch; its children are the instances to draw.
    public GameObject[] BatchPrefabs;

    // AABB culling ComputeShader.
    public ComputeShader CullingShader;

    private Color[] m_Colors;
    private Mesh[] m_Meshs;
    private Material[] m_Materials;
    private uint[] m_InstancesCount;
    private uint[] m_DispatchArgsBufferArray; // CPU-side copy used to fill m_DispatchArgsBuffer

    private ComputeBuffer[] m_VisibleIndicesCB;
    private ComputeBuffer[] m_PropertiesBuffer;
    private ComputeBuffer[] m_BoundsBuffer;
    private ComputeBuffer[] m_ArgsBuffer;
    private ComputeBuffer m_DispatchArgsBuffer; // Read by CullingShader.DispatchIndirect

    private MaterialPropertyBlock propertyBlock;
    private int FrustumCullingKarnelID;

    private Bounds bounds;
    private Camera m_MainCamera;

    // Set by Start so that OnEnable can tell a first enable from a re-enable.
    private bool m_HasStarted;

    // Mesh properties struct to be read from the GPU.
    // Cpu_Size() is a convenience function which returns the stride of the struct.
    private struct TestMeshProperties
    {
        public float4x4 Matrix;
        public float4 Color;
        public static int Cpu_Size()
        {
            return UnsafeUtility.SizeOf<TestMeshProperties>();
        }
    }

    /// <summary>
    /// Rebuilds the GPU resources when the component is re-enabled. Start only runs once,
    /// while OnDisable releases every buffer, so without this a disable/enable cycle would
    /// leave Update working on released buffers.
    /// </summary>
    private void OnEnable()
    {
        if (m_HasStarted)
        {
            Setup();
        }
    }

    /// <summary>Performs the first-time setup.</summary>
    private void Start()
    {
        m_HasStarted = true;
        bounds = new Bounds(Vector3.zero, Vector3.one * 1000);
        Setup();
    }

    /// <summary>
    /// Releases every GPU buffer owned by this component. Safe to call when setup never ran
    /// or when some batches were skipped (their slots are null).
    /// </summary>
    private void OnDisable()
    {
        ReleaseAll(m_VisibleIndicesCB);
        ReleaseAll(m_PropertiesBuffer);
        ReleaseAll(m_BoundsBuffer);
        ReleaseAll(m_ArgsBuffer);

        m_DispatchArgsBuffer?.Release();
        m_DispatchArgsBuffer = null;
    }

    /// <summary>Releases each non-null buffer once and clears its slot.</summary>
    private static void ReleaseAll(ComputeBuffer[] buffers)
    {
        if (buffers == null)
        {
            return;
        }

        for (var i = 0; i < buffers.Length; i++)
        {
            // Release() and Dispose() do the same thing; calling one of them is enough.
            buffers[i]?.Release();
            buffers[i] = null;
        }
    }

    /// <summary>
    /// Builds all per-batch data for <c>BatchPrefabs[parentIndex]</c>: mesh, material, colour,
    /// dispatch arguments, indirect draw arguments and the properties/bounds buffers.
    /// A batch with a missing parent or without children is skipped and its slots stay null.
    /// </summary>
    /// <param name="parentIndex">Index of the batch inside <see cref="BatchPrefabs"/>.</param>
    private void SetupChildAtIndex(int parentIndex)
    {
        var parent = BatchPrefabs[parentIndex];
        if (parent == null)
        {
            Debug.LogWarning($"Parent at Index {parentIndex} is not assigned");
            return;
        }

        var parentTransform = parent.transform;
        if (parentTransform.childCount == 0)
        {
            Debug.LogWarning($"Parent at Index {parentIndex} has no childs");
            return;
        }

        var childs = new Transform[parentTransform.childCount];
        for (var i = 0; i < childs.Length; i++)
        {
            childs[i] = parentTransform.GetChild(i);
        }

        // The first child supplies the colour override, material and mesh for the whole batch.
        m_Colors[parentIndex] = parentTransform.GetComponent<BaseColorURPOverride>().Value;
        m_Materials[parentIndex] = childs[0].GetComponent<MeshRenderer>().sharedMaterial;

        var currentMesh = childs[0].GetComponent<MeshFilter>().sharedMesh;
        m_Meshs[parentIndex] = currentMesh;

        var instanceCount = (uint)childs.Length;
        m_InstancesCount[parentIndex] = instanceCount;

        // Arguments for ComputeShader.DispatchIndirect: thread-group counts in X, Y, Z.
        // X is rounded UP so that the instances after the last full group are culled as well
        // (floor division silently dropped them).
        // NOTE(review): the extra threads of the last group have ids >= instanceCount; this
        // assumes the kernel rejects them - the kernel is not visible here, confirm.
        uint groupSizeX = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_X;
        var dispatchArgsStartIndex = parentIndex * DispatchArgsPerBatch;
        m_DispatchArgsBufferArray[dispatchArgsStartIndex] = (instanceCount + groupSizeX - 1) / groupSizeX;

        // NOTE(review): these two slots are group COUNTS, but the constants are named group
        // SIZES; that is only correct if both constants are 1 - confirm.
        m_DispatchArgsBufferArray[dispatchArgsStartIndex + 1] = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_Y;
        m_DispatchArgsBufferArray[dispatchArgsStartIndex + 2] = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_Z;

        // Byte sizes used for the raw writes below.
        var propertiesByteSize = TestMeshProperties.Cpu_Size();
        var propertiesTotalByteSize = propertiesByteSize * childs.Length;

        var boundsSize = WorldBounds.Cpu_Size();
        var boundsDataBytesize = boundsSize * childs.Length;

        // Init compute buffers.
        m_PropertiesBuffer[parentIndex] = new ComputeBuffer(childs.Length, propertiesByteSize, ComputeBufferType.Structured, ComputeBufferMode.SubUpdates);
        m_BoundsBuffer[parentIndex] = new ComputeBuffer(childs.Length, boundsSize, ComputeBufferType.Structured, ComputeBufferMode.SubUpdates);
        m_VisibleIndicesCB[parentIndex] = new ComputeBuffer(childs.Length, sizeof(uint), ComputeBufferType.Append);

        // Indirect draw arguments (five uints):
        // [0] index count per instance, [1] instance count, [2] start index location,
        // [3] base vertex location, [4] start instance location.
        // The buffer holds exactly ONE such record, so its element count is 1 rather than
        // the number of instances.
        var drawCallArgs = new uint[5] { currentMesh.GetIndexCount(0), instanceCount, currentMesh.GetIndexStart(0), currentMesh.GetBaseVertex(0), 0 };
        m_ArgsBuffer[parentIndex] = new ComputeBuffer(1, drawCallArgs.Length * sizeof(uint), ComputeBufferType.IndirectArguments);
        m_ArgsBuffer[parentIndex].SetData(drawCallArgs);

        var propertiesComputeBufferGPUNativeArray = m_PropertiesBuffer[parentIndex].BeginWrite<byte>(0, propertiesTotalByteSize);
        var boundsComputeBufferGPUNativeArray = m_BoundsBuffer[parentIndex].BeginWrite<byte>(0, boundsDataBytesize);

        var meshLocalBounds = currentMesh.bounds.ToAABB();
        var color = m_Colors[parentIndex];

        unsafe
        {
            var propertiesPtr = (byte*)propertiesComputeBufferGPUNativeArray.GetUnsafePtr();
            var boundsPtr = (byte*)boundsComputeBufferGPUNativeArray.GetUnsafePtr();

            // One properties record (matrix followed by colour) and one world-space bounds
            // record per child.
            for (int i = 0; i < childs.Length; i++)
            {
                var matrix = childs[i].localToWorldMatrix;

                // Write properties: WriteToBuffer returns the position right after the matrix.
                var newBufferPosition = matrix.WriteToBuffer(propertiesPtr + (propertiesByteSize * i));
                color.WriteToBuffer(newBufferPosition);

                // Write bounds.
                var worldBounds = WorldBounds.From(AABB.Transform(matrix, meshLocalBounds));
                worldBounds.WriteToBuffer(boundsPtr + boundsSize * i);
            }
        }

        m_PropertiesBuffer[parentIndex].EndWrite<byte>(propertiesTotalByteSize);
        m_BoundsBuffer[parentIndex].EndWrite<byte>(boundsDataBytesize);
    }

    /// <summary>
    /// Allocates the per-batch arrays, sets up every batch and uploads the dispatch arguments.
    /// </summary>
    private void Setup()
    {
        m_MainCamera = Camera.main;

        var batchCount = BatchPrefabs?.Length ?? 0;

        m_VisibleIndicesCB = new ComputeBuffer[batchCount];
        m_BoundsBuffer = new ComputeBuffer[batchCount];
        m_PropertiesBuffer = new ComputeBuffer[batchCount];
        m_ArgsBuffer = new ComputeBuffer[batchCount];

        m_Meshs = new Mesh[batchCount];
        m_Materials = new Material[batchCount];
        m_Colors = new Color[batchCount];
        m_InstancesCount = new uint[batchCount];
        m_DispatchArgsBufferArray = new uint[batchCount * DispatchArgsPerBatch];

        FrustumCullingKarnelID = CullingShader.FindKernel(CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_KERNEL_NAME);
        propertyBlock = new MaterialPropertyBlock();

        if (batchCount == 0)
        {
            // A ComputeBuffer cannot be created with zero elements; there is nothing to draw.
            Debug.LogWarning("BatchPrefabs is empty, nothing will be rendered");
            return;
        }

        // Set up all batches and their buffers.
        for (var i = 0; i < batchCount; i++)
        {
            SetupChildAtIndex(i);
        }

        m_DispatchArgsBuffer = new ComputeBuffer(batchCount, DispatchArgsPerBatch * sizeof(uint), ComputeBufferType.IndirectArguments);
        m_DispatchArgsBuffer.SetData(m_DispatchArgsBufferArray);
    }

    /// <summary>True when the batch was fully set up (the args buffer is created last).</summary>
    private bool IsBatchReady(int batchIndex)
    {
        return m_ArgsBuffer[batchIndex] != null;
    }

    /// <summary>
    /// Per frame: uploads the camera view-projection matrix, dispatches the culling kernel for
    /// every batch, then issues one indirect instanced draw per batch.
    /// </summary>
    private void Update()
    {
        if (m_Meshs == null)
        {
            return;
        }

        if (m_MainCamera == null)
        {
            // The camera may not have existed during Setup, or may have been destroyed since.
            m_MainCamera = Camera.main;
            if (m_MainCamera == null)
            {
                return;
            }
        }

        var viewProjection = math.mul(m_MainCamera.projectionMatrix, m_MainCamera.worldToCameraMatrix);

        Profiler.BeginSample("Setting Global Matrix");
        //TODO: Set Camera Frustum VPMatrix once for all Compute shaders as it has the same value
        CullingShader.SetMatrix(CustomHybridRanderConstants.ComputeShaderCulling.CameraPlanesPropertyID, viewProjection);
        Profiler.EndSample();

        Profiler.BeginSample("Dispatch Culling Compute shaders");
        for (var i = 0; i < m_Meshs.Length; i++)
        {
            if (!IsBatchReady(i))
            {
                continue;
            }

            // Reset the append counter so this frame's visible indices start from zero.
            m_VisibleIndicesCB[i].SetCounterValue(0);

            CullingShader.SetBuffer(FrustumCullingKarnelID, CustomHybridRanderConstants.ComputeShaderCulling.BoundsBufferPropertyID, m_BoundsBuffer[i]);
            CullingShader.SetBuffer(FrustumCullingKarnelID, CustomHybridRanderConstants.VisibleIndicesPropertyID, m_VisibleIndicesCB[i]);

            // Byte offset of this batch's three group counts inside the dispatch-args buffer.
            var dispatchArgsOffsetBytes = (uint)(i * DispatchArgsPerBatch * sizeof(uint));
            CullingShader.DispatchIndirect(FrustumCullingKarnelID, m_DispatchArgsBuffer, dispatchArgsOffsetBytes);
        }
        Profiler.EndSample();

        Profiler.BeginSample("Execute DrawCalls");
        for (var i = 0; i < m_Meshs.Length; i++)
        {
            if (!IsBatchReady(i))
            {
                continue;
            }

            propertyBlock.Clear();

            Profiler.BeginSample("ComputeBuffer.CopyCount Operation");
            // Disabled for debugging (the draw below renders every instance).
            // When re-enabling, the destination offset must be sizeof(uint) (4 bytes): slot 0
            // of the draw args is the index count and slot 1 is the instance count. Copying
            // the counter to offset 0 overwrites the index count and corrupts the draw.
            // Re-enable it together with the VisibleIndices SetBuffer line below.
            // ComputeBuffer.CopyCount(m_VisibleIndicesCB[i], m_ArgsBuffer[i], sizeof(uint));
            Profiler.EndSample();

            Profiler.BeginSample("propertyBlock.SetBuffer Operation");
            //propertyBlock.SetBuffer(CustomHybridRanderConstants.VisibleIndicesPropertyID, m_VisibleIndicesCB[i]);
            propertyBlock.SetBuffer(CustomHybridRanderConstants.PropertiesBufferPropertyID, m_PropertiesBuffer[i]);
            Profiler.EndSample();

#if UNITY_EDITOR
            // In the editor, draw for every camera (scene view included).
            Graphics.DrawMeshInstancedIndirect(m_Meshs[i], 0, m_Materials[i], bounds, bufferWithArgs: m_ArgsBuffer[i], argsOffset: 0, propertyBlock);
#else
            // In a player build, draw for the main camera only.
            Graphics.DrawMeshInstancedIndirect(m_Meshs[i], 0, m_Materials[i], bounds, bufferWithArgs: m_ArgsBuffer[i], argsOffset: 0, propertyBlock, camera: m_MainCamera);
#endif
        }

        Profiler.EndSample();
    }

}

OnePlus 2 with CopyCount Call:

OnePlus 2 without CopyCount Call:

Samsung S10 with CopyCount Call:

Samsung S10 without CopyCount Call:

Thanks in Advance!

I found the problem:
I was using the ComputeBuffer.CopyCount API incorrectly, passing 0 as dstOffsetBytes instead of 4, the byte offset of the second uint element (the instance count) in the args buffer.
I'll leave this report up so that anyone facing the same problem can find it.

3 Likes