GPU Frustum Culling Tips

Hello Everyone,

I'm trying to get GPU frustum culling working with Graphics.DrawMeshInstancedIndirect.
I'm following the standard workflow:

// All these steps are done per batch. (Mesh + Material)
1) create and fill a StructuredBuffer to hold the AABB bounds
2) create an appendBuffer to hold visible indices by the Frustum Culling ComputeShader.
3) CopyCount from appendBuffer to ArgsBuffer..
4) Do indirect rendering using Graphics.DrawMeshInstancedIndirect.

The good thing about this approach is that it is very easy to implement, and batches scale well vertically (a single batch can hold many instances).
The ugly part is that it does not scale horizontally: the more batches we have, even if each contains only one element, the more times we must repeat all of the steps above. This not only kills performance but also renders incorrectly, because rendering seems to start without waiting for the appendBuffer to be filled, which is very strange.

Code Examples:

//Rendering Mono

using CustomHybridRenderer;
using CustomHybridRenderer.Components;
using CustomHybridRenderer.DataModels;
using Unity.Collections.LowLevel.Unsafe;
using Unity.Mathematics;
using UnityEngine;
using UnityEngine.Profiling;

/// <summary>
/// Demo component that renders one indirect instanced draw per entry of
/// <see cref="BatchPrefabs"/> (one mesh + material per batch), after dispatching the
/// frustum-culling compute kernel once per batch.
/// </summary>
/// <remarks>
/// All GPU buffers are created in <c>Start</c> and released in <c>OnDisable</c>. They are not
/// recreated when the component is re-enabled; in that case <c>Update</c> does nothing.
/// </remarks>
public class DrawMeshInstancedIndirectDemo : MonoBehaviour
{
    // Number of uints per batch in the indirect draw arguments buffer:
    // index count, instance count, start index, base vertex, start instance.
    private const int DrawArgsPerBatch = 5;

    // Number of uints per batch in the indirect dispatch arguments buffer (x, y, z groups).
    private const int DispatchArgsPerBatch = 3;

    public static System.Random random;

    // Every GameObject in BatchPrefabs is a batch root; its children are the instances.
    public GameObject[] BatchPrefabs;

    // AABB culling compute shader.
    public ComputeShader CullingShader;

    // Per-batch CPU-side data, indexed like BatchPrefabs.
    private Color[] m_Colors;
    private Mesh[] m_Meshs;
    private Material[] m_Materials;
    private uint[] m_InstancesCount;

    // Per-batch GPU buffers, indexed like BatchPrefabs. An entry stays null when the
    // batch root has no children.
    private ComputeBuffer[] m_VisibleIndicesCB;
    private ComputeBuffer[] m_PropertiesBuffer;
    private ComputeBuffer[] m_BoundsBuffer;

    // Indirect draw arguments for all batches, DrawArgsPerBatch uints each.
    private uint[] m_GlobalArgsBufferArray;
    private ComputeBuffer m_GlobalArgsBuffer;

    // Indirect dispatch arguments for all batches, DispatchArgsPerBatch uints each.
    private uint[] m_DispatchArgsBufferArray; // Used to fill m_DispatchArgsBuffer
    private ComputeBuffer m_DispatchArgsBuffer; // Read by CullingShader.DispatchIndirect

    private MaterialPropertyBlock propertyBlock;
    private int FrustumCullingKarnelID;

    // Bounds handed to Graphics.DrawMeshInstancedIndirect for every batch.
    private Bounds bounds;

    private Camera m_MainCamera;

    // Mesh properties struct to be read from the GPU.
    // Cpu_Size() is a convenience function which returns the stride of the struct.
    private struct TestMeshProperties
    {
        public float4x4 Matrix;
        public float4 Color;

        public static int Cpu_Size()
        {
            return UnsafeUtility.SizeOf<TestMeshProperties>();
        }
    }

    private void Start()
    {
        bounds = new Bounds(Vector3.zero, Vector3.one * 1000);
        Setup();
    }

    /// <summary>
    /// Releases every compute buffer owned by this component. Safe to call when setup
    /// never ran or when some batches were skipped (null entries are ignored).
    /// </summary>
    private void OnDisable()
    {
        ReleaseAll(m_VisibleIndicesCB);
        ReleaseAll(m_PropertiesBuffer);
        ReleaseAll(m_BoundsBuffer);

        m_GlobalArgsBuffer?.Release();
        m_GlobalArgsBuffer = null;

        m_DispatchArgsBuffer?.Release();
        m_DispatchArgsBuffer = null;
    }

    // Releases and clears every non-null buffer of the given array; a null array is a no-op.
    private static void ReleaseAll(ComputeBuffer[] buffers)
    {
        if (buffers == null)
        {
            return;
        }

        for (var i = 0; i < buffers.Length; i++)
        {
            // Release() and Dispose() free the same resource, so one call is enough.
            buffers[i]?.Release();
            buffers[i] = null;
        }
    }

    /// <summary>
    /// Allocates the per-batch arrays, fills every batch through
    /// <see cref="SetupChildAtIndex"/> and uploads the draw and dispatch arguments.
    /// </summary>
    private void Setup()
    {
        m_MainCamera = Camera.main;

        var batchCount = BatchPrefabs.Length;

        m_VisibleIndicesCB = new ComputeBuffer[batchCount];
        m_BoundsBuffer = new ComputeBuffer[batchCount];
        m_PropertiesBuffer = new ComputeBuffer[batchCount];

        m_Meshs = new Mesh[batchCount];
        m_Materials = new Material[batchCount];
        m_Colors = new Color[batchCount];
        m_InstancesCount = new uint[batchCount];
        m_DispatchArgsBufferArray = new uint[batchCount * DispatchArgsPerBatch];
        m_GlobalArgsBufferArray = new uint[batchCount * DrawArgsPerBatch];

        FrustumCullingKarnelID = CullingShader.FindKernel(CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_KERNEL_NAME);
        propertyBlock = new MaterialPropertyBlock();

        // Set up all children and per-batch buffers.
        for (var i = 0; i < batchCount; i++)
        {
            SetupChildAtIndex(i);
        }

        m_GlobalArgsBuffer = new ComputeBuffer(batchCount, DrawArgsPerBatch * sizeof(uint), ComputeBufferType.IndirectArguments);
        m_GlobalArgsBuffer.SetData(m_GlobalArgsBufferArray);

        m_DispatchArgsBuffer = new ComputeBuffer(batchCount, DispatchArgsPerBatch * sizeof(uint), ComputeBufferType.IndirectArguments);
        m_DispatchArgsBuffer.SetData(m_DispatchArgsBufferArray);
    }

    /// <summary>
    /// Builds the CPU and GPU data of one batch: mesh, material, colour, instance count,
    /// dispatch/draw arguments, and the properties and bounds buffers of its children.
    /// </summary>
    /// <param name="parentIndex">Index of the batch root inside <see cref="BatchPrefabs"/>.</param>
    /// <remarks>
    /// A batch root without children is skipped with a warning; its buffers stay null and
    /// its argument slots stay zero.
    /// </remarks>
    private void SetupChildAtIndex(int parentIndex)
    {
        var parent = BatchPrefabs[parentIndex];
        var parentTransform = parent.transform;

        if (parentTransform.childCount == 0)
        {
            Debug.LogWarning($"Parent at Index {parentIndex} has no childs");
            return;
        }

        var childs = new Transform[parentTransform.childCount];
        for (var i = 0; i < childs.Length; i++)
        {
            childs[i] = parentTransform.GetChild(i);
        }

        // Colour shared by every instance of the batch.
        m_Colors[parentIndex] = parentTransform.GetComponent<BaseColorURPOverride>().Value;

        // Material and mesh are taken from the first child and used for the whole batch.
        m_Materials[parentIndex] = childs[0].GetComponent<MeshRenderer>().sharedMaterial;

        var currentMesh = childs[0].GetComponent<MeshFilter>().sharedMesh;
        m_Meshs[parentIndex] = currentMesh;

        var instanceCount = (uint)childs.Length;
        m_InstancesCount[parentIndex] = instanceCount;

        // Args for ComputeShader.DispatchIndirect.
        // Round the group count UP: a floor division would leave the last partial group
        // of instances unprocessed (e.g. 100 instances / 64 threads -> 1 group -> 36 lost).
        // NOTE(review): the culling kernel must reject thread ids >= instance count.
        uint groupSizeX = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_X;
        var dispatchArgsStartIndex = parentIndex * DispatchArgsPerBatch;
        m_DispatchArgsBufferArray[dispatchArgsStartIndex] = (instanceCount + groupSizeX - 1u) / groupSizeX;
        m_DispatchArgsBufferArray[dispatchArgsStartIndex + 1] = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_Y;
        m_DispatchArgsBufferArray[dispatchArgsStartIndex + 2] = CustomHybridRanderConstants.ComputeShaderCulling.COMPUTE_SHADER_CULLING_AABB_THREAD_GROUP_SIZE_Z;

        // Byte sizes of one element and of the whole batch, for both buffers.
        var propertiesByteSize = TestMeshProperties.Cpu_Size();
        var propertiesTotalByteSize = propertiesByteSize * childs.Length;

        var boundsSize = WorldBounds.Cpu_Size();
        var boundsDataBytesize = boundsSize * childs.Length;

        // Init compute buffers.
        m_PropertiesBuffer[parentIndex] = new ComputeBuffer(childs.Length, propertiesByteSize, ComputeBufferType.Default, ComputeBufferMode.SubUpdates);
        m_BoundsBuffer[parentIndex] = new ComputeBuffer(childs.Length, boundsSize, ComputeBufferType.Default, ComputeBufferMode.SubUpdates);
        m_VisibleIndicesCB[parentIndex] = new ComputeBuffer(childs.Length, sizeof(uint), ComputeBufferType.Append);

        // Arguments for drawing the mesh:
        // 0 = index count, 1 = instance count, 2 = start index, 3 = base vertex, 4 = start instance.
        var globalArgsArrayStartIndex = parentIndex * DrawArgsPerBatch;
        m_GlobalArgsBufferArray[globalArgsArrayStartIndex] = currentMesh.GetIndexCount(0);
        m_GlobalArgsBufferArray[globalArgsArrayStartIndex + 1] = instanceCount;
        m_GlobalArgsBufferArray[globalArgsArrayStartIndex + 2] = currentMesh.GetIndexStart(0);
        m_GlobalArgsBufferArray[globalArgsArrayStartIndex + 3] = currentMesh.GetBaseVertex(0);
        m_GlobalArgsBufferArray[globalArgsArrayStartIndex + 4] = 0;

        var propertiesComputeBufferGPUNativeArray = m_PropertiesBuffer[parentIndex].BeginWrite<byte>(0, propertiesTotalByteSize);
        var boundsComputeBufferGPUNativeArray = m_BoundsBuffer[parentIndex].BeginWrite<byte>(0, boundsDataBytesize);

        var meshLocalBounds = currentMesh.bounds.ToAABB();
        var color = m_Colors[parentIndex];

        unsafe
        {
            var propertiesPtr = (byte*)propertiesComputeBufferGPUNativeArray.GetUnsafePtr();
            var boundsPtr = (byte*)boundsComputeBufferGPUNativeArray.GetUnsafePtr();

            // One properties element and one bounds element per child.
            for (int i = 0; i < childs.Length; i++)
            {
                var matrix = childs[i].localToWorldMatrix;

                // Properties: matrix first, colour written right after it.
                var newBufferPosition = matrix.WriteToBuffer(propertiesPtr + (propertiesByteSize * i));
                color.WriteToBuffer(newBufferPosition);

                // Bounds: mesh-local bounds transformed by the child's localToWorld matrix.
                var worldBounds = WorldBounds.From(AABB.Transform(matrix, meshLocalBounds));
                worldBounds.WriteToBuffer(boundsPtr + boundsSize * i);
            }
        }

        m_PropertiesBuffer[parentIndex].EndWrite<byte>(propertiesTotalByteSize);
        m_BoundsBuffer[parentIndex].EndWrite<byte>(boundsDataBytesize);
    }

    /// <summary>
    /// Per frame: uploads the camera view-projection matrix, dispatches the culling kernel
    /// for every batch, copies each visible count into the draw arguments and issues one
    /// indirect instanced draw per batch.
    /// </summary>
    private void Update()
    {
        // Nothing to do without a camera, or once OnDisable has released the buffers.
        if (m_MainCamera == null || m_GlobalArgsBuffer == null || m_DispatchArgsBuffer == null)
        {
            return;
        }

        var viewProjection = math.mul(m_MainCamera.projectionMatrix, m_MainCamera.worldToCameraMatrix);

        Profiler.BeginSample("Setting Global Matrix");
        //TODO: Set Camera Frustum VPMatrix once for all Compute shaders as it has the same value
        CullingShader.SetMatrix(CustomHybridRanderConstants.ComputeShaderCulling.CameraPlanesPropertyID, viewProjection);
        Profiler.EndSample();

        Profiler.BeginSample("Dispatch Culling Compute shaders");
        for (var i = 0; i < m_Meshs.Length; i++)
        {
            // Batches without children were skipped during setup and own no buffers.
            if (m_VisibleIndicesCB[i] == null)
            {
                continue;
            }

            // Reset the append counter to zero every frame.
            m_VisibleIndicesCB[i].SetCounterValue(0);

            CullingShader.SetBuffer(FrustumCullingKarnelID, CustomHybridRanderConstants.ComputeShaderCulling.BoundsBufferPropertyID, m_BoundsBuffer[i]);
            CullingShader.SetBuffer(FrustumCullingKarnelID, CustomHybridRanderConstants.VisibleIndicesPropertyID, m_VisibleIndicesCB[i]);

            // Byte offset of this batch's (x, y, z) group counts.
            var dispatchArgsStartIndex = i * DispatchArgsPerBatch * sizeof(uint);
            CullingShader.DispatchIndirect(FrustumCullingKarnelID, m_DispatchArgsBuffer, (uint)dispatchArgsStartIndex);
        }
        Profiler.EndSample();

        Profiler.BeginSample("Execute DrawCalls");
        for (var i = 0; i < m_Meshs.Length; i++)
        {
            if (m_VisibleIndicesCB[i] == null)
            {
                continue;
            }

            // Byte offset of this batch's draw arguments.
            var argsStartIndex = i * DrawArgsPerBatch * sizeof(uint);

            Profiler.BeginSample("ComputeBuffer.CopyCount Operation");
            // Copy the visible count into the instance-count slot (second uint of the batch).
            ComputeBuffer.CopyCount(m_VisibleIndicesCB[i], m_GlobalArgsBuffer, argsStartIndex + sizeof(uint)); //By removing this Line everthing works correctly
            Profiler.EndSample();

            Profiler.BeginSample("propertyBlock.SetBuffer Operation");
            propertyBlock.Clear();
            // NOTE(review): the visible-indices buffer is not bound to the material here. If
            // the draw shader indexes the properties buffer directly by instance id, the
            // culled instance count will select the wrong instances - confirm against the shader.
            //propertyBlock.SetBuffer(CustomHybridRanderConstants.VisibleIndicesPropertyID, m_VisibleIndicesCB[i]);
            propertyBlock.SetBuffer(CustomHybridRanderConstants.PropertiesBufferPropertyID, m_PropertiesBuffer[i]);
            Profiler.EndSample();

#if UNITY_EDITOR
            //Debug.Log($"argsStartIndex: {argsStartIndex}");
            // Execute Rendering Command
            Graphics.DrawMeshInstancedIndirect(m_Meshs[i], 0, m_Materials[i], bounds, bufferWithArgs: m_GlobalArgsBuffer, argsOffset: argsStartIndex, propertyBlock);

#else
            // Execute Rendering Command For the Main Camera Only
            Graphics.DrawMeshInstancedIndirect(m_Meshs[i], 0, m_Materials[i], bounds, bufferWithArgs: m_GlobalArgsBuffer, argsOffset: argsStartIndex, propertyBlock, camera: m_MainCamera);
#endif
        }

        Profiler.EndSample();
    }
}

A Different Approach:
The solution I found is to merge all batches into one global buffer, so steps 1, 2 and 3 are now done once for all batches instead of once per batch, which is a huge improvement.

//--------------------------------------------------------------------------------------
// Pragmas
//--------------------------------------------------------------------------------------

#pragma kernel CSMainAABBCulling
#pragma kernel CSMainCollectSurvivedIndexes

//--------------------------------------------------------------------------------------
// Constants
//--------------------------------------------------------------------------------------
#define THREAD_GROUP_SIZE_X 64
#define THREAD_GROUP_SIZE_Y 1
#define THREAD_GROUP_SIZE_Z 1



//--------------------------------------------------------------------------------------
// Structs
//--------------------------------------------------------------------------------------

//TODO: Make this Struct Global
// Axis-aligned bounding box of one instance, stored as centre + half-size.
// The culling kernel multiplies the corners by _VP_MATRIX directly, so the values are
// presumably already in world space - confirm against the CPU code that fills the buffer.
// The trailing numbers are the running float count of the struct (6 floats in total).
struct AABBData
{
    float3 boundsCenter;         // 3 - box centre
    float3 boundsExtents;        // 6 - half-size along each axis (min = centre - extents)
};


//This Struct is used to correctly index _VISIBILITY_RESULTS while collecting.
// Range of one batch inside the global per-instance buffers: the batch owns the
// elements [StartIndex, StartIndex + Count).
struct IndexingBounds
{
    uint StartIndex;  // index of the batch's first element in the global buffers
    uint Count;       // number of elements in the batch; eliminates useless checks.
};


//--------------------------------------------------------------------------------------
// Constant Buffers
//--------------------------------------------------------------------------------------

//Matrix4x4 v = Camera.main.worldToCameraMatrix;
//Matrix4x4 p = Camera.main.projectionMatrix; //unity C# use opengl standard projection matrix
//cullingComputeShader.SetMatrix("_VP_MATRIX", p * v); //set from C#
/*
cbuffer CB
{
    float4x4 _VP_MATRIX;
    uint _BOUNDS_COUNT;
};
*/

// View-projection matrix applied to the box corners by the culling kernel (set from C#).
float4x4 _VP_MATRIX;

uint _GLOBAL_BOUNDS_BUFFER_COUNT; // Sum of the bounds of all batches, e.g. [A,A,A,A,B,B,C,D,D,E] = 10
uint _TOTAL_BATCH_COUNT; // Number of batches; upper bound for the collect kernel's thread index.

//--------------------------------------------------------------------------------------
// Structured Buffers
//--------------------------------------------------------------------------------------
// Global initial data (read-only)
StructuredBuffer<AABBData> _GLOBAL_BOUNDS_BUFFER;    // bounds from all possible draw calls, one per instance.
StructuredBuffer<IndexingBounds> _PER_BATCH_INDEXING_BOUNDS_BUFFER; // one [start, count) range per batch.

// Global surviving data (written by the kernels)
RWStructuredBuffer<uint> _GLOBAL_CULLING_RESULTS_BUFFER; // per instance: 0 = culled, 1 = visible.
RWStructuredBuffer<uint> _GLOBAL_INDIRECT_ARGUMENTS_BUFFER; // 5 uints per batch; element (batch * 5) + 1 receives the visible count.
RWStructuredBuffer<uint> _GLOBAL_VISIBLE_INDEXES_BUFFER; // ordered batch-local indexes of the visible instances, packed from each batch's StartIndex.

//--------------------------------------------------------------------------------------
// Kernels & Functions
//--------------------------------------------------------------------------------------

// Counts how many of the two box corners (minPos, maxPos) are farther away from pos
// than the length of the box diagonal. Returns 0, 1 or 2.
inline uint IsCameraOutsideObjBounds(float3 pos, float3 minPos, float3 maxPos)
{
    float diagonal = distance(maxPos, minPos);

    uint farFromMax = (distance(pos, maxPos) > diagonal) ? 1 : 0;
    uint farFromMin = (distance(pos, minPos) > diagonal) ? 1 : 0;

    return farFromMax + farFromMin;
}

// Returns 1 when the clip-space point passes the far, left, right, bottom and top
// tests, 0 as soon as one of them fails. The near plane (z < -w) is not tested.
inline uint IsVisibleAfterFrustumCulling(float4 clipPos)
{
    // Far plane.
    if (clipPos.z > clipPos.w)
    {
        return 0;
    }

    // Left / right planes.
    if (clipPos.x < -clipPos.w || clipPos.x > clipPos.w)
    {
        return 0;
    }

    // Bottom / top planes.
    if (clipPos.y < -clipPos.w || clipPos.y > clipPos.w)
    {
        return 0;
    }

    return 1;
}

//--------------------------------------------------------------------------------------
// Kernels
//--------------------------------------------------------------------------------------



//--------------------------------------------------------------------------------------
// Require Setting:
// BUFFERS:  _GLOBAL_BOUNDS_BUFFER,  _GLOBAL_CULLING_RESULTS_BUFFER
// VALUES:  _GLOBAL_BOUNDS_BUFFER_COUNT, _VP_MATRIX
//--------------------------------------------------------------------------------------

//Calculate Frustum Culling.
// Frustum-culls one AABB per thread and stores 1 (visible) or 0 (culled) in
// _GLOBAL_CULLING_RESULTS_BUFFER at the same index.
//
// A box is culled only when all eight corners lie outside the SAME frustum plane.
// Testing "is any corner inside the frustum" instead wrongly culls boxes that overlap
// the frustum while every corner is outside it (large or very close boxes).
// The test is conservative: it can keep a box that is actually outside, but it never
// drops one that is visible. As before, the near plane is not tested.
[numthreads(THREAD_GROUP_SIZE_X, THREAD_GROUP_SIZE_Y, THREAD_GROUP_SIZE_Z)]
inline void CSMainAABBCulling(uint3 id: SV_DispatchThreadID)
{
    uint index = id.x;

    // The dispatch is rounded up to whole thread groups; ignore the surplus threads.
    if (index >= _GLOBAL_BOUNDS_BUFFER_COUNT)
    {
        return;
    }

    // Get the instance AABBData
    AABBData instance = _GLOBAL_BOUNDS_BUFFER[index];

    float3 minPos = instance.boundsCenter - instance.boundsExtents;
    float3 maxPos = instance.boundsCenter + instance.boundsExtents;

    // One bit per plane (far, left, right, bottom, top). A bit stays set only while
    // every corner processed so far is outside that plane.
    uint outsideForAllCorners = 0x1F;

    [unroll]
    for (uint corner = 0; corner < 8; corner++)
    {
        // Bits 2/1/0 of the corner number pick max or min on x/y/z.
        float3 cornerPos = float3(
            (corner & 4u) != 0u ? maxPos.x : minPos.x,
            (corner & 2u) != 0u ? maxPos.y : minPos.y,
            (corner & 1u) != 0u ? maxPos.z : minPos.z);

        float4 clipPos = mul(_VP_MATRIX, float4(cornerPos, 1.0));

        // The plane tests are done in homogeneous clip space, so no perspective divide
        // is needed (and none that could divide by zero).
        uint outsideMask = 0;
        outsideMask |= (clipPos.z > clipPos.w) ? 0x01u : 0u;  // far
        outsideMask |= (clipPos.x < -clipPos.w) ? 0x02u : 0u; // left
        outsideMask |= (clipPos.x > clipPos.w) ? 0x04u : 0u;  // right
        outsideMask |= (clipPos.y < -clipPos.w) ? 0x08u : 0u; // bottom
        outsideMask |= (clipPos.y > clipPos.w) ? 0x10u : 0u;  // top

        outsideForAllCorners &= outsideMask;
    }

    // Visible unless some plane has all eight corners on its outer side.
    _GLOBAL_CULLING_RESULTS_BUFFER[index] = (outsideForAllCorners == 0u) ? 1u : 0u;
}







//--------------------------------------------------------------------------------------
// Require Setting:
// BUFFERS:  _PER_BATCH_INDEXING_BOUNDS_BUFFER,  _GLOBAL_CULLING_RESULTS_BUFFER, _GLOBAL_VISIBLE_INDEXES_BUFFER, _GLOBAL_INDIRECT_ARGUMENTS_BUFFER
// VALUES:  _TOTAL_BATCH_COUNT
//--------------------------------------------------------------------------------------


//CSMainCollectSurvivedIndexes will Collect survived Indexes and Set count to _GLOBAL_INDIRECT_ARGUMENTS_BUFFER on the second element.
// One thread per batch: walks the batch's range of _GLOBAL_CULLING_RESULTS_BUFFER,
// packs the batch-local indexes of the visible instances at the start of the batch's
// range in _GLOBAL_VISIBLE_INDEXES_BUFFER, and writes the visible count into the second
// element of the batch's five draw arguments.
// Must run after the culling kernel has filled _GLOBAL_CULLING_RESULTS_BUFFER.
[numthreads(THREAD_GROUP_SIZE_X, THREAD_GROUP_SIZE_Y, THREAD_GROUP_SIZE_Z)]
inline void CSMainCollectSurvivedIndexes(uint3 id: SV_DispatchThreadID)
{
    uint index = id.x;

    // The dispatch is rounded up to whole thread groups; ignore the surplus threads.
    if (index >= _TOTAL_BATCH_COUNT)
    {
        return;
    }

    IndexingBounds indexingBound = _PER_BATCH_INDEXING_BOUNDS_BUFFER[index];
    uint startIndex = indexingBound.StartIndex;
    uint finalIndex = startIndex + indexingBound.Count;

    uint visibleCount = 0;

    // Unsigned counter: the bounds are uint, so a signed counter would be compared and
    // subtracted through implicit signed/unsigned conversions.
    for (uint i = startIndex; i < finalIndex; i++)
    {
        if (_GLOBAL_CULLING_RESULTS_BUFFER[i] == 1)
        {
            // Store the index relative to the batch start, densely packed.
            _GLOBAL_VISIBLE_INDEXES_BUFFER[startIndex + visibleCount] = i - startIndex;
            visibleCount++;
        }
    }

    // Instance count slot of this batch's draw arguments.
    _GLOBAL_INDIRECT_ARGUMENTS_BUFFER[(index * 5) + 1] = visibleCount;
}

However, the problem with this approach is that it is not vertically scalable: the larger a batch is, the longer the single thread assigned to it takes to collect the frustum-culling results.

do you have any solutions to this?
do you have different approaches?

Any suggestion is welcome

Thanks!!!

You need to parallelize the loop in your CSMainCollectSurvivedIndexes kernel.

Track how many instances of each batch survive your culling kernel, using a RWBuffer with one element per batch type, and InterlockedAdd. (Also, make sure to reset this buffer to zero every frame before the compute shader)

Then dispatch another compute shader, with one thread per batch, and calculate the start+offset for all of the instances of the batch, so they can be grouped together. You can use interlockedAdd here too. So each thread will basically get the total number of surviving instances for a batch, allocate a space in the _GLOBAL_VISIBLE_INDEXES_BUFFER, and store the startOffset+Length in a RWBuffer, which can then be used in your CSMainCollectSurvivedIndexes kernel.

Finally, dispatch your CSMainCollectSurvivedIndexes, with one thread per instance, per batch. First, check if the batch survived culling, if so, get the start+offset of that batch, and write the index. Again, you can use a RWBuffer and interlockedAdd, with one element per batch, so that each batch can track which index it should write to next.

I hope that all makes sense, it's a pretty complex topic and I wish I could explain this process better, but you seem to be on the right track.

The approach that seems to work for this kind of thing is splitting up tasks into multiple layers, and making each layer do a large amount of small/quick tasks. (Which seems to be mostly what you're doing)

Additional notes:
I mention InterlockedAdd a lot, because it's simple and keeps complexity low. However a parallel prefix sum may be much faster, at the expense of complexity. I do have a few million foliage instances rendering at over 100 fps with InterlockedAdd though, so I'm not too concerned currently.

You can also write the total number of surviving instances to an args buffer, and use DispatchIndirect, so you're only dispatching one thread per surviving instance. Again this adds some complexity so I haven't implemented it myself yet, as performance seems good enough without it for now. But if you're aiming for maximum efficiency, you'll probably want to use this too.

I ran into a lot of issues trying to implement this, lots of it was caused by not clearing the interlockedAdd buffers to 0 each frame, and also getting the byte offsets wrong in the argsBuffers, so keep an eye out for that.

Good luck!

3 Likes

I took a similar approach using the InterlockedAdd function. Now CSMainAABBCulling culls, collects the visible indexes, and writes each batch's visible-entity count into the global indirect rendering buffer, which seems to be pretty fast on mobile.

ComputeShader Code:

//--------------------------------------------------------------------------------------
// Includes
//--------------------------------------------------------------------------------------

//--------------------------------------------------------------------------------------
// Pragmas
//--------------------------------------------------------------------------------------
#pragma kernel CSMainAABBCulling
#pragma kernel CSMainArgsBufferReset


//--------------------------------------------------------------------------------------
// Constants
//--------------------------------------------------------------------------------------
#define THREAD_GROUP_SIZE_X 64
#define THREAD_GROUP_SIZE_Y 1
#define THREAD_GROUP_SIZE_Z 1


//--------------------------------------------------------------------------------------
// Structs
//--------------------------------------------------------------------------------------

//TODO: Make this Struct Global
// Axis-aligned bounding box of one instance, stored as centre + half-size.
// The culling kernel multiplies the corners by _VP_MATRIX directly, so the values are
// presumably already in world space - confirm against the CPU code that fills the buffer.
struct AABBData
{
    float3 boundsCenter;   // box centre
    float3 boundsExtents;  // half-size along each axis (min = centre - extents)
   // float4x4 objectTransformMatrix; // disabled field, kept from the original
};


//This struct is used to correctly index batch data per entity (batch index, global start index of the batch).
// Per-entity lookup record read by the culling kernel: which batch the entity belongs
// to and where that batch starts inside the global per-instance buffers.
struct IndexingData
{
    uint BatchIndex;             // selects the batch's five draw arguments (BatchIndex * 5)
    uint BatchGlobalStartIndex;  // index of the batch's first element in the global buffers
};


//--------------------------------------------------------------------------------------
// Constant Buffers
//--------------------------------------------------------------------------------------

//Matrix4x4 v = Camera.main.worldToCameraMatrix;
//Matrix4x4 p = Camera.main.projectionMatrix; //unity C# use opengl standard projection matrix
//cullingComputeShader.SetMatrix("_VP_MATRIX", p * v); //set from C#

// View-projection matrix applied to the box corners by the culling kernel (set from C#).
float4x4 _VP_MATRIX;

uint _GLOBAL_BOUNDS_BUFFER_COUNT; // Sum of the bounds of all batches, e.g. [A,A,A,A,B,B,C,D,D,E] = 10
uint _TOTAL_BATCH_COUNT; // Number of batches; upper bound for the reset kernel's thread index.

//--------------------------------------------------------------------------------------
// Structured Buffers
//--------------------------------------------------------------------------------------
// Global initial data (read-only)
StructuredBuffer<AABBData> _GLOBAL_BOUNDS_BUFFER;    // bounds from all possible draw calls, one per instance.
StructuredBuffer<IndexingData> _PER_ENTITY_INDEXING_DATA_BUFFER; // one record per instance, same indexing as the bounds.

// Global surviving data (written by the kernels)
// ____BATCH___|___BATCH__
RWStructuredBuffer < uint > _GLOBAL_INDIRECT_ARGUMENTS_BUFFER: register(u1); // 5 uints per batch, X = visible count: [-,X,-,-,-,  -,X,-,-,-,....]
RWStructuredBuffer<uint> _GLOBAL_VISIBLE_INDEXES_BUFFER; // batch-local indexes of the visible instances, packed from each batch's start index.

//--------------------------------------------------------------------------------------
// Kernels & Functions
//--------------------------------------------------------------------------------------


// Counts how many of the two box corners (minPos, maxPos) are farther away from pos
// than the length of the box diagonal. Returns 0, 1 or 2.
inline uint IsCameraOutsideObjBounds(float3 pos, float3 minPos, float3 maxPos)
{
    float diagonal = distance(maxPos, minPos);

    uint farFromMax = (distance(pos, maxPos) > diagonal) ? 1 : 0;
    uint farFromMin = (distance(pos, minPos) > diagonal) ? 1 : 0;

    return farFromMax + farFromMin;
}

// Returns 1 when the clip-space point passes the far, left, right, bottom and top
// tests, 0 as soon as one of them fails. The near plane (z < -w) is not tested.
inline uint IsVisibleAfterFrustumCulling(float4 clipPos)
{
    // Far plane.
    if (clipPos.z > clipPos.w)
    {
        return 0;
    }

    // Left / right planes.
    if (clipPos.x < -clipPos.w || clipPos.x > clipPos.w)
    {
        return 0;
    }

    // Bottom / top planes.
    if (clipPos.y < -clipPos.w || clipPos.y > clipPos.w)
    {
        return 0;
    }

    return 1;
}


//--------------------------------------------------------------------------------------
// Kernels
//--------------------------------------------------------------------------------------


//-----------------------------------------------------------------------------------------------------
// Required Variables:
// BUFFERS: _GLOBAL_INDIRECT_ARGUMENTS_BUFFER
// VALUES:  _TOTAL_BATCH_COUNT
//-----------------------------------------------------------------------------------------------------

//Set args buffer count to zero.
// One thread per batch: zeroes the visible-count slot (second of the five draw
// arguments) so the culling kernel can accumulate into it again.
[numthreads(THREAD_GROUP_SIZE_X, THREAD_GROUP_SIZE_Y, THREAD_GROUP_SIZE_Z)]
inline void CSMainArgsBufferReset(uint3 id: SV_DispatchThreadID)
{
    uint batch = id.x;

    // Only threads that map to a real batch write anything.
    if (batch < _TOTAL_BATCH_COUNT)
    {
        _GLOBAL_INDIRECT_ARGUMENTS_BUFFER[(batch * 5) + 1] = 0;
    }
}


//-----------------------------------------------------------------------------------------------------
// Required Variables:
// BUFFERS: _GLOBAL_BOUNDS_BUFFER, _PER_ENTITY_INDEXING_DATA_BUFFER, _GLOBAL_VISIBLE_INDEXES_BUFFER
// VALUES:  _GLOBAL_BOUNDS_BUFFER_COUNT, _VP_MATRIX
//-----------------------------------------------------------------------------------------------------


//Calculate Frustum Culling.
// Frustum-culls one AABB per thread. For every visible instance it atomically bumps the
// visible count of the instance's batch in _GLOBAL_INDIRECT_ARGUMENTS_BUFFER and stores
// the instance's batch-local index in the slot it obtained in
// _GLOBAL_VISIBLE_INDEXES_BUFFER. The count slots must be zero before this kernel runs
// (see CSMainArgsBufferReset).
//
// A box is culled only when all eight corners lie outside the SAME frustum plane.
// Testing "is any corner inside the frustum" instead wrongly culls boxes that overlap
// the frustum while every corner is outside it (large or very close boxes).
// The test is conservative: it can keep a box that is actually outside, but it never
// drops one that is visible. As before, the near plane is not tested.
[numthreads(THREAD_GROUP_SIZE_X, THREAD_GROUP_SIZE_Y, THREAD_GROUP_SIZE_Z)]
inline void CSMainAABBCulling(uint3 id: SV_DispatchThreadID)
{
    uint index = id.x;

    // The dispatch is rounded up to whole thread groups; ignore the surplus threads.
    if (index >= _GLOBAL_BOUNDS_BUFFER_COUNT)
    {
        return;
    }

    // Get the instance AABBData
    AABBData instance = _GLOBAL_BOUNDS_BUFFER[index];

    float3 minPos = instance.boundsCenter - instance.boundsExtents;
    float3 maxPos = instance.boundsCenter + instance.boundsExtents;

    // One bit per plane (far, left, right, bottom, top). A bit stays set only while
    // every corner processed so far is outside that plane.
    uint outsideForAllCorners = 0x1F;

    [unroll]
    for (uint corner = 0; corner < 8; corner++)
    {
        // Bits 2/1/0 of the corner number pick max or min on x/y/z.
        float3 cornerPos = float3(
            (corner & 4u) != 0u ? maxPos.x : minPos.x,
            (corner & 2u) != 0u ? maxPos.y : minPos.y,
            (corner & 1u) != 0u ? maxPos.z : minPos.z);

        float4 clipPos = mul(_VP_MATRIX, float4(cornerPos, 1.0));

        // The plane tests are done in homogeneous clip space, so no perspective divide
        // is needed (and none that could divide by zero).
        uint outsideMask = 0;
        outsideMask |= (clipPos.z > clipPos.w) ? 0x01u : 0u;  // far
        outsideMask |= (clipPos.x < -clipPos.w) ? 0x02u : 0u; // left
        outsideMask |= (clipPos.x > clipPos.w) ? 0x04u : 0u;  // right
        outsideMask |= (clipPos.y < -clipPos.w) ? 0x08u : 0u; // bottom
        outsideMask |= (clipPos.y > clipPos.w) ? 0x10u : 0u;  // top

        outsideForAllCorners &= outsideMask;
    }

    // Culled: some plane has all eight corners on its outer side.
    if (outsideForAllCorners != 0u)
    {
        return;
    }

    IndexingData indexingData = _PER_ENTITY_INDEXING_DATA_BUFFER[index];

    // Instance-count slot of the batch's five draw arguments.
    uint countIndexInArgsBuffer = (indexingData.BatchIndex * 5) + 1;

    // Increment the visible count and receive its previous value, which is this
    // instance's free slot inside the batch. Slots are handed out in whatever order the
    // threads reach the atomic, so the visible indexes are not sorted.
    uint visibleSlot;
    InterlockedAdd(_GLOBAL_INDIRECT_ARGUMENTS_BUFFER[countIndexInArgsBuffer], 1, visibleSlot);

    // Store the batch-local index of this instance in the slot it was given.
    _GLOBAL_VISIBLE_INDEXES_BUFFER[indexingData.BatchGlobalStartIndex + visibleSlot] = index - indexingData.BatchGlobalStartIndex;
}

Is there a way to optimize CSMainAABBCulling further?

2 Likes