GPU Instanced Deferred Lights

Hello,

In our game, we put lights on our enemy and friendly shots so that they highlight the environments and the other players. A very nice effect that obviously has the drawback of adding dozens of small lights to the scene. Using the deferred shader, that shouldn’t be a problem right? It’s the whole point of the thing after all.

As it turns out, Unity doesn’t use GPU instancing to render point lights in the deferred renderer. This is problematic on the Switch as those dozens of draw calls have a very high performance impact on the render thread. To solve the issue, I wrote a custom light renderer that uses instancing so that there is just one draw call for all the lights.

This works great, except for one thing: it lights all the pixels instead of just the necessary pixels as the original light shader of unity does. This has obviously a huge performance impact. My problem is that I have no idea which values I should use for the Stencil test. I tried the ones I found in the regular deferred shader:

            // Stencil test tried for the instanced light pass. A pixel is kept only when
            // (stencil buffer value & ReadMask) equals (Ref & ReadMask); the same comparison
            // is configured for back-facing and front-facing triangles.
            Stencil
            {
                // Ref and ReadMask both come from the _StencilNonBackground property rather
                // than from literals. If that property resolves to 0, the mask is 0 and the
                // Equal comparison passes for every pixel.
                Ref [_StencilNonBackground]
                ReadMask [_StencilNonBackground]
                CompBack Equal
                CompFront Equal
            }

They seem to work in the editor but not on the Switch. Any idea how to get the right stencil values ?

Editor: (stencil ref is 128, looks correct)

Switch: (stencil ref is 0, probably wrong; it takes up to 2ms to render, so very probably wrong)

Shader:

// Additive point-light volume shader for the deferred path, drawn with GPU instancing.
// Every instance is one light: the instance transform places and scales the volume mesh,
// and the per-instance _Color carries rgb = light colour and w = 1 / (range * range).
Shader "CustomLights/InstancedLight"
{
    SubShader
    {
        Tags { "Queue" = "Transparent-1" }

        Pass
        {
            Fog { Mode Off }
            ZWrite Off
            ZTest LEqual
            Blend One One   // additive: each light adds onto what is already in the target
            Cull Back

            // NOTE(review): this shader declares no Properties block, so
            // _StencilNonBackground has to be supplied from outside the shader. It was
            // reported as 128 in the editor and 0 on Switch; with Ref 0 / ReadMask 0 the
            // Equal comparison passes for every pixel, so nothing is rejected.
            Stencil
            {
                Ref [_StencilNonBackground]
                ReadMask [_StencilNonBackground]
                CompBack Equal
                CompFront Equal
            }

            CGPROGRAM
            #pragma target 3.0
            #pragma vertex vert_deferred_instanced
            #pragma fragment frag
            #pragma multi_compile_lightpass
            // NOTE(review): UNITY_HDR_ON is listed on the multi_compile_instancing line -
            // confirm this yields the intended HDR variant.
            #pragma multi_compile_instancing UNITY_HDR_ON
            #pragma instancing_options nolodfade nolightprobe nolightmap

            #pragma exclude_renderers nomrt

            #include "UnityCG.cginc"
            #include "UnityDeferredLibrary.cginc"
            #include "UnityPBSLighting.cginc"
            #include "UnityStandardUtils.cginc"
            #include "UnityGBuffer.cginc"
            #include "UnityStandardBRDF.cginc"

            sampler2D _CameraGBufferTexture0;
            sampler2D _CameraGBufferTexture1;
            sampler2D _CameraGBufferTexture2;

            // Per-instance light data. Declared float4 (not fixed4) because w holds
            // 1 / (range * range) and rgb may exceed 1, neither of which fits a
            // low-precision fixed-point type.
            UNITY_INSTANCING_BUFFER_START(Props)
                UNITY_DEFINE_INSTANCED_PROP(float4, _Color)
            UNITY_INSTANCING_BUFFER_END(Props)

            struct appdata
            {
                float4 vertex : POSITION;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            struct unity_v2f_deferred_instanced {
                float4 pos : SV_POSITION;
                float4 uv : TEXCOORD0;
                float3 ray : TEXCOORD1;
                // Passed through a TEXCOORD interpolator at full precision: COLOR
                // interpolators are meant for low-precision 0..1 data and could clamp
                // the light colour or the inverse squared range.
                float4 color : TEXCOORD2;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            // Vertex stage: projects the light volume and forwards the screen position,
            // the view ray and the per-instance light data to the fragment stage.
            unity_v2f_deferred_instanced vert_deferred_instanced(appdata v)
            {
                unity_v2f_deferred_instanced o;

                UNITY_SETUP_INSTANCE_ID(v);
                UNITY_TRANSFER_INSTANCE_ID(v, o); // the fragment stage needs the instance id too

                o.pos = UnityObjectToClipPos(v.vertex);
                o.uv = ComputeScreenPos(o.pos);
                // View-space vertex position with x and y negated; rescaled per pixel below.
                o.ray = UnityObjectToViewPos(v.vertex) * float3(-1, -1, 1);
                o.color = UNITY_ACCESS_INSTANCED_PROP(Props, _Color);

                return o;
            }

            // Common lighting data calculation (direction, attenuation, ...).
            //   i            interpolated vertex output for this pixel
            //   outWorldPos  world position reconstructed from the camera depth texture
            //   outUV        screen-space uv used to sample depth and the G-buffer
            //   outLightDir  normalized direction from the surface towards the light
            //   outAtten     distance attenuation multiplied by the shadow term
            //   outFadeDist  shadow fade distance for the reconstructed position
            void DeferredCalculateLightParams(
                unity_v2f_deferred_instanced i,
                out float3 outWorldPos,
                out float2 outUV,
                out half3 outLightDir,
                out float outAtten,
                out float outFadeDist)
            {
                // Rescale the ray so that its z component equals _ProjectionParams.z.
                i.ray = i.ray * (_ProjectionParams.z / i.ray.z);
                float2 uv = i.uv.xy / i.uv.w;

                // read depth and reconstruct world position
                float depth = SAMPLE_DEPTH_TEXTURE(_CameraDepthTexture, uv);
                depth = Linear01Depth(depth);
                float4 vpos = float4(i.ray * depth, 1);
                float3 wpos = mul(unity_CameraToWorld, vpos).xyz;

                float fadeDist = UnityComputeShadowFadeDistance(wpos, vpos.z);

                // The light sits at the translation column of the instance's object-to-world matrix.
                float3 lightPos = float3(unity_ObjectToWorld[0][3], unity_ObjectToWorld[1][3], unity_ObjectToWorld[2][3]);
                float3 tolight = wpos - lightPos;
                half3 lightDir = -normalize(tolight);

                float att = dot(tolight, tolight) * i.color.w; // color.w is inversed squared range. ie 1/(r*r)
                // NOTE(review): _LightTextureB0 is not declared or assigned in this file -
                // confirm it is bound when this pass is drawn.
                float atten = tex2D(_LightTextureB0, att.rr).r;

                atten *= UnityDeferredComputeShadow(tolight, fadeDist, uv);

                outWorldPos = wpos;
                outUV = uv;
                outLightDir = lightDir;
                outAtten = atten;
                outFadeDist = fadeDist;
            }

            // Fragment stage: evaluates the BRDF for this light against the G-buffer
            // contents of the covered pixel. Indirect lighting is zeroed, so only the
            // direct contribution of this light is returned (and blended additively).
            half4 frag(unity_v2f_deferred_instanced i) : SV_Target
            {
                // Must run before unity_ObjectToWorld is read for the light position.
                UNITY_SETUP_INSTANCE_ID(i);

                float3 wpos;
                float2 uv;
                float atten, fadeDist;

                UnityLight light;
                UNITY_INITIALIZE_OUTPUT(UnityLight, light);

                DeferredCalculateLightParams(i, wpos, uv, light.dir, atten, fadeDist);

                light.color = i.color.rgb * atten;

                // unpack Gbuffer
                half4 gbuffer0 = tex2D(_CameraGBufferTexture0, uv);
                half4 gbuffer1 = tex2D(_CameraGBufferTexture1, uv);
                half4 gbuffer2 = tex2D(_CameraGBufferTexture2, uv);
                UnityStandardData data = UnityStandardDataFromGbuffer(gbuffer0, gbuffer1, gbuffer2);

                float3 eyeVec = normalize(wpos - _WorldSpaceCameraPos);
                half oneMinusReflectivity = 1 - SpecularStrength(data.specularColor.rgb);

                UnityIndirect ind;
                UNITY_INITIALIZE_OUTPUT(UnityIndirect, ind);
                ind.diffuse = 0;
                ind.specular = 0;

                return UNITY_BRDF_PBS(data.diffuseColor, data.specularColor, oneMinusReflectivity,
                    data.smoothness, data.normalWorld, -eyeVec, light, ind);
            }

            ENDCG
        }
    }
}

What you have isn’t working correctly. Not even on desktop. There’s a reason why the lights aren’t instanced. The light’s volume geometry has to be rendered twice, with a different shader each time. The idea is to only shade pixels within the light’s volume.

First you render the front faces and update the stencil buffer for every visible pixel. Then you render the back faces and use the stencil from the previous draw call and the z-buffer to isolate the pixels within the light’s volume. And then finally you shade only those remaining pixels. There are multiple variants of this technique. Here’s one of them explained in more detail. I don’t know exactly which technique Unity is using, but the basic idea is the same for all of them. You have to render the mesh twice. You can’t use instancing. Your solution can work visually but it will always be inefficient. You’re shading pixels outside of the light’s volume.

This technique is at least a decade old and nobody is using it anymore. There are far more efficient methods, but they are also more difficult to write. Look for “Tile-based deferred shading” or “Clustered deferred shading”. Both were specifically designed to handle large numbers of lights.

Your technique is interesting. I don’t see why it couldn’t work with instancing though, we could totally do two instanced draw calls.

However, this is not beneficial here if I understand correctly. Unity already marks pixels that should be lit in the stencil buffer. Since most of our scenes are lightmapped, that’s not that many pixels. So when we render the sphere of our instanced lights, all we need is to shade those marked pixels, and not all the pixels of the sphere.

This is all we need to achieve good performance here.

  • We don’t handle all the weird cases of camera in volume, etc, just because we don’t have them :slight_smile:

That would work only if there are no collisions of the volumes in screen space.

Yes, and they are doing it by rendering the volume twice. See the picture. There is only one light in the scene but two draw calls of the same geometry.

4972961--484520--Screenshot_147.png

Funny, that stencil write shader already has instancing support.

Might not work in every case, but I got away with this:

Shader "CustomLights/InstancedLight"
{
    SubShader
    {
        Tags { "Queue" = "Transparent-1" }

        // Stencil is marked with bit 4 (value 16) just like Unity does
        // https://docs.unity3d.com/Manual/SL-Stencil.html Deferred rendering path

        Pass // mark stencil bit 4 for geometry inside volume
        {
            // Stencil-only pass: nothing is written to colour or depth.
            ColorMask 0
            ZWrite Off
            // Back faces of the volume are rasterized; where one of them fails the depth
            // test, bit 4 (value 16) of the stencil buffer is set.
            Cull Front
            ZTest LEqual
            Stencil
            {
                WriteMask 16
                Ref 16
                ZFail Replace
            }

            CGPROGRAM
            #pragma target 2.0
            #pragma multi_compile_instancing
            #pragma instancing_options nolodfade nolightprobe nolightmap
            #pragma vertex vert
            #pragma fragment frag

            #include "UnityCG.cginc"

            // Mesh vertex of the light volume plus the instance id.
            struct VolumeVertexIn
            {
                float4 pos : POSITION;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            // Only the clip-space position is needed for a stencil-only pass.
            struct VolumeVertexOut
            {
                float4 vertex : SV_POSITION;
                UNITY_VERTEX_OUTPUT_STEREO
            };

            // Projects the instanced volume vertex into clip space.
            VolumeVertexOut vert(VolumeVertexIn vin)
            {
                UNITY_SETUP_INSTANCE_ID(vin);

                VolumeVertexOut vout;
                UNITY_INITIALIZE_VERTEX_OUTPUT_STEREO(vout);
                vout.vertex = UnityObjectToClipPos(vin.pos);
                return vout;
            }

            // Colour writes are masked off above, so the returned value is never stored.
            fixed4 frag() : SV_Target
            {
                return 0;
            }
            ENDCG
        }

        Pass // additive lighting on pixels whose stencil bit 4 was set by the marking pass
        {
            Fog { Mode Off }
            ZTest Less
            ZWrite Off
            Blend One One   // additive: each light adds onto what is already in the target
            Cull Back
            // Only bit 4 (value 16) is tested here. The test does not look at any other
            // stencil bit, so it does not by itself distinguish lightmapped pixels.
            Stencil
            {
                Ref 16
                ReadMask 16
                Comp Equal
            }

            CGPROGRAM
            #pragma target 3.0
            #pragma vertex vert_deferred_instanced
            #pragma fragment frag
            #pragma multi_compile_lightpass
            // NOTE(review): UNITY_HDR_ON is listed on the multi_compile_instancing line -
            // confirm this yields the intended HDR variant.
            #pragma multi_compile_instancing UNITY_HDR_ON
            #pragma instancing_options nolodfade nolightprobe nolightmap

            #pragma exclude_renderers nomrt

            #include "UnityCG.cginc"
            #include "UnityDeferredLibrary.cginc"
            #include "UnityPBSLighting.cginc"
            #include "UnityStandardUtils.cginc"
            #include "UnityGBuffer.cginc"
            #include "UnityStandardBRDF.cginc"

            sampler2D _CameraGBufferTexture0;
            sampler2D _CameraGBufferTexture1;
            sampler2D _CameraGBufferTexture2;

            // Per-instance light data: rgb = light colour, w = 1 / (range * range).
            // Declared float4 (not fixed4) because w and HDR colours can exceed the
            // range of a low-precision fixed-point type.
            UNITY_INSTANCING_BUFFER_START(Props)
                UNITY_DEFINE_INSTANCED_PROP(float4, _Color)
            UNITY_INSTANCING_BUFFER_END(Props)

            struct appdata
            {
                float4 vertex : POSITION;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            struct unity_v2f_deferred_instanced {
                float4 pos : SV_POSITION;
                float4 uv : TEXCOORD0;
                float3 ray : TEXCOORD1;
                // Passed through a TEXCOORD interpolator at full precision: COLOR
                // interpolators are meant for low-precision 0..1 data and could clamp
                // the light colour or the inverse squared range.
                float4 color : TEXCOORD2;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            // Vertex stage: projects the light volume and forwards the screen position,
            // the view ray and the per-instance light data to the fragment stage.
            unity_v2f_deferred_instanced vert_deferred_instanced(appdata v)
            {
                unity_v2f_deferred_instanced o;

                UNITY_SETUP_INSTANCE_ID(v);
                UNITY_TRANSFER_INSTANCE_ID(v, o); // the fragment stage needs the instance id too

                o.pos = UnityObjectToClipPos(v.vertex);
                o.uv = ComputeScreenPos(o.pos);
                // View-space vertex position with x and y negated; rescaled per pixel below.
                o.ray = UnityObjectToViewPos(v.vertex) * float3(-1, -1, 1);
                o.color = UNITY_ACCESS_INSTANCED_PROP(Props, _Color);

                return o;
            }

            // Common lighting data calculation (direction, attenuation, ...).
            //   i            interpolated vertex output for this pixel
            //   outWorldPos  world position reconstructed from the camera depth texture
            //   outUV        screen-space uv used to sample depth and the G-buffer
            //   outLightDir  normalized direction from the surface towards the light
            //   outAtten     distance attenuation multiplied by the shadow term
            //   outFadeDist  shadow fade distance for the reconstructed position
            void DeferredCalculateLightParams(
                unity_v2f_deferred_instanced i,
                out float3 outWorldPos,
                out float2 outUV,
                out half3 outLightDir,
                out float outAtten,
                out float outFadeDist)
            {
                // Rescale the ray so that its z component equals _ProjectionParams.z.
                i.ray = i.ray * (_ProjectionParams.z / i.ray.z);
                float2 uv = i.uv.xy / i.uv.w;

                // read depth and reconstruct world position
                float depth = SAMPLE_DEPTH_TEXTURE(_CameraDepthTexture, uv);
                depth = Linear01Depth(depth);
                float4 vpos = float4(i.ray * depth, 1);
                float3 wpos = mul(unity_CameraToWorld, vpos).xyz;

                float fadeDist = UnityComputeShadowFadeDistance(wpos, vpos.z);

                // The light sits at the translation column of the instance's object-to-world matrix.
                float3 lightPos = float3(unity_ObjectToWorld[0][3], unity_ObjectToWorld[1][3], unity_ObjectToWorld[2][3]);
                float3 tolight = wpos - lightPos;
                half3 lightDir = -normalize(tolight);

                float att = dot(tolight, tolight) * i.color.w; // color.w is inversed squared range. ie 1/(r*r)
                // NOTE(review): _LightTextureB0 is not declared or assigned in this file -
                // confirm it is bound when this pass is drawn.
                float atten = tex2D(_LightTextureB0, att.rr).r;

                atten *= UnityDeferredComputeShadow(tolight, fadeDist, uv);

                outWorldPos = wpos;
                outUV = uv;
                outLightDir = lightDir;
                outAtten = atten;
                outFadeDist = fadeDist;
            }

            // Fragment stage: evaluates the BRDF for this light against the G-buffer
            // contents of the covered pixel. Indirect lighting is zeroed, so only the
            // direct contribution of this light is returned (and blended additively).
            half4 frag(unity_v2f_deferred_instanced i) : SV_Target
            {
                // Must run before unity_ObjectToWorld is read for the light position.
                UNITY_SETUP_INSTANCE_ID(i);

                float3 wpos;
                float2 uv;
                float atten, fadeDist;

                UnityLight light;
                UNITY_INITIALIZE_OUTPUT(UnityLight, light);

                DeferredCalculateLightParams(i, wpos, uv, light.dir, atten, fadeDist);

                light.color = i.color.rgb * atten;

                // unpack Gbuffer
                half4 gbuffer0 = tex2D(_CameraGBufferTexture0, uv);
                half4 gbuffer1 = tex2D(_CameraGBufferTexture1, uv);
                half4 gbuffer2 = tex2D(_CameraGBufferTexture2, uv);
                UnityStandardData data = UnityStandardDataFromGbuffer(gbuffer0, gbuffer1, gbuffer2);

                float3 eyeVec = normalize(wpos - _WorldSpaceCameraPos);
                half oneMinusReflectivity = 1 - SpecularStrength(data.specularColor.rgb);

                UnityIndirect ind;
                UNITY_INITIALIZE_OUTPUT(UnityIndirect, ind);
                ind.diffuse = 0;
                ind.specular = 0;

                return UNITY_BRDF_PBS(data.diffuseColor, data.specularColor, oneMinusReflectivity,
                    data.smoothness, data.normalWorld, -eyeVec, light, ind);
            }

            ENDCG
        }

        Pass // set stencil bit 4 back to zero
        {
            Cull Front
            ZTest LEqual
            ZWrite Off
            ColorMask 0
            // The marking pass sets bit 4 where a back face FAILS the depth test, so the
            // reset has to clear on ZFail as well. Clearing only on Pass (depth test
            // succeeded) would touch exactly the pixels that were never marked and leave
            // the marked ones set. With both operations, bit 4 is cleared for every pixel
            // covered by a back face of the volume, whatever the depth result.
            Stencil
            {
                WriteMask 16
                Pass Zero
                ZFail Zero
            }

            CGPROGRAM
            #pragma vertex vert
            #pragma fragment frag
            #pragma target 2.0
            #pragma multi_compile_instancing
            #pragma instancing_options nolodfade nolightprobe nolightmap

            #include "UnityCG.cginc"

            // Mesh vertex of the light volume plus the instance id.
            struct a2v {
                float4 pos : POSITION;
                UNITY_VERTEX_INPUT_INSTANCE_ID
            };

            // Only the clip-space position is needed for a stencil-only pass.
            struct v2f {
                float4 vertex : SV_POSITION;
                UNITY_VERTEX_OUTPUT_STEREO
            };

            // Projects the instanced volume vertex into clip space.
            v2f vert(a2v v)
            {
                v2f o;
                UNITY_SETUP_INSTANCE_ID(v);
                UNITY_INITIALIZE_VERTEX_OUTPUT_STEREO(o);
                o.vertex = UnityObjectToClipPos(v.pos);
                return o;
            }

            // Colour writes are masked off above, so the returned value is never stored.
            fixed4 frag() : SV_Target { return 0; }
            ENDCG
        }
    }
}

The last pass is to reset the stencil buffer light bit. The regular implementation does it in the lighting pass so that subsequent lights are ok. Since it’s instanced, I separated it. And I actually don’t even do it because my instanced lights are after all unity lights, there’s no lights after!

It’s obviously not as efficient as with separate light passes, because it lights some unnecessary pixels for some lights, but it’s still a hell of a lot faster than the default implementation. At least in our use case.

Thanks for the link explaining the stencil trick!

2 Likes

The idea with the stencil is to confine each light to only the pixels it covers. If you draw the same stencil value for all lights, then you lose a lot of that, as now each light is being applied to pixels that any light covers (within the screen coverage of your light’s mesh). This was the point @Michal_1 was trying to make.

Though, to be fair that still potentially reduces the total pixels shaded considerably.

1 Like

And it does! In our case, we have many small lights that

  • are very close to each other => mostly the same shaded pixels for all
  • and a few scattered that don’t overlap => same pixels as if rendered separately

That won’t work for everyone, obviously.

I’m sharing so that anyone that might end up in a similar situation can benefit.

In our case, we dumped the stencil, and we clip the pixels instead.
GPU Instanciated lights!

Do you mean you’re calling clip() in the shader to skip pixels outside of the range of the light?
I.E.: clip(lightRange - length(lightPos - scenePos));

On modern GPUs this should work relatively well, though potentially slower than the stencil method.

Yes, and it turns hundreds of lights into 1 draw call. Might be slightly longer on GPU, but probably a lot faster on CPU.

The advantage of the stencil method is that it doesn’t even run the pixel shader for pixels that won’t be lit because the sphere of the light doesn’t touch the geometry that is behind. (ie. much faster when the light doesn’t light anything)

@Manufacture43 @LightStriker_1

Just came across this thread, and it seems up the alley of what I’m trying to do, however I’m relatively new to shaders.

I am using deferred rendering as I have lots of moving units in the scene, each with their own point lights. Thing is, I’m also using BlendOp Max in the deferred shader, and therefore the lights don’t stack in intensity much. Often the units are close together, and their lights overlap, but it doesn’t actually change the lighting of the overlapping pixels.

What I’d like to do is make Unity not bother to do another lighting pass on pixels from overlapping point lights. Perhaps I’m wrong in thinking so, but are either of your solutions along the lines of achieving that?

No, not at all. The only way I’m thinking one could achieve this would be to increment the value written by the lights to the stencil buffer and have the stencil test be set to GREATER. But that would eat up a lot of bits in the stencil buffer so it would only allow for very few lights and nothing left for any other use. Doesn’t seem practical.

Or maybe have a second depth buffer just for lights? but that would probably yield weird results.

@Manufacture43 I am having a similar problem while porting my game to Switch. Everything works fine on PC, PS4, XB1 (released 2 years ago), but my instanced point lights are messed up.

They all have flickering black squares where the light is.

I found your thread and took a look at your shader.

My question is how do I use the shader?

I just updated my project from Unity 2017 to Unity 2019 so I could do the Switch port, so I’m not familiar with the new render pipelines, is that how I need to use this?

I’ve been digging through Unity documentation for a while and I can’t find a clear example of “this is how you use a custom point light shader”.

Any help would be appreciated, thanks!

Sorry I just saw your message. I have a custom light component that registers itself into a general list. That list then builds a commandbuffer that just sets properties and DrawMeshInstanced twice (first pass to mark pixels, second pass to light) in the CameraEvent.AfterLighting pass.

using UnityEngine;
using UnityEngine.Rendering;
using System.Collections.Generic;

// Draws every registered InstancedLight as one instanced light volume per pass.
// Attach to a camera: each frame, in OnPreRender, the command buffer hooked at
// CameraEvent.AfterLighting is rebuilt with two instanced draws of UnitSphere
// (shader pass 0 marks the stencil, shader pass 1 does the lighting).
// NOTE(review): DrawMeshInstanced needs a material with GPU instancing enabled -
// confirm the assigned Material has it ticked.
[RequireComponent(typeof(Camera))]
public class InstancedLightRenderer : MonoBehaviour
{
    // Unity documents 1023 as the maximum instance count of one DrawMeshInstanced call.
    const int MaxInstancesPerDraw = 1023;

    // Maximum number of lights submitted per frame.
    // NOTE(review): the colour array is pre-filled to this size once in Awake;
    // raising the value afterwards is presumably not picked up - confirm.
    public int MaxLights = 256;
    // Mesh used as the light volume; scaled by twice the light range per instance.
    public Mesh UnitSphere;
    // Material using the instanced light shader.
    public Material Material;

    // Global registry the light components add themselves to.
    public static List<InstancedLight> lights = new List<InstancedLight>(256);

    CommandBuffer commands;

    MaterialPropertyBlock properties;

    ExpandableArray<Matrix4x4> matrices;
    List<Vector4> colors;

    int colorID;

    // Cached so OnEnable/OnDisable do not look the component up again.
    Camera targetCamera;

    void Awake()
    {
        targetCamera = GetComponent<Camera>();

        commands = new CommandBuffer();
        commands.name = "Instanced Lights";

        matrices = new ExpandableArray<Matrix4x4>(0, MaxLights);
        colors = new List<Vector4>(MaxLights);

        properties = new MaterialPropertyBlock();
        colorID = Shader.PropertyToID("_Color");
        colors.AddMany(Color.black, MaxLights); // pre fill array to maximum instances
        properties.SetVectorArray(colorID, colors);
    }

    void OnEnable()
    {
        if (targetCamera == null)
            targetCamera = GetComponent<Camera>();

        targetCamera.AddCommandBuffer(CameraEvent.AfterLighting, commands);
    }

    void OnDisable()
    {
        if (targetCamera != null)
            targetCamera.RemoveCommandBuffer(CameraEvent.AfterLighting, commands);
    }

    void OnDestroy()
    {
        // The command buffer wraps a native resource; release it with the component.
        if (commands != null)
        {
            commands.Release();
            commands = null;
        }
    }

    void OnPreRender()
    {
        matrices.Clear();
        colors.Clear();

        // Always drop last frame's draws, even when nothing is submitted below.
        commands.Clear();

        // Without a mesh or material there is nothing valid to draw.
        if (UnitSphere == null || Material == null)
            return;

        int limit = Mathf.Min(Mathf.Min(MaxLights, lights.Count), MaxInstancesPerDraw);
        for (int i = 0; i < limit; i++)
        {
            InstancedLight light = lights[i];

            // Skip lights that were destroyed without unregistering themselves.
            if (light == null)
                continue;

            float range = light.Range;

            // A non-positive range has no volume and would make 1 / (range * range) meaningless.
            if (range <= 0f)
                continue;

            float diameter = range * 2f;
            matrices.Add() = Matrix4x4.TRS(light.transform.position, Quaternion.identity, new Vector3(diameter, diameter, diameter));

            // rgb = light colour, w = inverse squared range (read as color.w by the shader).
            colors.Add(new Vector4(light.Color.r, light.Color.g, light.Color.b, 1f / (range * range)));
        }

        // One colour is added per matrix, so this is the number of instances to draw.
        int count = colors.Count;
        if (count == 0)
            return;

        properties.SetVectorArray(colorID, colors);
        commands.DrawMeshInstanced(UnitSphere, 0, Material, 0, matrices.Data, count, properties); // mark pixels that should be lit
        commands.DrawMeshInstanced(UnitSphere, 0, Material, 1, matrices.Data, count, properties); // light
        //commands.DrawMeshInstanced(UnitSphere, 0, Material, 2, matrices.Data, count, properties); // unmark pixels, not necessary since there is no lights drawn after
    }
}
1 Like

If any of you kind graphics programmers would create an asset store out of your current version of instanced lights the community would be eternally grateful :stuck_out_tongue: Code scares me
https://www.reddit.com/r/Unity3D/comments/gkiyjj/10000_cheap_dynamic_point_lights/