Excessive Texture array samples slow on mobile

I’m creating a terrain shader for mobile. It uses texture arrays. In the fragment shader I do this to blend multiple textures.

Shader "Test" {
    Properties
    {
        // Splat-weight (control) maps. Each RGBA channel is the blend weight of one
        // terrain layer: control 0 -> layers 0..3, control 1 -> layers 4..7.
        _ctrlTx0("Control 0 (RGBA)", 2D) = "red" {}
        _ctrlTx1("Control 1 (RGBA)", 2D) = "red" {}

        // Texture arrays; the slice index is the layer index.
        _ctrlTxArr("Ctrl Array", 2DArray) = "black" {}
        _albTxArr("Albedo Array", 2DArray) = "black" {}
        _normTxArr("Normal Array", 2DArray) = "bump" {}

        // Per-layer plain 2D textures (albedo / normal) for the non-array path.
        // Each slot carries its own layer index in the label so the slots can be
        // told apart in the material inspector.
        _sp0("Layer 0", 2D) = "white" {}
        _nm0("Normal 0", 2D) = "bump" {}
        _sp1("Layer 1", 2D) = "white" {}
        _nm1("Normal 1", 2D) = "bump" {}
        _sp2("Layer 2", 2D) = "white" {}
        _nm2("Normal 2", 2D) = "bump" {}
        _sp3("Layer 3", 2D) = "white" {}
        _nm3("Normal 3", 2D) = "bump" {}
        _sp4("Layer 4", 2D) = "white" {}
        _nm4("Normal 4", 2D) = "bump" {}
        _sp5("Layer 5", 2D) = "white" {}
        _nm5("Normal 5", 2D) = "bump" {}
        _sp6("Layer 6", 2D) = "white" {}
        _nm6("Normal 6", 2D) = "bump" {}
        _sp7("Layer 7", 2D) = "white" {}
        _nm7("Normal 7", 2D) = "bump" {}

        // Props: per-layer parameters. The vertex function divides the
        // terrain-space UV by the .x component of each vector.
        _props0("Props 0", Vector) = (1,0,0,0)
        _props1("Props 1", Vector) = (1,0,0,0)
        _props2("Props 2", Vector) = (1,0,0,0)
        _props3("Props 3", Vector) = (1,0,0,0)
        _props4("Props 4", Vector) = (1,0,0,0)
        _props5("Props 5", Vector) = (1,0,0,0)
        _props6("Props 6", Vector) = (1,0,0,0)
        _props7("Props 7", Vector) = (1,0,0,0)

        _terrainSize("Terrain Size", Float) = 10000
        _terrainScale("Terrain Scale", Float) = 1
        _Shininess("Shininess", Range(0.03, 1)) = 0.078125
        _SpecColor("Specular Color", Color) = (0.5, 0.5, 0.5, 1)
    }

    SubShader{
        Tags{
            "Queue" = "Geometry-99"
            "RenderType" = "Opaque"
        }
        LOD 500
        CGPROGRAM
        // Input / appdata structs, uniforms, FastArray() and vert() live in the include.
        #include "TestInclude.cginc"
        // Texture arrays need shader target 3.5 or higher.
        #pragma target 3.5
        #pragma debug
        #pragma surface surf BlinnPhong vertex:vert noforwardadd noinstancing

        // Surface function: blends the terrain layers through FastArray() and
        // copies the result into the BlinnPhong surface output.
        void surf(Input IN, inout SurfaceOutput o)
        {
            float2 uv = IN.uuv_ctrlTx0;
            half3 alb;
            half3 norm;
            half smooth, metal;

            // Only seven tiled UV sets are passed in; FastArray() samples layer 7
            // with the same UV as layer 6.
            FastArray(uv, IN.uuv_sp0, IN.uuv_sp1, IN.uuv_sp2, IN.uuv_sp3, IN.uuv_sp4, IN.uuv_sp5, IN.uuv_sp6, alb, norm, metal, smooth);
            o.Albedo = fixed3(alb);
            o.Normal = fixed3(norm);
            o.Gloss = fixed(smooth);
            o.Specular = _Shininess;
            // `metal` is produced by FastArray() but is not consumed here.
        }
        ENDCG
    }
}

The include file is:

// Data passed from the vertex function (vert) to the surface function (surf).
// Every field is written explicitly in vert().
struct Input
{
    // Tiled UVs for layers 0..6: terrain-space UV divided by _propsN.x.
    float2 uuv_sp0 : TEXCOORD0;
    float2 uuv_sp1 : TEXCOORD1;
    float2 uuv_sp2 : TEXCOORD2;
    float2 uuv_sp3 : TEXCOORD3;
    float2 uuv_sp4 : TEXCOORD4;
    float2 uuv_sp5 : TEXCOORD5;
    // Also used for layer 7 by FastArray(); there is no separate uuv_sp7.
    float2 uuv_sp6 : TEXCOORD6;
    // Unmodified mesh UV, used to sample the control (splat-weight) maps.
    float2 uuv_ctrlTx0: TEXCOORD7;
};

// Per-vertex input consumed (and partly rewritten) by vert().
struct appdata {
    float4 vertex : POSITION;
    // Overwritten in vert() with a tangent derived from the normal.
    float4 tangent : TANGENT;
    float3 normal : NORMAL;
    // Base mesh UV; the only texcoord read by vert().
    float2 texcoord : TEXCOORD0;
    // Not read by the code shown here.
    float4 texcoord1 : TEXCOORD1;
    float4 texcoord2 : TEXCOORD2;
};

// Written to SurfaceOutput.Specular in surf().
half _Shininess;

// Per-layer parameters; only .x is read in the code shown (as the UV divisor in vert()),
// and _props7 is not read at all there.
half4 _props0, _props1, _props2, _props3, _props4, _props5, _props6, _props7;
// Per-layer 2D samplers for layers 0..5; not sampled by FastArray() as shown.
sampler2D _sp0, _sp1, _sp2, _sp3, _sp4, _sp5, _nm0, _nm1, _nm2, _nm3, _nm4, _nm5;
// Control (splat-weight) maps sampled at the start of FastArray().
sampler2D _ctrlTx0, _ctrlTx1;
// _terrainSize scales the centred mesh UV in vert(); _terrainScale is not read in the code shown.
float _terrainSize, _terrainScale;
// Texture arrays. _ctrlTxArr is only referenced by the commented-out lines in FastArray().
UNITY_DECLARE_TEX2DARRAY(_ctrlTxArr);
UNITY_DECLARE_TEX2DARRAY(_albTxArr);
UNITY_DECLARE_TEX2DARRAY(_normTxArr);

// Blends eight terrain layers from the albedo and normal texture arrays,
// weighted by the two RGBA control maps.
//
//   uv        - UV used to sample both control maps
//   uv0..uv6  - tiled UVs for layers 0..6; layer 7 is sampled with uv6 as well
//   alb       - out: control-weighted sum of the albedo RGB of all eight layers
//   norm      - out: xy taken from the control-weighted sum of the normal-map
//               samples remapped from [0,1] to [-1,1]; z rebuilt from xy
//   metal     - out: constant 0.2
//   gloss     - out: constant 0.2
//
// All swizzles are explicit (.rgb / .xyz) so the 4-component samples are not
// implicitly truncated when added to the 3-component accumulators.
void FastArray(float2 uv, float2 uv0, float2 uv1, float2 uv2, float2 uv3, float2 uv4, float2 uv5, float2 uv6, out half3 alb, out half3 norm, out half metal, out half gloss) {
    // ctrl0.xyzw weight layers 0..3, ctrl1.xyzw weight layers 4..7.
    half4 ctrl0 = tex2D(_ctrlTx0, uv);
    half4 ctrl1 = tex2D(_ctrlTx1, uv);

    // Alternative: read the control maps from a texture array instead.
    //half4 ctrl0 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 0));
    //half4 ctrl1 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 1));

    half3 sumNorm = 0;
    alb = 0;

    // Layer 0
    sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv0, 0)).xyz;
    alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv0, 0)).rgb;

    // Layer 1
    sumNorm += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv1, 1)).xyz;
    alb += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv1, 1)).rgb;

    // Layer 2
    sumNorm += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv2, 2)).xyz;
    alb += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv2, 2)).rgb;

    // Layer 3
    sumNorm += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv3, 3)).xyz;
    alb += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv3, 3)).rgb;

    // Layer 4
    sumNorm += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv4, 4)).xyz;
    alb += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv4, 4)).rgb;

    // Layer 5
    sumNorm += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv5, 5)).xyz;
    alb += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv5, 5)).rgb;

    // Layer 6
    sumNorm += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv6, 6)).xyz;
    alb += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv6, 6)).rgb;

    // Layer 7: no dedicated UV is passed in, so it shares uv6 with layer 6.
    sumNorm += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv6, 7)).xyz;
    alb += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv6, 7)).rgb;

    gloss = 0.2;
    // Remap the blended sample from [0,1] to [-1,1]; this matches a per-layer
    // remap only when the control weights sum to 1.
    norm = sumNorm * 2 - 1;
    // Rebuild z from xy so the blended normal has unit length (clamped so the
    // sqrt argument never goes negative).
    norm.z = sqrt(1 - saturate(dot(norm.xy, norm.xy)));
    metal = 0.2;
}

// Vertex modifier for the surface shader.
// Fills every field of Input (control-map UV plus seven tiled layer UVs) and
// replaces the incoming tangent with one derived from the vertex normal.
void vert(inout appdata v, out Input data)
{
    UNITY_INITIALIZE_OUTPUT(Input, data);

    // The control maps are addressed with the untouched mesh UV.
    float2 meshUv = v.texcoord;
    data.uuv_ctrlTx0 = meshUv;

    // Centre the UV around the origin and scale it by the terrain size.
    float2 terrainUv = (meshUv - 0.5) * _terrainSize;

    // Per-layer tiling: divide by each layer's _propsN.x.
    data.uuv_sp0 = terrainUv / _props0.x;
    data.uuv_sp1 = terrainUv / _props1.x;
    data.uuv_sp2 = terrainUv / _props2.x;
    data.uuv_sp3 = terrainUv / _props3.x;
    data.uuv_sp4 = terrainUv / _props4.x;
    data.uuv_sp5 = terrainUv / _props5.x;
    data.uuv_sp6 = terrainUv / _props6.x;

    // Tangent perpendicular to both the normal and the +Z axis, with w = -1.
    v.tangent = float4(cross(v.normal, float3(0, 0, 1)), -1);
}

Each texture has albedo and normal, thus with 4 textures we sample 8 times. The strange thing is that performance drops rapidly if more than four texture pairs are sampled.
8 samples =>24 fps
12 samples =>14 fps
16 samples =>7 fps

Why does performance drop this much after 4 texture pairs (8 samples) from texture arrays?

Ordinary textures do not have this problem, and 12 samples will give 20 fps.

I suspect you’re running into the issue of sampler stalling.

When you have 12 separate textures you generally have 12 unique samplers so you can sample all 12 textures without incurring a huge penalty (assuming memory bandwidth isn’t completely saturated) as all 12 textures will be sampled in parallel. Basically sampling 12 textures at once takes only as long as the slowest texture sample.

Sampling a texture array multiple times is going to use the same physical texture sampler unit in the hardware for all of the samples meaning they happen in serial.

TLDR; Sampling 12 textures in a texture array with a single sampler can take 12 times longer than 12 individual textures.

5 Likes

OMG!! That explains a lot. It took me two straight days wondering. Yes I can tell memory bandwidth is not the main issue here because with ordinary textures it gets at least two times faster. I’ll revise the shader and see if increasing texture array samplers will solve the issue.

I tested the shader with 8 texture arrays each having only two textures. It did not get any better. FPS is 6 as before. Here is the sampling part:

    half4 ctrl0 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 0));

    half4 sumNorm = 0;
    alb = 0;

    alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv0);
    alb += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv1);
    alb += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr1, uv2);
    alb += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr1, uv3);


    sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv0);
    sumNorm += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv1);
    sumNorm += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr1, uv2);
    sumNorm += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr1, uv3);

    half4 ctrl1 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 1));

    alb += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr2, uv4);
    alb += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr2, uv5);
    alb += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr3, uv6);
    alb += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr3, float3(uv6.xy, 1));

    sumNorm += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr2, uv4);
    sumNorm += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr2, uv5);
    sumNorm += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr3, uv6);
    sumNorm += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr3, float3(uv6.xy, 1));

I tried mixing alb and sumNorm like this:

    alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv0);
    sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv0);

Still the same. I also rearranged the samples so that sampling two slices of the same texture array would be as far from each other as possible. No gain there either.

You could try using UNITY_DECLARE_TEX2DARRAY_NOSAMPLER and defining inline samplers to use to enforce unique samplers per sample.

Or it could be texture arrays are just super slow on the hardware you’re using. :confused:

1 Like

I don’t get it. When we declare with UNITY_DECLARE_TEX2DARRAY, doesn’t it already declare a sampler2DArray on gles3?

Maybe this is the case. I will test on other devices too and report on what happens.

Yes. The idea was to declare multiple samplers (like 12) and use those to see if it’s still slower using the texture arrays. Basically confirm it’s the texture arrays themselves that are the issue as that would get the shader as functionally close to the non-array version as possible. If it’s still slow then you know. Though it’s odd as everything I know about texture arrays wouldn’t make me think there should be any difference. I haven’t been working on mobile for a while though so it’s not something I’ve kept up with as much. @JasonBooth might have a better idea if you can convince him to weigh in.

1 Like

I tested the shader on another device and it ran without any problem. 16 samples on texture array resulted in 16 FPS. It seems HTC One’s GPU has problems with texture arrays.

Thank you so much for the help.

A lot of mobile GPUs, especially on android, will skimp on various features in the spec, either at the driver or hardware level. While texture arrays have been around for a long time, they aren’t used a whole lot, and I could imagine some particular vendor not paying attention to it.

1 Like

I’m not familiar with texture arrays, but the example uses I find when I Google them seem to suggest they may exist or be used to optimise a different use case - sampling from a subset of the textures bound. If that’s at all the case (and there is every chance I am far off the mark), why would one expect them to be as fast as or faster than the regular texture sampling approach where the driver might reasonably expect all textures to be sampled from?

Thanks for the clear answer. That makes sense. Unfortunately in my case to support terrains with 8 albedo and 8 normal textures I have to use texture arrays.

Why? They aren’t supported on ES2 and they don’t increase the number of textures you can sample from in ES3.x, do they?

ES3.0 guarantees GL_MAX_TEXTURE_IMAGE_UNITS to be at least 16, as per https://www.khronos.org/registry/OpenGL-Refpages/es3.0/html/glGet.xhtml

Try to include “Cull Back” under LOD 500 and see if that improves things.

On a Unity terrain, for 8 textures w/ normal you need 16 textures for the terrain types, 2 control textures, and then whatever samples the lighting pathway you’re using needs as well (lightmaps, etc). So it’s atlasing or texture arrays.

On more modern APIs, you can share samplers as well…

You might want to give MicroSplat’s core module a spin then and see if you have similar results. It’s free, and more optimal than the code you posted above, so it might perform better than your homegrown solution. It’s also extensible via its module system, so if you don’t like the modules I have available you could extend it yourself the same way I do.

1 Like

Well, 16 is not enough. The most you can get with ordinary textures are 5 or 6 pairs of texture with normal. See slipster’s answer.

Tested and doesn’t make any difference.

Exactly. The atlas bleeding for repeated textures is dreadful, and the shader should work on mobiles as well, so texture arrays are the only option.

Yes, a lot of people say MicroSplat is the best and judging by the videos it seems it really is. If I wasn’t developing my own terrain engine as an asset store product I would definitely go for MicroSplat. My solution is an integrated and complex system and it manages its own database of textures in a precompressed DXT format. I think it would be hard and would need work on both sides to integrate it with MicroSplat. Maybe if the product becomes successful we could do it in later versions.

3270778--252684--photo_2017-10-30_16-19-11.jpg

I actually have a shippable version of MicroSplat for including with other products on the Unity Asset Store. Depending on what you need to do, you might be able to write a module with your custom features and ship with MicroSplat in your product. Users are then prompted to upgrade to the free core module if they want to change the shader/material settings, and once they do the demo version is disabled and replaced by the full system. They can then purchase additional features. Ideally, it’s a win/win for everyone - other developers get a better looking demo, and we both get cross promotion between our products.

2 Likes

That’s good and I agree it’s a win for everybody. I’ll get in touch when the project is ready.

@bgolus Hi, I’m working with texture arrays and wondering about this. Does defining more samplers in the shader properties alleviate the stalling? As in if I had _TexArray1, TexArray2 inputs defined, but both have the same texture array as the input. Or does the number of shader texture inputs not map to the amount of physical samplers used? Can one texture/texture array only have one sampler no matter what, due to GPU hardware or something? Thanks!

Defining multiple sampler properties, or just multiple inline sampler states, will generally mean more physical samplers get used, even when reusing the same texture asset.

One thing I’ve learned since my previous post is some hardware doesn’t actually have that many physical sampling units dedicated to each shader execution thread. The actual number is sometimes difficult to determine. It seems like it’s relatively safe to assume 2 or 4 concurrent texture samples will happen roughly in parallel, but more than that might be reusing the physical hardware and become serial again.

1 Like

Thank you! This info helps a ton. I’m using 2x 30-count TextureArrays and sampling from each one twice, with each sample using its own defined sampler property, and I wasn’t sure if this would actually mean that they are sampling from the same texture array input in parallel.

If I have multiple materials/shaders that all use the same TextureArray inputs, is there some sort of performance gained when switching between those materials? I was reading this Draw Call Cost Analysis and they graph out the performance cost of reusing/changing textures, but I’m not sure if I am interpreting their data correctly.