Using 'for' in compute shader causes warning X4714: sum of temp registers and indexable temp registers

I’ve got quite a complex compute shader. It is already big, and everything was OK.

Until I used a simple ‘for’.
And I started to receive ‘warning X4714: sum of temp registers and indexable temp registers times 256 threads exceeds the recommended total 16384. Performance may be reduced at kernel’.

Code looks like this:

UNITY_LOOP
for (int vZ = minInc.z; vZ <= maxInc.z; vZ++)
{
    UNITY_LOOP
    for (int vY = minInc.y; vY <= maxInc.y; vY++)
   {
       UNITY_LOOP
       for (int vX = minInc.x; vX <= maxInc.x; vX++)
       {
              //Some functions using 
              //
              //	structure
              //	inout
              //	StructuredBuffers
              //	Texture2D<float4>
              //
              //and stuff like this
       }
    }
}
   

When I remove the loop lines and keep all the functions that were inside the loop, the warning is gone.
But with the loops, the warning appears.

The compiled shader code shows an extremely strange storing of everything at the beginning of each loop,
which I guess causes the warning. But WHY is the compiler doing this? What is the reason to duplicate the same values in new registers?

 517:         loop 
 518:           ige r17.w, r26.z, cb0[66].z
 519:           breakc_nz r17.w
 520:           mov r17.w, r0.x
 521:           mov r27.xy, r16.xyxx
 522:           mov r28.xyz, r17.xyzx
 523:           mov r29.x, cb0[35].x
 524:           mov r29.y, cb0[36].x
 525:           mov r29.z, cb0[37].x
 526:           mov r29.w, cb0[38].x
 527:           mov r30.x, cb0[35].y
 528:           mov r30.y, cb0[36].y
 529:           mov r30.z, cb0[37].y
 530:           mov r30.w, cb0[38].y
 531:           mov r31.x, cb0[35].z
 532:           mov r31.y, cb0[36].z
 533:           mov r31.z, cb0[37].z
 534:           mov r31.w, cb0[38].z
 535:           mov r32.x, r3.y
 536:           mov r32.y, r10.w
 537:           mov r32.z, r11.w
 538:           mov r32.w, cb0[39].w
 539:           mov r33.xyz, cb0[40].xywx
 540:           mov r33.w, cb0[41].y
 541:           mov r34.xyz, r18.xyzx
 542:           mov r35.xyz, r19.xyzx
 543:           mov r36.xyz, r20.xyzx
 544:           mov r37.xyz, r21.xyzx
 545:           mov r23.zw, r16.zzzw
 546:           mov r24.zw, r23.xxxy
 547:           mov r27.zw, r25.xxxy
 548:           mov r18.w, r2.z
 549:           mov r38.xyz, r22.xyzx
 550:           mov r39.xy, r24.xyxx
 551:           mov r19.w, r11.z
 552:           mov r20.w, l(0)
 553:           loop 
 554:             ilt r21.w, r10.z, r19.w
 555:             mov r20.w, l(0)
 556:             breakc_nz r21.w
 557:             mov r40.xyz, r35.xyzx
 558:             mov r40.w, r11.y
 559:             mov r21.w, r17.w
 560:             mov r41.xy, r27.xyxx
 561:             mov r42.xyz, r28.xyzx
 562:             mov r43.xyzw, r29.xyzw
 563:             mov r44.xyzw, r30.xyzw
 564:             mov r45.xyzw, r31.xyzw
 565:             mov r46.xyz, r34.xyzx
 566:             mov r47.xyz, r33.xyzx
 567:             mov r39.zw, r24.zzzw
 568:             mov r48.xyz, r36.xyzx
 569:             mov r22.w, r23.z
 570:             mov r25.w, r23.w
 571:             mov r26.w, r18.w
 572:             mov r49.xyz, r37.xyzx
 573:             mov r28.w, r38.x
 574:             mov r34.w, r38.y
 575:             mov r35.w, r38.z
 576:             mov r36.w, r39.x
 577:             mov r37.w, r39.y
 578:             mov r50.xy, r27.zwzz
 579:             mov r38.w, r32.x
 580:             mov r41.zw, r32.yyyz
 581:             mov r42.w, r32.w
 582:             mov r46.w, r33.w
 583:             mov r47.w, r19.w
 584:             mov r48.w, l(0)
 585:             loop 
 586:               ilt r49.w, r10.y, r40.w
 587:               mov r48.w, l(0)
 588:               breakc_nz r49.w
 589:               mov r51.z, r21.w
 590:               mov r52.xy, r41.xyxx
 591:               mov r53.xyz, r42.zxyz
 592:               mov r54.xyz, r46.xyzx
 593:               mov r55.xyz, r40.xyzx
 594:               mov r58.xyz, r47.xyzx
 595:               mov r59.xyz, r48.xyzx
 596:               mov r51.x, r22.w
 597:               mov r51.y, r25.w
 598:               mov r49.w, r26.w
 599:               mov r50.z, r39.z
 600:               mov r15.z, r39.w
 601:               mov r50.w, r49.x
 602:               mov r60.x, r49.y
 603:               mov r52.z, r49.z
 604:               mov r60.y, r28.w
 605:               mov r60.z, r34.w
 606:               mov r52.w, r35.w
 607:               mov r51.w, r36.w
 608:               mov r53.w, r37.w
 609:               mov r54.w, r50.x
 610:               mov r55.w, r50.y
 611:               mov r61.xyzw, r43.xyzw
 612:               mov r56.xyzw, r44.xyzw
 613:               mov r57.xyzw, r45.xyzw
 614:               mov r58.w, r38.w
 615:               mov r59.w, r41.z
 616:               mov r60.w, r41.w
 617:               mov r62.x, r42.w
 618:               mov r62.y, r46.w
 619:               mov r62.z, r47.w
 620:               mov r62.w, r40.w
 621:               mov r63.x, r11.x
 622:               mov r63.y, l(0)
 623:               loop 
 624:                 ilt r63.z, r10.x, r63.x
 625:                 mov r63.y, l(0)
 626:                 breakc_nz r63.z
 627:                 ige r63.z, r63.x, l(0)

Has anyone encountered such a warning?
Any suggestions on how to stop this strange usage of registers at the beginning of each loop?

It’s hard to say what exactly causes this without seeing the source code.

1 Like

Source code inside is something like 3K lines of code randomly stored in 20 files.
Difficult to collect.

A lot of such entry points:

	void Func(
                        inout float3 a,
                        inout float3 b,
                        inout float3 c,
                        inout Struct1 ps,
                        inout Struct2 ps2,
                        inout VariablesStruc d
                        );

	bool Func2(
                        inout float3 a,
                        float3 f1,
                        Texture2D<float4> t,
                        inout Struct1 ps,
                        inout Struct2 ps2,
                        inout VariablesStruc d
                        );


A lot of stuff. And once again, it all works and compiles fine inside one loop.
In two nested loops, the compilation time jumps from 2 seconds to 12 seconds and the register warning appears.

I’ve encountered such a compilation time increase in surface shaders. They use many of the same CGINCs as this compute shader. Compilation time is about 15 minutes. And this is a known bug of surface shaders. It was marked as ‘By design’ because of the external compiler. And the official solution is “Move as much as possible to compute shaders”, as far as I remember.

And here we are.

Any advice, please?
Are there things like some strange limitation on structure size?
Is there any way to lower the compilation optimization level in Unity? Some #pragma? Some way to tweak the compiler directly from Windows?

Structs are just a way to group data for you. The compiler will work with individual fields anyway.
Do you have anything happening in outer loops as well, or only in the inner loop?

In a surface shader I once started to get compile times of over 40 minutes. I tried a lot of stuff. And the final solution was to split one struct into two and pass both of them everywhere as parameters. The total set of variables in them was exactly the same as in the initial one.

In compute shaders I’ve also met such stuff already. I had a Texture2D< float4 > as a variable in a struct, and everything worked correctly. But inside a loop (a single one) I started to receive the error ‘unable to unroll loop’.
But the whole struct was initialized, and even twice.
Solution - pass such variables to functions as separate parameters.

So structures definitely have some special meaning for the compiler.

> Do you have anything happening in outer loops as well, or only in the inner loop?

A lot of calculations. But nothing special.

My bet is that all of these are compiler optimization bugs.

It would be great to have an option to lower it via some pragma. Maybe one is present already?

You can disable compiler optimizations with #pragma skip_optimizations <api_list>. Or you can try enabling DXC using #pragma use_dxc <api_list>.

This may influence things a lot. Those registers it copies data into may really be needed to temporarily store the data.

This gives ‘Shader warning: ‘skip_optimizations’ : unknown pragma ignored at kernel’. Only for surface shaders I guess.

This one is interesting. It compiles, and fast: 1 second against 12, and the original warning is gone.
But the entire shader stopped working without any warnings :slight_smile:

What could it be? Some ‘struct pack’? Different AppendStructuredBuffer behaviour? Some asynchronous read from the GPU working another way? I’m just guessing options :slight_smile:

Any ideas?

PS: BTW in surface shader ‘use_dxc’ can’t see global Unity variable ‘unity_CameraToWorld’. Wonder why? It is a bit undocumented as far as I remember though. But working with original compiler.

Ah, sorry, it’s only there for graphics shaders, not for compute.
If you force debug symbols (#pragma enable_d3d11_debug_symbols), it should also turn optimizations off.

No warnings even in the Editor.log?

Nothing at all.

I’ve checked a bit what is not working.
Receiving the results returns zeros (via AsyncGPUReadbackRequest).
Actually, it could also be an error in storing values in ComputeBuffers or something related to that. If zeros are stored instead of data, then the result will be 0, as it is coded. I will check later.