Why is unrolled loop faster for managed array than for unsafe NativeArray?

Code is below. Results:

ManagedWriteByte avg time elapsed: 8,21948E-07
ManagedReadByte avg time elapsed: 7,098913E-07
ManagedWriteUlong avg time elapsed: 1,907349E-08
ManagedReadUlong avg time elapsed: 1,811981E-08
UnsafeWriteByte avg time elapsed: 9,460449E-07
UnsafeReadByte avg time elapsed: 9,717942E-07
UnsafeWriteUlong avg time elapsed: 1,016855E-07
UnsafeReadUlong avg time elapsed: 1,052618E-07

I'm unsure why the Managed*Ulong tests get the best times. If reading/writing a byte takes about the same time for managed vs. unmanaged/unsafe code, why do we see a substantial difference for ulong?

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;

//Showcases the difference between Managed vs. Unsafe NativeArray short (byte) and long (ulong) accesses.
//Set all bits in writes, then retrieve these into local members so that they have an effect / aren't stripped by compiler.
public class RWPerfDemo : MonoBehaviour
{
    byte[] bytesManage = null;
    ulong[] ulongsManage = null;
    NativeArray<byte> bytesUnsafe;
    NativeArray<ulong> ulongsUnsafe;
 
    const int iterationsPerFunc = 1000;
    const int iterationsLoopByte = 256; //should be max unrollable, basically.
    const int iterationsLoopUlong = iterationsLoopByte / 8; //should be max unrollable, basically.
  
    byte b = 0;
    ulong u = 0;
 
    void Start()
    {
        int totalBytes = iterationsLoopByte;
        bytesManage =  new byte [totalBytes];
        ulongsManage = new ulong[totalBytes / 8];
        ulongsUnsafe = new NativeArray<ulong>(totalBytes / 8, Allocator.Persistent);
  
        ManagedWriteByte ("ManagedWriteByte");
        ManagedReadByte  ("ManagedReadByte");
        ManagedWriteUlong("ManagedWriteUlong");
        ManagedReadUlong ("ManagedReadUlong");
  
        UnsafeWriteByte  ("UnsafeWriteByte");
        UnsafeReadByte   ("UnsafeReadByte");
        UnsafeWriteUlong ("UnsafeWriteUlong");
        UnsafeReadUlong  ("UnsafeReadUlong");
  
        ulongsUnsafe.Dispose();
    }
 
    //MANAGED...
 
    void ManagedWriteByte(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                bytesManage[l] = byte.MaxValue;
            }
        }      
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void ManagedReadByte(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                b = bytesManage[l];
            }
        }      
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void ManagedWriteUlong(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong / 8; l++)
            {
                ulongsManage[l] = ulong.MaxValue;
            }
        }      
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void ManagedReadUlong(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong / 8; l++)
            {
                u = ulongsManage[l];
            }
        }      
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
    //...MANAGED

    //UNSAFE...
    void UnsafeWriteByte(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        unsafe {
        ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
        byte* bytes = (byte*) ulongs;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                bytes[l] = byte.MaxValue;
            }
        }
        }  
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void UnsafeReadByte(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        unsafe {
        ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
        byte* bytes = (byte*) ulongs;
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                b = bytes[l];
            }
        }
        }  
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void UnsafeWriteUlong(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        unsafe {
        ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong; l++)
            {
                ulongs[l] = ulong.MaxValue;
            }
        }
        }
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    void UnsafeReadUlong(string name)
    {
        float t0 = Time.realtimeSinceStartup;
        unsafe {
        ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong; l++)
            {
                u = ulongs[l];
            }
        }
        }
        float t1 = Time.realtimeSinceStartup;
        Debug.Log(name+" avg time elapsed: "+(t1 - t0) / (float) iterationsPerFunc);
    }
 
    //...UNSAFE
}

Some guesses:

  • CPU cache effects
  • Different-sized instructions being used that have varying efficiencies.
  • Magic

If you want to know, profile it at the assembly language level.

Remember it will likely be different on a different computer with different hardware.

I tried using ILSpy but had no luck. How do you handle profiling at this level? VS? VS Code?

First of all, when you do time-critical measurements, use the Stopwatch class; Time.realtimeSinceStartup is not very precise. Second, the workload inside your test is quite small. You only have 1000 * 256 (so 256k) iterations. The CPU pipeline needs setup time before the cache can really kick in. Furthermore, on certain multi-processor systems, time measurements can be skewed when a task switch happens during the execution of one of your tests. So you should generally run each test a couple of times, capture the times for every run, and report the average, min and max results for each test. I would also recommend choosing a better unit prefix: measuring times that are on the nanosecond scale and reporting them as seconds is not great for comparison.
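As a minimal sketch of that advice (the BenchRunner/Measure helper below is just an illustration made up for this post, not code from your project), you could wrap each test body in a delegate, run it several times with a Stopwatch, and report min/max/average ticks:

using System;
using System.Diagnostics;
using UnityEngine;

public static class BenchRunner
{
    // Runs the given workload 'runs' times and logs min / max / average ticks.
    public static void Measure(string name, int runs, Action body)
    {
        long min = long.MaxValue, max = long.MinValue, total = 0;
        var sw = new Stopwatch();
        for (int r = 0; r < runs; r++)
        {
            sw.Restart();   // reset and start the timer for this run
            body();         // the workload under test
            sw.Stop();
            long t = sw.ElapsedTicks;
            if (t < min) min = t;
            if (t > max) max = t;
            total += t;
        }
        Debug.Log(name + ": avg " + (total / (double)runs) + " ticks, min " + min + ", max " + max);
    }
}

// Usage: BenchRunner.Measure("ManagedWriteUlong", 10, () => { /* test body here */ });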

Even though it seems weird at first glance that ulong accesses appear to be faster than byte accesses, we use 64-bit systems now, and 64 bits is the native register size, which (depending on the hardware) can result in slightly better performance. Also, you're only doing 1/8th of the loop count compared to the byte case.

However, it's completely pointless to try to measure the time of a single opcode. Modern CPUs get a dramatic speed-up after a few cycles, once the pipeline is primed and the cache is filled. Our systems are already too complex at the opcode (and microcode) level to properly reason about performance behaviour, and it can generally change from machine to machine. There are some general things that "usually" help with performance on most hardware, but micro-optimisation is an art in itself and most of the time not worth it. If you actually want to do such experiments, you should have at least a handful of different machines to test the same code on.

The Stopwatch (if the CPU has a high-precision timer available, i.e. the RDTSC instruction) can usually measure actual processor ticks. I'm not sure how well Stopwatch works on some mobile CPUs, which dynamically scale their clock speed with the load to save power; I know there have been issues in the past.
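If you want to check what Stopwatch is actually backed by on a given device, the class exposes that directly (standard .NET API, nothing Unity-specific assumed):

using System.Diagnostics;
using UnityEngine;

public class TimerInfo : MonoBehaviour
{
    void Start()
    {
        // True when a high-resolution performance counter backs Stopwatch;
        // false means it falls back to DateTime ticks.
        Debug.Log("High resolution: " + Stopwatch.IsHighResolution);
        // Ticks per second of the underlying counter.
        Debug.Log("Frequency: " + Stopwatch.Frequency + " ticks/s");
    }
}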

Hmm. Basic inline profiling concepts are not the issue, or I could not have drafted this sample.

The outer loop iteration count was formerly set to 100k, but was reduced (for faster start-up) while making changes. The 1k version was run many times, with minimal variance in the stats. A consistent difference on the order of 8-10× in performance is not simply "spitting into the wind", so to speak.

Non-sequitur; it is a 64-bit system regardless of whether we are pulling from the stack (Managed*) or the heap (Unsafe* using NativeArray<ulong>), so this part of your answer adds no clarity for the given question.

It was indeed my intent to do 1/8th of the work on a 64-bit processor, or about 1/4 on a 32-bit system: that is exactly why the example was set up this way. *Ulong is expected to be faster than *Byte, but Managed*Ulong is not expected to be faster than Unsafe*Ulong, which should show roughly the same speed, if we ignore any (?) stack optimisations. If such optimisations exist, I’d be glad to hear of them.

Not pointless at all, since the substantial discrepancy queried in the question shows up consistently across runs, and at 100k or more iterations. Your points on varied systems don't stand up to scrutiny, since a fairly consistent discrepancy is seen regardless. The simple fact is that the managed / stack version is much faster on a typical dev system and a typical older mobile.

No matter how we slice it, given that each function’s internal iteration is factored similarly (conditionals in larger outer loop, unrolled inners), and given that an average is calculated, nothing here changes the fundamentally large discrepancy queried in the original question, i.e. that between Managed*Ulong and Unsafe*Ulong.

What would be useful here are some genuine insights into operations on managed arrays of ulongs vs unmanaged buffers of ulongs, OR some points on how to generate IL disassembly without using Burst… which is one option I have to hand, but which muddies the waters a little. I’d rather understand this in plain context.

It would be interesting to look at the produced IL and the actual assembly for each function. I'm not sure whether the compiler is smart enough to realize all your iterations write to the same variable and just skip the loop altogether, or whether some other unexpected optimizations are at play. What happens if you change the read tests to add the values instead of overwriting?
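To make that concrete, here is a small self-contained sketch (my own example, not the thread's code) of what an accumulating read test looks like; the running sum gives every read an observable effect the compiler can't legally discard:

using UnityEngine;

public class ReadAccumulateDemo : MonoBehaviour
{
    ulong[] data = new ulong[32];
    ulong checksum;

    void Start()
    {
        for (int i = 0; i < 1000; i++)
        {
            for (int l = 0; l < data.Length; l++)
            {
                checksum += data[l];   // accumulate; was effectively: checksum = data[l];
            }
        }
        Debug.Log("checksum: " + checksum); // logging the result also keeps it live
    }
}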


Thanks @Neto_Kokku. This is what I was thinking, but how do I review IL without using Burst?

Is this available in VS Code? Or some other tool? - I tried ILSpy and it didn’t work on the generated .exe.

What generated exe? Do you mean from a built player? If so, your code isn’t there, that’s the Unity player exe. Your code is in the DLLs in the data folder.

@Neto_Kokku Thank you, that helps! Do you use VS or VS Code or some other tool to view disassemblies? I could not get ILSpy to work. I have no idea how to do this.

Well, ILSpy works, but you have to use it on the generated assembly, not the exe. The exe of a Unity application is just the Unity player, i.e. the native part of the engine. Nowadays it's really just a wrapper, since the engine itself has been moved into a separate DLL as well. Your own scripts are compiled into a managed assembly inside the "Managed" subfolder of your build. When testing in the editor, the compiled assemblies are located inside ProjectFolder/Library/ScriptAssemblies. Your usual scripts are compiled into "Assembly-CSharp.dll".

I've just run your script verbatim 3 times on my machine and I get extremely varying results. Increasing the iteration count stabilizes the results and, as you said, the managed ulong "appears" to be faster. However, I just noticed you have an error in your test setup. You use the two variables iterationsLoopByte and iterationsLoopUlong in your tests, where iterationsLoopUlong is set to 1/8th of iterationsLoopByte. However, only in your managed tests do you additionally use iterationsLoopUlong / 8 in your for loop, while in the unsafe tests you just use iterationsLoopUlong, as you should. So you're decreasing the iteration count in the managed ulong tests by another factor of 8.
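Side by side, this is the difference (taken from the code above): the managed ulong tests loop with

for (int l = 0; l < iterationsLoopUlong / 8; l++)   // 256 / 8 / 8 = 4 iterations

while the unsafe ulong tests loop with

for (int l = 0; l < iterationsLoopUlong; l++)       // 256 / 8 = 32 iterations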

I replaced Time.realtimeSinceStartup with a Stopwatch, centralized the reporting, and output additional information for each test. This is the result:

Results

ManagedWriteByte ticks:  18275554   tick/i: 1,8275554   ms: 1827,5554   ms/i: 0,00018275554
ManagedReadByte ticks:   14406332   tick/i: 1,4406332   ms: 1440,6332   ms/i: 0,00014406332
ManagedWriteUlong ticks:  1902167   tick/i: 0,1902167   ms: 190,2167    ms/i: 1,902167E-05
ManagedReadUlong ticks:   1719909   tick/i: 0,1719909   ms: 171,9909    ms/i: 1,719909E-05
UnsafeWriteByte ticks:   11732512   tick/i: 1,1732512   ms: 1173,2512   ms/i: 0,00011732512
UnsafeReadByte ticks:    13691006   tick/i: 1,3691006   ms: 1369,1006   ms/i: 0,00013691006
UnsafeWriteUlong ticks:   1732301   tick/i: 0,1732301   ms: 173,2301    ms/i: 1,732301E-05
UnsafeReadUlong ticks:    1761423   tick/i: 0,1761423   ms: 176,1423    ms/i: 1,761423E-05
// note I reformatted it slightly so it aligns better.

The code now looks like this with the fix in place:

using UnityEngine;
using Unity.Collections;
using Unity.Collections.LowLevel.Unsafe;
using System.Text;

//Showcases the difference between Managed vs. Unsafe NativeArray short (byte) and long (ulong) accesses.
//Set all bits in writes, then retrieve these into local members so that they have an effect / aren't stripped by compiler.
public class RWPerfDemo : MonoBehaviour
{
    byte[] bytesManage = null;
    ulong[] ulongsManage = null;
    NativeArray<byte> bytesUnsafe;
    NativeArray<ulong> ulongsUnsafe;

    const int iterationsPerFunc = 10000000;
    const int iterationsLoopByte = 256; //should be max unrollable, basically.
    const int iterationsLoopUlong = iterationsLoopByte / 8; //should be max unrollable, basically.

    byte b = 0;
    ulong u = 0;
    StringBuilder log = new StringBuilder();
    void Report(string name, System.Diagnostics.Stopwatch sw)
    {
        double f = 1d / iterationsPerFunc;
        double ticks = sw.Elapsed.Ticks;
        double ms = sw.Elapsed.TotalMilliseconds;
        log.Append(name)
            .Append(" ticks: ").Append(ticks).Append("\t")
            .Append(" tick/i: ").Append(ticks * f).Append("\t")
            .Append(" ms: ").Append(ms).Append("\t")
            .Append(" ms/i: ").Append(ms * f)
            .AppendLine();
    }

    void Start()
    {
        int totalBytes = iterationsLoopByte;
        bytesManage = new byte[totalBytes];
        ulongsManage = new ulong[totalBytes / 8];
        ulongsUnsafe = new NativeArray<ulong>(totalBytes / 8, Allocator.Persistent);

        ManagedWriteByte("ManagedWriteByte");
        ManagedReadByte("ManagedReadByte");
        ManagedWriteUlong("ManagedWriteUlong");
        ManagedReadUlong("ManagedReadUlong");

        UnsafeWriteByte("UnsafeWriteByte");
        UnsafeReadByte("UnsafeReadByte");
        UnsafeWriteUlong("UnsafeWriteUlong");
        UnsafeReadUlong("UnsafeReadUlong");
        Debug.Log(log.ToString());
        ulongsUnsafe.Dispose();
    }

    //MANAGED...

    void ManagedWriteByte(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                bytesManage[l] = byte.MaxValue;
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void ManagedReadByte(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopByte; l++)
            {
                b = bytesManage[l];
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void ManagedWriteUlong(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong; l++)
            {
                ulongsManage[l] = ulong.MaxValue;
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void ManagedReadUlong(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        for (int i = 0; i < iterationsPerFunc; i++)
        {
            for (int l = 0; l < iterationsLoopUlong; l++)
            {
                u = ulongsManage[l];
            }
        }
        sw.Stop();
        Report(name, sw);
    }
    //...MANAGED

    //UNSAFE...
    void UnsafeWriteByte(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        unsafe
        {
            ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
            byte* bytes = (byte*)ulongs;
            for (int i = 0; i < iterationsPerFunc; i++)
            {
                for (int l = 0; l < iterationsLoopByte; l++)
                {
                    bytes[l] = byte.MaxValue;
                }
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void UnsafeReadByte(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        unsafe
        {
            ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
            byte* bytes = (byte*)ulongs;
            for (int i = 0; i < iterationsPerFunc; i++)
            {
                for (int l = 0; l < iterationsLoopByte; l++)
                {
                    b = bytes[l];
                }
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void UnsafeWriteUlong(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        unsafe
        {
            ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
            for (int i = 0; i < iterationsPerFunc; i++)
            {
                for (int l = 0; l < iterationsLoopUlong; l++)
                {
                    ulongs[l] = ulong.MaxValue;
                }
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    void UnsafeReadUlong(string name)
    {
        var sw = new System.Diagnostics.Stopwatch();
        sw.Start();
        unsafe
        {
            ulong* ulongs = (ulong*)NativeArrayUnsafeUtility.GetUnsafeBufferPointerWithoutChecks<ulong>(ulongsUnsafe);
            for (int i = 0; i < iterationsPerFunc; i++)
            {
                for (int l = 0; l < iterationsLoopUlong; l++)
                {
                    u = ulongs[l];
                }
            }
        }
        sw.Stop();
        Report(name, sw);
    }

    //...UNSAFE
}

Nice catch there @Bunny83. I thought there might be something obvious I'd missed.

This also sets me on the path of looking into IL, which is something I've needed to do for a while.