Let's Discuss Sorting with Jobs

I thought it would be nice to discuss sorting in the context of the ECS and (more importantly) the Job System. Most of my research into parallel sorting turned up a common desire to ensure the list of items to be sorted is larger than a specific threshold, so as to avoid unnecessarily wasting thread resources. However, the Unity job system's ability to break data processing up into many tiny chunks seems like it would eliminate this concern, so most existing solutions can be optimized for use with Unity. Even if that were not true, I think it would be beneficial to discuss parallel sorting directly in the context of the job system, since the mechanism for writing parallel code with it is a bit different from writing parallel code outside of Unity.

So, with that in mind, I have implemented a sorting system via the IJobParallelForBatch interface. It is a merge sort, which from my understanding is well suited to parallelism. Please take a look and let me know what you think!

Code

using Unity.Entities;
using Unity.Jobs;
using Unity.Collections;
using UnityEngine;
using Unity.Burst;

public class SorterSystem : JobComponentSystem
{
    int runs = 100, itemsToSort = 10000, x = 0;
    int y = 0;//You can comment this out/delete if you don't need to verify that the sort worked correctly
    float sum;

    //The first sort that is run. It just sorts items into blocks of two. For instance, the 5-item array
    //2,5,7,3,1
    //would sort into blocks of 2,5 | 3,7 | 1
    [BurstCompile]
    struct TwoItemSort : IJobParallelForBatch
    {
        public NativeArray<int> sort;

        public void Execute(int start, int length)
        {
            if (length < 2)
                return;

            if(sort[start] > sort[start + 1])
            {
                int firstValue = sort[start];
                sort[start] = sort[start + 1];
                sort[start + 1] = firstValue;
            }
        }
    }

    //After TwoItemSort has sorted the items in the array into blocks of two, run this job to merge the two-item
    //blocks into 4-item blocks (batch size = 4, sort threshold = 2), then run again to merge the 4-item blocks
    //into 8-item blocks (batch size = 8, sort threshold = 4). Keep repeating this cycle until all items have been
    //sorted.
    [BurstCompile]
    struct MergeSort : IJobParallelForBatch
    {
        [ReadOnly] public NativeArray<int> input;


        [WriteOnly] public NativeArray<int> output;
        public int sortThreshold;

        public void Execute(int start, int length)
        {
            //Merge sorts are performed on a sorted left half and right half of a batch (for instance,
            //a batch of 8 has a left half 4 items long and a right half 4 items long).
            //However, each job can have one batch that is not the full batch length (for instance, if you were
            //sorting an integer array of length 5 and the partition size for the first merge job was 4, you'd have
            //two batches: one of length 4 and one of length 1).

            //Because of this, it's possible to have a batch that has a left side (maybe not a full left half though) and
            //no right half. In these instances, merging the batch would be redundant; the left half is already sorted.
            //This check makes sure we only perform the merge on batches with both a left and a right half.
            if (length <= sortThreshold)
            {
                //indicates the items are already sorted
                int max = start + length;
                for (int m = start; m < max; m++)
                    output[m] = input[m];
                return;
            }
         
            //We have two halves, and it's guaranteed that the left half and the right half
            //are each already sorted. Because of how IJobParallelForBatch works, we also know that the
            //left half of each batch will have at least as many items as the right half.
            //We can use that fact to our advantage.

            int end = start + length;
            int rightHalfStart = start + sortThreshold;
            int i = start, j = rightHalfStart;
            for (int k = start; k < end; k++)
            {
                //is the current item from the right half less than the current item of the left half?
                //we put the right half item on the left side of this comparison so that when the two
                //items are equal, the left half item wins and is slotted first, thus preserving the
                //order of the items. This isn't really necessary when dealing with integers, but may be
                //important when sorting other things.
                if (input[j] < input[i])
                {
                    output[k] = input[j];
                    j++;
                }
                else
                {
                    output[k] = input[i];
                    i++;
                }

                //Since the right half is possibly smaller than the left half, this case should be
                //more likely, so check for it first.
                //If this statement is true, all items from the right half have
                //been placed and only left half items remain. Since the remaining left half items
                //are already sorted, we can just copy the remaining items.
                if (j == end)
                {
                    //k is still pointing to the last sorted item, so start by incrementing it.
                    for (k += 1; k < end; k++, i++)
                        output[k] = input[i];

                    return;
                }
                //If this statement is true, it means all items from the left half have been sorted and only
                //right half items remain. Since the right half is already sorted, we can just copy the remaining
                //right half items (indexed by j) into the output.
                else if (i == rightHalfStart)
                {
                    //k is still pointing to the last sorted item, so start by incrementing it.
                    for (k += 1; k < end; k++, j++)
                        output[k] = input[j];

                    return;
                }
            }
        }
    }
     
    protected override JobHandle OnUpdate(JobHandle inputDeps)
    {
        if (x == runs)
        {
            Debug.Log("Average sort time = " + (sum / runs * 1000f) + "ms");
            x = 0;
            sum = 0f;
        }

        x++;
             
        NativeArray<int> a = new NativeArray<int>(itemsToSort, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        NativeArray<int> b = new NativeArray<int>(itemsToSort, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);

        for (int i = 0; i < a.Length; i++)
            a[i] = Random.Range(0, a.Length - 1);
     
        //Used to measure how long the sort took. This might not be the best way but oh well.
        float startTime = Time.realtimeSinceStartup;
     
        bool aIsInput = false;
        if (a.Length >= 2)
        {
            inputDeps = new TwoItemSort() { sort = a }.ScheduleBatch(a.Length, 2, inputDeps);

            int partitionSize = 2;

            //Keep going while the previous sort job's partition size is less than the
            //length of the data to be sorted.
            while (partitionSize < itemsToSort)
            {
                //switch back and forth between a and b as input (whichever one is input, the other will be output).
                //This is done because the sort job takes in an input array, and the sorted values are placed in
                //the output (because the input array cannot be overwritten for the sort to work correctly).
                //The next job then merges two sorted partitions from the previous step, which are in the output. So
                //this next job needs to have the output from the previous step as its input. We could copy the output to the
                //input after each job, but that is a waste when we can just switch the arrays back and forth between being used
                //as input and output.

                //We do the reversal before the job is run so after all jobs are run it is clearer which array (a or b) has the
                //fully sorted values. If aIsInput is true, b will have the values, otherwise a will have them. This is also why
                //we set aIsInput to false initially, since for the first run we want it to be true.
                aIsInput = !aIsInput;

                //Double the partition size for the next sort job
                partitionSize *= 2;
                inputDeps = new MergeSort()
                {
                    input = aIsInput ? a : b,
                    output = aIsInput ? b : a,
                    sortThreshold = partitionSize / 2
                }.ScheduleBatch(a.Length, partitionSize, inputDeps);
            }
        }
     
        inputDeps.Complete();

        float timeSortTook = Time.realtimeSinceStartup - startTime;
        sum += timeSortTook;

        if (aIsInput)
        {
            a.Dispose();
            //uncomment to verify that the sort worked correctly
            //if (y++ == 0)
            //{
            //    for (int i = 0; i < b.Length; i++)
            //        Debug.Log(b[i]);
            //}
            b.Dispose();
        }
        else
        {
            b.Dispose();
            //uncomment to verify that the sort worked correctly
            //if (y++ == 0)
            //{
            //    for (int i = 0; i < a.Length; i++)
            //        Debug.Log(a[i]);
            //}
            a.Dispose();

        }    
     
        return inputDeps;
    }
}

Here I am sorting 10000 integers, randomly assigned. The sort time is measured using Time.realtimeSinceStartup, which might not be the best way, but it was easy to implement so I went with it. This approach yields an average sort time of 0.5ms. I feel this is probably pretty good, but I really have nothing to compare it to (a non-parallel sort, for instance).
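
A higher-resolution alternative (not what the code above uses) would be System.Diagnostics.Stopwatch:

var sw = System.Diagnostics.Stopwatch.StartNew();
//...schedule the sort jobs and call Complete()...
sw.Stop();
Debug.Log("Sort took " + sw.Elapsed.TotalMilliseconds + "ms");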

Edit: I just discovered NativeArray has a Sort extension method. I went ahead and tested this in an IJob (BurstCompiled, of course), and using this method takes around 0.65ms. So, if you expect to have fewer than 10000 items, it's probably easier to use the Sort method, though if you need every bit of performance you can get, parallel sorting is the way to go. Also, we can probably push the parallel sorting times even lower.
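
For reference, the single-job baseline I tested was roughly this (a sketch from memory, not my exact code):

[BurstCompile]
struct SingleSortJob : IJob
{
    public NativeArray<int> values;

    public void Execute()
    {
        //the Sort extension method mentioned above, from Unity.Collections
        values.Sort();
    }
}

//Scheduled as: new SingleSortJob { values = a }.Schedule(inputDeps).Complete();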

Please feel free to critique the sorting code, measurement code, or anything else. Is there anything that can be changed to provide even better results? This is my first foray into parallel sorting so it’s very possible there is a better way to do this!

Has anyone else experimented with parallel sorting in the context of the job system? What are your results? Please feel free to post your code. I’d like this thread to serve as a resource for anyone looking to implement a sorting system.

0.5ms parallel vs 0.65ms in a single job is, I would say, a bad tradeoff. If you have a couple of systems accessing different data running in parallel, it will likely result in an overall performance loss.

Sorting inherently consists of dependent calculations, so parallelising it is difficult.

Stating the problem at a higher level and finding a solution that is not as general-purpose might give you better performance. Can you naturally bucket your work into parts and then sort the parts separately? That way you still go wide, but just schedule a couple of totally independent sort IJobs.
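
For example, a minimal sketch of that idea (hypothetical bucket containers; each bucket is its own NativeArray, so the safety system sees no aliasing):

[BurstCompile]
struct SortBucketJob : IJob
{
    public NativeArray<int> bucket; //one independent bucket

    public void Execute()
    {
        bucket.Sort();
    }
}

//One job per bucket, all fully independent of each other:
//var handles = new NativeArray<JobHandle>(buckets.Length, Allocator.Temp);
//for (int i = 0; i < buckets.Length; i++)
//    handles[i] = new SortBucketJob { bucket = buckets[i] }.Schedule();
//JobHandle.CombineDependencies(handles).Complete();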

Hi, thanks for the information. What you say makes sense in general when you have multiple systems running in parallel; however, in my particular situation (and perhaps others find themselves in a similar situation), I am not so sure. I would be glad to hear your opinion.

Basically, I am writing a custom physics loop, dividing out the different steps in the loop into different systems. For example, I have a ProjectionSystem that projects the position of a collider for a given amount of time, an UpdateProjectedAABBSystem that calculates an AABB using that projected position and other properties of the collider (i.e., the radius of circle colliders), and then a BroadPhaseCollisionDetectionSystem that performs a rudimentary AABB overlap test on different colliders that can “interact.” There are many more systems, but the general idea is that each system is dependent on the systems prior, so there isn’t parallelism between systems.
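
To illustrate the chaining, here is roughly what one link in that chain looks like (a sketch with hypothetical component and job names, not my actual code). Each JobComponentSystem receives the prior systems' jobs as inputDeps and returns its own handle, so the steps run one after another:

using Unity.Burst;
using Unity.Collections;
using Unity.Entities;
using Unity.Jobs;
using Unity.Mathematics;

public struct ProjectedPosition : IComponentData { public float2 Value; }
public struct CircleRadius : IComponentData { public float Value; }
public struct ProjectedAABB : IComponentData { public float2 Min, Max; }

public class UpdateProjectedAABBSystem : JobComponentSystem
{
    [BurstCompile]
    struct ComputeAABBJob : IJobProcessComponentData<ProjectedPosition, CircleRadius, ProjectedAABB>
    {
        public void Execute([ReadOnly] ref ProjectedPosition pos, [ReadOnly] ref CircleRadius radius, ref ProjectedAABB aabb)
        {
            aabb.Min = pos.Value - radius.Value;
            aabb.Max = pos.Value + radius.Value;
        }
    }

    protected override JobHandle OnUpdate(JobHandle inputDeps)
    {
        //inputDeps already includes ProjectionSystem's jobs for the components we read,
        //so returning this handle makes the later systems wait on us.
        return new ComputeAABBJob().Schedule(this, inputDeps);
    }
}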

The sorting system is a part of this loop later down the chain and is intended to filter out duplicate collision entries (a moving collider can overlap with two or more separate wall colliders, but only the earliest collision actually occurred, so we can remove the later ones). To do this, I sort collisions by when they occurred, and then try to add each collision to a hash map. If the hash map already contains a key associated with the moving collider, that collision is removed. Because the collisions are sorted, this ensures only the earliest collision remains for each moving collider.
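
In code, that filtering step looks roughly like this (a sketch with hypothetical type and field names):

struct CollisionEntry
{
    public int moverId; //hypothetical: id of the moving collider
    public float time;  //hypothetical: when the collision occurred
}

[BurstCompile]
struct FilterEarliestCollisions : IJob
{
    [ReadOnly] public NativeArray<CollisionEntry> sortedCollisions; //sorted by time, earliest first
    public NativeHashMap<int, CollisionEntry> earliestByCollider;

    public void Execute()
    {
        for (int i = 0; i < sortedCollisions.Length; i++)
        {
            //TryAdd fails if the key already exists, so since the input is sorted,
            //only the earliest collision survives for each moving collider.
            earliestByCollider.TryAdd(sortedCollisions[i].moverId, sortedCollisions[i]);
        }
    }
}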

So when the sorting is going on, there aren’t any other systems running. If I could have other systems running in parallel I would, but so far I haven’t seen the opportunity. Am I using the ECS/Job system in a bad way? Does a parallel sort under these circumstances make sense?

Thanks!

I don’t know if it’s useful, but I was interested :)

On my machine (a Ryzen with 6 cores) your solution was actually slower (0.59ms vs 0.5ms) than just using Sort in a job for 10000 entries. At 100,000 yours is faster (4.25ms vs 5.97ms).

Here is my take on the parallel MergeSort. A lot fewer jobs and less parallelism, but still faster: 2.57ms at 100k, 0.32ms at 10k.

using Unity.Entities;
using Unity.Jobs;
using Unity.Collections;
using UnityEngine;
using Unity.Burst;
using Unity.Mathematics;
using UnityEditor;
using UnityEngine.Assertions;
using static Unity.Mathematics.math;
using Random = UnityEngine.Random;

public class MySorterSystem : JobComponentSystem
{
    int runs = 100, itemsToSort = 10000, x = 0;
    int y = 0; //You can comment this out/delete if you don't need to verify that the sort worked correctly
    float sum;

    [BurstCompile]
    struct SortPart : IJobParallelForBatch
    {
        public NativeArray<int> input;

        public void Execute(int start, int length)
        {
            new NativeSlice<int>(input, start, length).Sort();
        }
    }

    [BurstCompile]
    struct MergeSort : IJobParallelForBatch
    {
        [ReadOnly] public NativeArray<int> input;
        [WriteOnly] public NativeArray<int> output;

        public void Execute(int startIndex, int count)
        {
            var input = new NativeSlice<int>(this.input, startIndex, count);
            var output = new NativeSlice<int>(this.output, startIndex, count);

            var half = input.Length / 2;
            var i = 0;
            var j = int4(0, half, 0, -100);
            for (; i < count && j[3] < -1f; i++)
            {
                j = select(
                    int4(j[0] + 1, j[1], j[0], j[0] - half),
                    int4(j[0], j[1] + 1, j[1], j[1] - count),
                    input[j[0]] > input[j[1]]);
                output[i] = input[j[2]];
            }

            var outSlice = new NativeSlice<int>(output, i, count - i);
            if (j[0] < half)
            {
                var inSlice = new NativeSlice<int>(input, j[0], half - j[0]);
                outSlice.CopyFrom(inSlice);
            }
            else
            {
                var inSlice = new NativeSlice<int>(input, j[1], count - j[1]);
                outSlice.CopyFrom(inSlice);
            }
        }
    }

    protected override JobHandle OnUpdate(JobHandle inputDeps)
    {
        if (x == runs)
        {
            Debug.Log("My Average sorts time = " + (sum / runs * 1000f) + "ms");
            x = 0;
            sum = 0f;
        }

        x++;

        var a = new NativeArray<int>(itemsToSort, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var b = new NativeArray<int>(itemsToSort, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);

        for (var i = 0; i < b.Length; i++)
            b[i] = i;
        for (int i = 0; i < a.Length; i++)
        {
            var r = Random.Range(0, b.Length - i);
            a[i] = b[r];
            b[r] = b[b.Length - i - 1];
        }

        //Used to measure how long the sort took. This might not be the best way but oh well.
        var startTime = Time.realtimeSinceStartup;

        const int parts = 8;
        inputDeps = new SortPart()
        {
            input = a,
        }.ScheduleBatch(a.Length, a.Length / parts, inputDeps);

//        inputDeps = new SortPart()
//        {
//            input = a,
//        }.ScheduleBatch(a.Length, a.Length, inputDeps);

        var input = b;
        var output = a;
        for (var p = parts / 2; p > 0; p /= 2)
        {
            var t = input;
            input = output;
            output = t;
           
            inputDeps = new MergeSort()
            {
                input = input,
                output = output,
            }.ScheduleBatch(a.Length, a.Length / p, inputDeps);
        }

        inputDeps.Complete();

        var timeSortTook = Time.realtimeSinceStartup - startTime;
        sum += timeSortTook;

        if (y++ == 0)
        {
            var allEqual = true;
            for (var i = 0; i < output.Length; i++)
                allEqual &= output[i] == i;
            Assert.IsTrue(allEqual);
        }

        a.Dispose();
        b.Dispose();

        return inputDeps;
    }
}

Some physics engines solve this by having a phase that separates the simulation into islands. This way each island can be treated independently and thus run in parallel to each other. No need to have parallel sorting for that.

This is essentially the concept of finding higher level buckets that can run in parallel.

One thing to really take into account is battery consumption on mobile.

As an overly simplified rule of thumb:

  • Transferring data takes a lot of energy, computation less so
  • Generally on mobile the most important thing is to get the work done most efficiently. Sub-linear speedups as cores are added are a bad idea when optimizing for mobile, since they result in more total work performed and thus more energy consumed. On the other hand, perfectly linear speedups are good, because they get you to a point where you can let the CPU sleep for a prolonged period of time sooner.

Microbenchmarks can also be misleading in this case. A real game, if done right, has a bunch of subsystems, which gives all of them a better opportunity to run in parallel; in a synthetic benchmark you only have one thing running, so a parallel-for might look better than it would in practice.

So you are saying to have each island’s simulation run sequentially, but with each island running in parallel to the others? What would be the difference between that and just having a single island, with each step of the simulation utilizing parallelization to complete faster?

Also, how would this work in terms of Unity’s system? I would think you’d have to create a separate World for each island, to allow for parallel writing to the same component type between islands. Is that right? Then within each world you would just use the IJob interface within each JobComponentSystem rather than IJobParallelFor/Batch/ProcessComponentData?

Thanks! I am very new to parallel stuff so it is a learning process.

Fast parallel integer sorting is fairly simple, though I haven’t tried it out yet in Unity’s job system.

You start from a basic least-significant-digit radix sort. The only real trick is to split the input array into tiles to process in parallel; everything else follows from that. Each tile counts into its own histogram. Then, you transform the histograms into write-to offsets: for a given tile and digit, the offset is the count of that digit in all preceding tiles, plus the total count of all smaller digits across every tile. That last part is a little confusing, but you’re just finding the right allocation point for each tile in the output array. Finally, you move elements just as in a serial radix sort. Repeat for each radix. (A sketch of one pass follows.)
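
To make the offset computation concrete, here is a serial sketch of one 8-bit pass using the tiled scheme described above (plain C# with illustrative names, not my C++ implementation mentioned below; in the job system, steps 1 and 3 would each be a parallel-for over tiles):

using System;

public static class TiledRadixSort
{
    //One 8-bit LSD radix pass from src into dst, using per-tile histograms.
    //Call with shift = 0, 8, 16, 24, swapping src/dst between passes
    //(assumes non-negative keys for simplicity).
    public static void RadixPass(int[] src, int[] dst, int shift, int tileSize)
    {
        int tiles = (src.Length + tileSize - 1) / tileSize;
        var hist = new int[tiles, 256];

        //1. Each tile counts into its own histogram (one parallel job per tile).
        for (int t = 0; t < tiles; t++)
        {
            int start = t * tileSize, end = Math.Min(start + tileSize, src.Length);
            for (int i = start; i < end; i++)
                hist[t, (src[i] >> shift) & 0xFF]++;
        }

        //2. Transform the histograms into write offsets. For tile t and digit d, the
        //offset is the total count of all digits < d plus the count of digit d in tiles < t.
        var offsets = new int[tiles, 256];
        int running = 0;
        for (int d = 0; d < 256; d++)
            for (int t = 0; t < tiles; t++)
            {
                offsets[t, d] = running;
                running += hist[t, d];
            }

        //3. Each tile scatters its elements to dst (one parallel job per tile; the
        //offset ranges are disjoint, so no two tiles ever write to the same slot).
        for (int t = 0; t < tiles; t++)
        {
            int start = t * tileSize, end = Math.Min(start + tileSize, src.Length);
            for (int i = start; i < end; i++)
            {
                int d = (src[i] >> shift) & 0xFF;
                dst[offsets[t, d]++] = src[i];
            }
        }
    }
}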

The thing is, parallel sorting (like parallel-anything) is only worth it for a high enough N. My simple (145 commented LoC) C++ implementation of the above sorts 10,000 ints in 0.2 ms, but it sorts 260,000 ints in 1.4 ms. That’s a factor of 26 in element count for a factor of 7 in time, so the latter case could be considered roughly 4 times as efficient (26/7 ≈ 3.7).

Anyway, I’ve done enough research and experimentation on parallel sorting for games to share some hopefully handy conclusions:

The fastest gun in the west is an MSD radix sort that adapts to smaller sub-problems with different algorithms. That’s probably overkill for any game dev’s practical purposes, though. Simple parallel LSD radix sorts consistently sort hundreds of millions of elements per second at reasonable problem sizes. You’ll hear about parallel merge sorts, quicksorts etc., but these only suit programming exercises. They’re either far slower or more complex or both, even when they beat LSD radix sort for cleverness. The sort rate of parallel radix sort gets really crazy once you customize to your specific problem, since key-value-pair sorting is the typical use-case; you don’t need to sort every radix.

Finally, the sole integer sorting case where radix sort is not necessarily fastest is for almost-sorted data, where maintaining a sorted array is the typical use-case. It’s hard to draw firm conclusions here because there’s such a wide range of almost-sortedness you could encounter. Anyway, there’s little research on parallel adaptive sorting. I suspect for game-size workloads, serial implementations of highly adaptive sorts will always be faster. Never looked too far into it, though.

For a research project I am working on, I needed a fast sorting algorithm in Unity and came across this post. I have some experience with using a parallel radix sort and parallel merge sort for a DirectX 12 project I was working on a few years ago and figured that a parallel merge sort was the way to go.

@julian-moschuering, I initially tried using your algorithm, but it seems to only work if the list size is divisible by 8.

After a few variations and a lot of testing, I think I came up with a general solution that works for all sizes. During testing, I noticed that for small lists, the Sort method of the NativeSortExtension works better than this implementation; for medium lists (between 1000 and 10000 items), the SortJob method works best; and for larger lists (more than 10000 items), this implementation performs a lot better than either of the sort methods in the Collections package.

Below you will find the code, but you can also find the package at https://github.com/BredaUniversityResearch/UnitySort.

using System.Collections.Generic;
using Unity.Burst;
using Unity.Collections;
using Unity.Jobs;
using UnityEngine;

namespace Cradle
{
    /// <summary>
    /// This job sorts equally sized parts of the initial array.
    /// </summary>
    /// <typeparam name="T"></typeparam>
    /// <typeparam name="U"></typeparam>
    // Source: https://discussions.unity.com/t/714277
    [BurstCompile(CompileSynchronously = true)]
    internal struct SortPartJob<T, U> : IJobParallelForBatch
        where T : struct
        where U : struct, IComparer<T>
    {
        public NativeArray<T> Src;
        public U Cmp;

        public void Execute(int startIndex, int count)
        {
            Src.Slice(startIndex, count).Sort(Cmp);
        }
    }

    [BurstCompile(CompileSynchronously = true)]
    internal struct MergePartJob<T, U> : IJob
        where T : struct
        where U : struct, IComparer<T>
    {
        [ReadOnly]
        public NativeArray<T> Src;
        [WriteOnly]
        public NativeArray<T> Dst;
        public U Cmp;

        public int Left;
        public int Mid;
        public int End;

        /// <summary>
        /// Merge a[left .. mid-1] and a[mid .. end-1] into b[left .. end-1].
        /// </summary>
        /// <typeparam name="T1">The value type.</typeparam>
        /// <typeparam name="U1">The IComparer type.</typeparam>
        /// <param name="a">The array to be sorted.</param>
        /// <param name="b">The result of sorting.</param>
        /// <param name="cmp">The Comparer to use for comparison.</param>
        /// <param name="left">The index of the left part to merge.</param>
        /// <param name="mid">The mid point between the two parts to merge.</param>
        /// <param name="end">The end of the parts to merge.</param>
        internal static void Merge<T1, U1>(in NativeArray<T1> a, NativeArray<T1> b, U1 cmp, int left, int mid, int end)
            where T1 : struct
            where U1 : struct, IComparer<T1>
        {
            int i = left;
            int j = mid;
            int k = left;

            // Perform the merge until either the left or right part is exhausted,
            // or the end of the list is reached.
            for (; k < end && i < mid && j < end; ++k)
            {
                if (cmp.Compare(a[i], a[j]) <= 0)
                {
                    b[k] = a[i++];
                }
                else
                {
                    b[k] = a[j++];
                }
            }

            // Copy any remaining values.
            if (i < mid)
            {
                var src = new NativeSlice<T1>(a, i, mid - i);
                var dst = new NativeSlice<T1>(b, k, end - k);
                dst.CopyFrom(src);
            }
            else if (j < end)
            {
                var src = new NativeSlice<T1>(a, j, end - j);
                var dst = new NativeSlice<T1>(b, k, end - k);
                dst.CopyFrom(src);
            }
        }

        public void Execute()
        {
            Merge(Src, Dst, Cmp, Left, Mid, End);
        }
    }

    [BurstCompile(CompileSynchronously = true)]
    internal struct MergeSortJob<T, U> : IJobParallelForBatch
        where T : struct
        where U : struct, IComparer<T>
    {
        [ReadOnly]
        public NativeArray<T> Src;
        [WriteOnly]
        public NativeArray<T> Dst;
        public U Cmp;

        public void Execute(int startIndex, int count)
        {
            int left = startIndex;
            int end = startIndex + count;
            int mid = (left + end) / 2;
            // Merge the two parts of the source array into the destination array.
            MergePartJob<T, U>.Merge(Src, Dst, Cmp, left, mid, end);
        }
    }

    [BurstCompile(CompileSynchronously = true)]
    internal struct CopyJob<T> : IJobParallelFor
        where T : struct
    {
        [ReadOnly]
        public NativeSlice<T> Src;
        [WriteOnly]
        public NativeSlice<T> Dst;

        public void Execute(int i)
        {
            Dst[i] = Src[i];
        }
    }

    public static class Sort
    {
        /// <summary>
        /// Swap the arrays. This is very efficient for NativeArray as only the internal
        /// memory pointer is swapped, not the values of the array.
        /// </summary>
        /// <typeparam name="T">The value type of the arrays being swapped.</typeparam>
        /// <param name="a">The first array to swap with the second.</param>
        /// <param name="b">The second array to swap with the first.</param>
        private static void Swap<T>(ref NativeArray<T> a, ref NativeArray<T> b)
            where T : struct
        {
            var t = a;
            a = b;
            b = t;
        }

        /// <summary>
        /// Sort an array of values.
        /// </summary>
        /// <typeparam name="T">The type of the array to sort.</typeparam>
        /// <typeparam name="U">An IComparer to use for the comparison of values.</typeparam>
        /// <param name="a">The array to sort.</param>
        /// <param name="cmp">The IComparer to use for value comparison.</param>
        public static void MergeSort<T, U>(NativeArray<T> a, in U cmp)
            where T : unmanaged
            where U : struct, IComparer<T>
        {
            switch (a.Length)
            {
                case <= 1:
                    // Ignore lists of 0 or 1 values.
                    break;
                case <= 1000:
                    a.Sort(cmp);
                    break;
                case <= 10000:
                    a.SortJob(cmp).Schedule().Complete();
                    break;
                default:
                    // Compute the number of parts for best parallelism.
                    int parts = Mathf.ClosestPowerOfTwo((int)Mathf.Log(a.Length, 2));

                    // Create a working copy to perform the merge sort.
                    var b = new NativeArray<T>(a.Length, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);

                    var sort = new SortPartJob<T, U>()
                    {
                        Src = a,
                        Cmp = cmp
                    }.ScheduleBatch(a.Length, a.Length / parts);

                    var input = b;
                    var output = a;
                    // What is the length of the array that is exactly divisible by parts?
                    var length = a.Length / parts * parts;
                    for (var p = parts / 2; p > 0; p /= 2)
                    {
                        // Swap the input/output pointers.
                        Swap(ref input, ref output);

                        sort = new MergeSortJob<T, U>()
                        {
                            Src = input,
                            Dst = output,
                            Cmp = cmp
                        }.ScheduleBatch(length, length / p, sort);
                    }

                    // If the length of the array that is exactly divisible by parts is less
                    // than the length of the source array, there will be 1 part that needs to be
                    // merged into the final array.
                    if (length < a.Length)
                    {
                        // Sort the remaining part.
                        Swap(ref input, ref output);

                        // If the input array is b,
                        // then we need to copy the last sorted part from a into b
                        // before merging the last part.
                        if (input == b)
                        {
                            sort = new CopyJob<T>()
                            {
                                Src = a.Slice(length),
                                Dst = b.Slice(length),
                            }.Schedule(a.Length - length, 64, sort);
                        }

                        sort = new MergePartJob<T, U>()
                        {
                            Src = input,
                            Dst = output,
                            Cmp = cmp,
                            Left = 0,
                            Mid = length,
                            End = a.Length
                        }.Schedule(sort);
                    }

                    if (output == b)
                    {
                        // If the final sorted list is in b,
                        // we need to copy the final sorted result to a.
                        // We use a job to ensure the copy occurs
                        // after the sort job is finished.
                        sort = new CopyJob<T>()
                        {
                            Src = b,
                            Dst = a,
                        }.Schedule(a.Length, 64, sort);
                    }

                    // Wait for the sort jobs to finish.
                    sort.Complete();

                    b.Dispose();
                    break;
            }
        }
    }
}

This method also uses generics, which allows you to sort more than just ints.
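
For example, a hypothetical struct sorted by a float key with a custom comparer (illustrative names only):

using System.Collections.Generic;
using Unity.Collections;

struct Particle
{
    public float Depth;
}

struct ByDepth : IComparer<Particle>
{
    public int Compare(Particle a, Particle b) => a.Depth.CompareTo(b.Depth);
}

//...
var particles = new NativeArray<Particle>(50000, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
//...fill particles, then sort them by depth:
Cradle.Sort.MergeSort(particles, new ByDepth());
particles.Dispose();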

The following code can be used to test the performance of the various sorting algorithms.

using System;
using System.Collections.Generic;
using NUnit.Framework;
using Unity.PerformanceTesting;
using Unity.Collections;

public class SortTests
{
    // Number of values to use for testing.
    const int NUM_VALUES = 100000;

    public struct DefaultComparer<T> : IComparer<T>
        where T : struct, IComparable<T>
    {
        public int Compare(T x, T y)
        {
            return x.CompareTo(y);
        }
    }

    [Test, Performance]
    public void SortPerformance()
    {
        var r = new System.Random();

        var a = new NativeArray<int>(NUM_VALUES, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var cmp = new DefaultComparer<int>();

        for (int i = 0; i < a.Length; ++i)
        {
            a[i] = r.Next();
        }

        Measure.Method(() =>
        {
            a.Sort(cmp);
        })
            .WarmupCount(1)
            .MeasurementCount(10)
            .SampleGroup($"NativeArray.Sort ({a.Length} values)")
            .Run();

        a.Dispose();
    }

    [Test, Performance]
    public void SortJobPerformance()
    {
        var r = new System.Random();

        var a = new NativeArray<int>(NUM_VALUES, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var cmp = new DefaultComparer<int>();

        for (int i = 0; i < a.Length; ++i)
        {
            a[i] = r.Next();
        }

        Measure.Method(() =>
        {
            a.SortJob(cmp).Schedule().Complete();
        })
            .WarmupCount(1)
            .MeasurementCount(10)
            .SampleGroup($"NativeArray.SortJob ({a.Length} values)")
            .Run();

        a.Dispose();
    }

    [Test, Performance]
    public void MergeSortPerformance()
    {
        var r = new System.Random();

        var a = new NativeArray<int>(NUM_VALUES, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var cmp = new DefaultComparer<int>();

        for (int i = 0; i < a.Length; ++i)
        {
            a[i] = r.Next();
        }

        Measure.Method(() =>
        {
            Cradle.Sort.MergeSort(a, cmp);
        })
            .WarmupCount(1)
            .MeasurementCount(10)
            .SampleGroup($"Cradle.MergeSort ({a.Length} values)")
            .Run();

        a.Dispose();
    }

    [Test]
    public void MergeSortTest()
    {
        var r = new System.Random();

        var a = new NativeArray<int>(NUM_VALUES, Allocator.TempJob, NativeArrayOptions.UninitializedMemory);
        var cmp = new DefaultComparer<int>();

        for (int i = 0; i < a.Length; ++i)
        {
            a[i] = r.Next();
        }

        Cradle.Sort.MergeSort(a, cmp);

        bool isSorted = true;
        for (int i = 0; i < a.Length - 1 && isSorted; ++i)
        {
            isSorted &= a[i] <= a[i + 1];
        }

        Assert.IsTrue(isSorted);

        a.Dispose();
    }
}

I’m very curious to see other people’s experiences with this method, and which methods perform best depending on your processor and number of cores. If you have any performance results, please share them here!

It’s been a while, but thanks for the input everyone! Hopefully someone finds this thread useful in the future.
