Odd Burst performance

This produces strange results:
Update() runs at 17 fps on a Tegra X1, as expected.
The jobified version runs at 42 fps.
But jobs + Burst run at 40 fps, which is less than the non-Burst jobs.
Can you explain why?

using System;
using System.Collections;
using System.Collections.Generic;
using Unity.Burst;
using Unity.Jobs;
using UnityEngine;
using UnityEngine.Jobs;
using Random = UnityEngine.Random;
using Unity.Mathematics;

/// <summary>
/// Benchmark component that spawns <see cref="count"/> copies of <see cref="prefab"/> and
/// nudges every one of them each frame. <see cref="mode"/> selects the strategy:
/// 0 = main-thread loop, 1 = <see cref="WobbleJob"/>, 2 = Burst-compiled <see cref="WobbleBurstJob"/>.
/// Any other value leaves the objects untouched.
/// </summary>
public class InstantiateThenMove : MonoBehaviour
{
    public GameObject prefab;
    public int count = 1000;
    public float range = 50;
    public float deltaChange = 0.1f;
    public int mode;

    Transform[] transforms;
    TransformAccessArray transformAccess;
    JobHandle jobHandle;

    /// <summary>Selects the update strategy used from the next <c>Update</c> on.</summary>
    /// <param name="m">0 = main thread, 1 = job, 2 = Burst job.</param>
    public void SwitchMode(int m)
    {
        mode = m;
    }

    /// <summary>
    /// Instantiates the prefabs at random positions inside a sphere of radius
    /// <see cref="range"/> and registers their transforms for job access.
    /// </summary>
    void Start()
    {
        transforms = new Transform[count];
        for (int i = 0; i < count; i++)
        {
            transforms[i] = Instantiate(prefab, Random.insideUnitSphere * range, Random.rotation).transform;
        }
        transformAccess = new TransformAccessArray(transforms);
    }

    void Update()
    {
        switch (mode)
        {
            case 0:
                Wobble();
                break;
            case 1:
                WobbleJobbified();
                break;
            case 2:
                WobbleBurstedJobbified();
                break;
        }
    }

    /// <summary>Moves every transform on the main thread.</summary>
    void Wobble()
    {
        var time = Time.time;
        var step = Time.deltaTime * deltaChange;
        // Iterate over the array that was actually built in Start; `count` may have been
        // edited in the inspector since then.
        for (int i = 0; i < transforms.Length; i++)
        {
            var offset = new Vector3(Mathf.Sin(time + i), Mathf.Cos(time + 1 + i), Mathf.Sin(time + 2 + i));
            transforms[i].position += offset * step;
        }
    }

    /// <summary>Schedules the plain (non-Burst) transform job for this frame.</summary>
    void WobbleJobbified()
    {
        var job = new WobbleJob()
        {
            deltaTime = Time.deltaTime,
            deltaChange = deltaChange,
            time = Time.time
        };
        jobHandle = job.Schedule(transformAccess);
    }

    /// <summary>Schedules the Burst-compiled transform job for this frame.</summary>
    void WobbleBurstedJobbified()
    {
        var job = new WobbleBurstJob()
        {
            deltaTime = Time.deltaTime,
            deltaChange = deltaChange,
            time = Time.time
        };
        jobHandle = job.Schedule(transformAccess);
    }

    void LateUpdate()
    {
        // Always complete: the mode may have been switched to 0 after a job was scheduled
        // this frame, and completing a default or already-finished handle is harmless.
        jobHandle.Complete();
    }

    /// <summary>Per-transform wobble executed by the job system without Burst.</summary>
    struct WobbleJob : IJobParallelForTransform
    {
        public float time;
        public float deltaTime;
        public float deltaChange;

        public void Execute(int i, TransformAccess t)
        {
            // Local scratch value instead of a struct field: keeps the job data small
            // and avoids mutating job state from Execute.
            var offset = new Vector3(Mathf.Sin(time + i), Mathf.Cos(time + 1 + i), Mathf.Sin(time + 2 + i));
            t.position += offset * (deltaTime * deltaChange);
        }
    }

    /// <summary>Same work as <see cref="WobbleJob"/>, compiled with Burst.</summary>
    [BurstCompile]
    struct WobbleBurstJob : IJobParallelForTransform
    {
        public float time;
        public float deltaTime;
        public float deltaChange;

        public void Execute(int i, TransformAccess t)
        {
            var offset = new Vector3(Mathf.Sin(time + i), Mathf.Cos(time + 1 + i), Mathf.Sin(time + 2 + i));
            t.position += offset * (deltaTime * deltaChange);
        }
    }

    private void OnDestroy()
    {
        // A job scheduled this frame may still be using the array; finish it before disposing.
        jobHandle.Complete();
        // Start may never have run (e.g. the component was destroyed while disabled).
        if (transformAccess.isCreated)
        {
            transformAccess.Dispose();
        }
    }
}

It’s 30,000 cubes with no colliders and the camera turned off.

safety check
6347187--705408--upload_2020-9-24_22-12-8.png
and
6347187--705411--upload_2020-9-24_22-12-18.png

But I think these only affect the editor.

You are using Mathf…?

Yes

IJobParallelFor, no transform update happening


That’s more like it.
So IJobParallelFor gets Burst-compiled fine, but IJobParallelForTransform doesn’t benefit.
It must be because the transform update caps the possible gain.

I mean, why are you using Mathf instead of Unity.Mathematics?
Burst is largely about SIMD, and Unity.Mathematics was made for Burst. If you want to test the power of Burst, you definitely have to use Unity.Mathematics.

Because I tried Unity.Mathematics and it’s slower in this test.
¯_(ツ)_/¯

code for posterity:

using System;
using System.Collections;
using System.Collections.Generic;
using Unity.Burst;
using Unity.Collections;
using Unity.Jobs;
using UnityEngine;
using UnityEngine.Jobs;
using Random = UnityEngine.Random;
using Unity.Mathematics;

/// <summary>
/// Benchmark component that wobbles <see cref="count"/> positions each frame without touching
/// any Transform. <see cref="mode"/> selects the strategy: 0 = main-thread loop over a managed
/// array, 1 = <see cref="WobbleJob"/>, 2 = Burst-compiled <see cref="WobbleBurstJob"/>
/// (both jobs operate on a NativeArray copy of the initial positions).
/// </summary>
public class InstantiateThenMove : MonoBehaviour
{
   // Inner-loop batch size handed to the job scheduler for both job variants.
   const int BatchSize = 1000;

   // Not instantiated in this version; kept so the component's public fields are unchanged.
   public GameObject prefab;
   public int count = 1000;
   public float range = 50;
   public float deltaChange = 0.1f;
   public int mode;

   float3[] positions;
   NativeArray<float3> positionsNative;
   JobHandle jobHandle;

   /// <summary>Selects the update strategy used from the next <c>Update</c> on.</summary>
   /// <param name="m">0 = main thread, 1 = job, 2 = Burst job.</param>
   public void SwitchMode(int m)
   {
      mode = m;
   }

   /// <summary>
   /// Fills the managed array with random points inside a sphere of radius
   /// <see cref="range"/> and copies it into a persistent NativeArray for the jobs.
   /// </summary>
   void Start()
   {
      positions = new float3[count];
      for (int i = 0; i < count; i++)
      {
         positions[i] = (float3) Random.insideUnitSphere * range;
      }
      positionsNative = new NativeArray<float3>(positions, Allocator.Persistent);
   }

   void Update()
   {
      switch (mode)
      {
         case 0:
            Wobble();
            break;
         case 1:
            WobbleJobbified();
            break;
         case 2:
            WobbleBurstedJobbified();
            break;
      }
   }

   /// <summary>Updates the managed position array on the main thread.</summary>
   void Wobble()
   {
      var time = Time.time;
      var step = Time.deltaTime * deltaChange;
      // Use the array's own length; `count` may have been edited since Start.
      for (int i = 0; i < positions.Length; i++)
      {
         var offset = math.float3(math.sin(time + i), math.cos(time + 1 + i), math.sin(time + 2 + i));
         positions[i] += offset * step;
      }
   }

   /// <summary>Schedules the plain (non-Burst) job over the native positions.</summary>
   void WobbleJobbified()
   {
      var job = new WobbleJob()
      {
         positions = positionsNative,
         deltaTime = Time.deltaTime,
         deltaChange = deltaChange,
         time = Time.time
      };
      jobHandle = job.Schedule(positionsNative.Length, BatchSize);
   }

   /// <summary>Schedules the Burst-compiled job over the native positions.</summary>
   void WobbleBurstedJobbified()
   {
      var job = new WobbleBurstJob()
      {
         positions = positionsNative,
         deltaTime = Time.deltaTime,
         deltaChange = deltaChange,
         time = Time.time
      };
      jobHandle = job.Schedule(positionsNative.Length, BatchSize);
   }

   void LateUpdate()
   {
      // Always complete: the mode may have been switched to 0 after a job was scheduled
      // this frame, and completing a default or already-finished handle is harmless.
      jobHandle.Complete();
   }

   /// <summary>Per-element wobble executed by the job system without Burst.</summary>
   struct WobbleJob : IJobParallelFor
   {
      public NativeArray<float3> positions;
      // Plain value fields are copied into the job; [ReadOnly] only matters on native containers.
      public float time;
      public float deltaTime;
      public float deltaChange;

      public void Execute(int i)
      {
         var offset = new float3(math.sin(time + i), math.cos(time + 1 + i), math.sin(time + 2 + i));
         positions[i] += offset * (deltaTime * deltaChange);
      }
   }

   /// <summary>Same work as <see cref="WobbleJob"/>, compiled with Burst.</summary>
   [BurstCompile]
   struct WobbleBurstJob : IJobParallelFor
   {
      public NativeArray<float3> positions;
      public float time;
      public float deltaTime;
      public float deltaChange;

      public void Execute(int i)
      {
         var offset = new float3(math.sin(time + i), math.cos(time + 1 + i), math.sin(time + 2 + i));
         positions[i] += offset * (deltaTime * deltaChange);
      }
   }

   private void OnDestroy()
   {
      // A job scheduled this frame may still be writing to the array; finish it before disposing.
      jobHandle.Complete();
      // Start may never have run (e.g. the component was destroyed while disabled).
      if (positionsNative.IsCreated)
      {
         positionsNative.Dispose();
      }
   }
}

It’s all scalar calculation, with no vector operations at all, so no vectorization is possible. Unity.Mathematics + Burst cannot do anything about it.

Use IJobParallelForBatch and put your for loop inside Execute(int startIndex, int count);
then Unity.Mathematics + Burst vectorization could make it faster.
For IJobParallelForTransform I’m not sure, as it involves the UnityEngine core C++ module. Burst may not be able to make that faster.

1 Like

6347640--705516--9252020-42211-PM.png
Not as good as I expected, but still twice as fast.
The non-Burst version takes about 16 ms for both systems.

    /// <summary>
    /// Burst-compiled per-element job: adds a sin/cos/sin offset, scaled by
    /// <c>deltaTime * deltaChange</c>, to each entry of <see cref="positions"/>.
    /// </summary>
    [BurstCompile]
    struct WobbleBurstJob : IJobParallelFor
    {
        public NativeArray<float3> positions;
        [ReadOnly] public float time;
        [ReadOnly] public float deltaTime;
        [ReadOnly] public float deltaChange;

        /// <summary>Updates the position at index <paramref name="i"/>.</summary>
        public void Execute(int i)
        {
            // Each component is evaluated separately, as a scalar, with the index as phase.
            float x = math.sin(time + i);
            float y = math.cos(time + 1 + i);
            float z = math.sin(time + 2 + i);
            float scale = deltaTime * deltaChange;

            positions[i] = positions[i] + new float3(x, y, z) * scale;
        }
    }
    /// <summary>
    /// Benchmark system: on every update it allocates one million positions, runs
    /// <see cref="WobbleBurstJob"/> over them to completion, and frees the buffer.
    /// </summary>
    public class TestSystem : SystemBase
    {
        protected override void OnUpdate()
        {
            var buffer = new NativeArray<float3>(1000000, Allocator.TempJob);

            var job = new WobbleBurstJob();
            job.positions = buffer;
            job.time = 1;
            job.deltaTime = 0.1f;
            job.deltaChange = 0.3f;

            // Schedule in batches of 64 and block until the work is finished.
            var handle = job.Schedule(buffer.Length, 64);
            handle.Complete();

            buffer.Dispose();
        }
    }

    /// <summary>
    /// Burst-compiled batch job: for each index in the batch, adds
    /// (sin(time + i), cos(time + 1 + i), sin(time + 2 + i)) scaled by
    /// <c>deltaTime * deltaChange</c> to <see cref="positions"/>. The explicit loop and
    /// the single vector <c>math.sin</c> call give Burst something to vectorize.
    /// </summary>
    [BurstCompile]
    struct WobbleBurstJobBatch : IJobParallelForBatch
    {
        public NativeArray<float3> positions;
        [ReadOnly] public float time;
        [ReadOnly] public float deltaTime;
        [ReadOnly] public float deltaChange;

        /// <summary>Processes indices [startIndex, startIndex + count).</summary>
        /// <param name="startIndex">First index of the batch.</param>
        /// <param name="count">Number of elements in the batch.</param>
        public void Execute(int startIndex, int count)
        {
            // cos(x) == sin(x + PI/2), so the y component is folded into the same sin call
            // by adding a quarter-period (PI/2) phase offset. The previous offset of
            // -PI/4 did not reproduce the cosine used by the per-element jobs.
            float3 phase = time + math.float3(0, 1 + math.PI * 0.5f, 2);
            // Loop-invariant scale, hoisted out of the loop.
            float step = deltaTime * deltaChange;
            for (int i = startIndex, end = startIndex + count; i < end; i++)
            {
                positions[i] += math.sin(phase + i) * step;
            }
        }
    }

    /// <summary>
    /// Benchmark system: on every update it allocates one million positions, runs
    /// <see cref="WobbleBurstJobBatch"/> over them to completion, and frees the buffer.
    /// </summary>
    public class TestSystemBatch : SystemBase
    {
        protected override void OnUpdate()
        {
            var buffer = new NativeArray<float3>(1000000, Allocator.TempJob);

            var job = new WobbleBurstJobBatch();
            job.positions = buffer;
            job.time = 1;
            job.deltaTime = 0.1f;
            job.deltaChange = 0.3f;

            // Schedule in batches of 64 and block until the work is finished.
            var handle = job.ScheduleBatch(buffer.Length, 64);
            handle.Complete();

            buffer.Dispose();
        }
    }
2 Likes

One problem I see is the var tmp is part of the job struct. At the very least, that means the struct is 12 bytes larger, but it could also be trying to copy around the new value.

What’s the solution?
I was trying to get rid of that by using Vector3.Set instead; Unity.Mathematics is slower overall in my tests.

What does that mean?
Looking at your code, I think you get the 2x speedup because of your hand optimization with the pi phase shift, not because of the structure of the job.

“Non-Burst version” means the version with [BurstCompile] removed.
Both systems run at about 16 ms.
And yes, the “shift pi” is a hand optimization, but the three sin calls are also vectorized into SIMD by Burst. That is where the 2x speedup really comes from.
Your “sin cos sin” code is a hand de-optimization that prevents Burst from doing anything.
Scalar calculation is not what Burst + Unity.Mathematics is good at. There is more information in the Burst package documentation; read it before doing more tests.

Your code, which I edited, mixes float3 and scalars. If there is no branch in the loop, all scalar math operations should be auto-vectorized to use SIMD. IJobParallelForBatch ensures that there is a loop. For IJobParallelFor, if your Execute function is inlined into the caller, it ends up inside a loop; if it is not inlined, there is no loop. That depends on how the C# compiler generates IL code. Burst works on that generated IL and produces native code in turn.

As for Mathf vs. math on scalar values: it could have something to do with the actual implementation.
For example, they may use a different number of terms in the Taylor series, so the slower one may be more accurate. And according to your test data, they are not very different in performance.

But that’s not the reason to use or not use Burst. Utilizing features like SIMD is the real boost. Again, check the documentation before you do any serious testing.

1 Like