Hi, I am preparing a tutorial on JobSystem and Burst. I did some simple Burst performance measurements using different approaches to float3/float4 handling in terms of SIMD. I was always confused when writing algorithms that mix these two types: I wasn’t sure whether extra operations are added due to swizzles, data loading, etc.
Although it is a simple synthetic test, maybe some of you will find it useful.
The table below shows timings in ms for data arrays of 100k, 1M, and 10M elements, with the instruction count in brackets.
Job (instruction count), followed by timings for 100k / 1M / 10M elements:
Float3 (18)
0.28ms 2.19ms 21.8ms
Float4to3 (15)
0.28ms 2.62ms 26.1ms
Float4 (14)
0.28ms 2.64ms 26.1ms
Also, this is interesting in the context of the GPU, but I guess it is also valid for SIMD.
The question is whether the Burst compiler can also combine different ops together and what are the best practices regarding this topic, in general. If someone from Unity ( @Joachim_Ante_1 ?) could comment on this, that would be much appreciated.
First, just for reference, a scalar version: 14 assembly instructions.
/// <summary>
/// Scalar reference job. For each of the first <c>dataSize</c> indices it reads one
/// float from each input array and writes ((a + b) - (a * b)) / 10 to the output array.
/// </summary>
[ComputeJobOptimization]
struct Float1Job : IJob
{
    public int dataSize;
    [ReadOnly] public NativeArray<float> dataA;
    [ReadOnly] public NativeArray<float> dataB;
    [WriteOnly] public NativeArray<float> dataOut;

    public void Execute()
    {
        int index = 0;
        while (index < dataSize)
        {
            float lhs = dataA[index];
            float rhs = dataB[index];

            // Same operation order as sum - mul, then the division by ten.
            float added = lhs + rhs;
            float multiplied = lhs * rhs;
            dataOut[index] = (added - multiplied) / 10.0f;

            index++;
        }
    }
}
; Per-iteration loop body listed for the scalar job (one float processed per pass).
; NOTE(review): rcx presumably points at the job struct; offsets 8 / 64 / 120 look like the
; dataA / dataB / dataOut buffer pointers and [rcx] like dataSize — confirm against the struct layout.
mov r8, qword ptr [rcx + 8]
mov r9, qword ptr [rcx + 64]
; Load one 32-bit float from each input at byte offset rax.
movss xmm1, dword ptr [r8 + rax]
movss xmm2, dword ptr [r9 + rax]
; xmm3 = xmm2 + xmm1, xmm2 = xmm2 * xmm1, then xmm3 = xmm3 - xmm2 (scalar ops on the low lane).
movaps xmm3, xmm2
addss xmm3, xmm1
mulss xmm2, xmm1
subss xmm3, xmm2
; Scale by xmm0, which is set up outside this listing — presumably 0.1f standing in for "/ 10.0f".
mulss xmm3, xmm0
; Store the result at the same byte offset in the third buffer.
mov rdx, qword ptr [rcx + 120]
movss dword ptr [rdx + rax], xmm3
; Advance the element counter and the byte offset (4 bytes per element), then compare
; the counter with the value at [rcx]; the conditional branch itself is not shown.
inc r10d
add eax, 4
cmp r10d, dword ptr [rcx]
Next, both the data and the calculations use float3s. We get 18 instructions, with a few extra insertps and extractps instructions to prepare the data for SIMD:
/// <summary>
/// float3 variant: both the stored data and the arithmetic use float3.
/// For each of the first <c>dataSize</c> indices it writes
/// ((a + b) - (a * b)) / 10 component-wise to the output array.
/// </summary>
[ComputeJobOptimization]
struct Float3Job : IJob
{
    public int dataSize;
    [ReadOnly] public NativeArray<float3> dataA;
    [ReadOnly] public NativeArray<float3> dataB;
    [WriteOnly] public NativeArray<float3> dataOut;

    public void Execute()
    {
        int index = 0;
        while (index < dataSize)
        {
            float3 lhs = dataA[index];
            float3 rhs = dataB[index];

            float3 added = lhs + rhs;
            float3 multiplied = lhs * rhs;
            dataOut[index] = (added - multiplied) / 10.0f;

            index++;
        }
    }
}
; Per-iteration loop body listed for the float3 job (12-byte elements, packed SSE arithmetic).
; Sign-extend eax into rax so it can be used as a 64-bit byte offset.
cdqe
; NOTE(review): rcx presumably points at the job struct; offsets 8 / 64 / 120 look like the
; dataA / dataB / dataOut buffer pointers — confirm against the struct layout.
mov r8, qword ptr [rcx + 8]
mov r9, qword ptr [rcx + 64]
; Each 12-byte element is loaded in two steps: movsd brings in the first 8 bytes (lanes 0-1),
; insertps with immediate 32 places the dword at offset +8 into lane 2.
movsd xmm1, qword ptr [r8 + rax]
insertps xmm1, dword ptr [r8 + rax + 8], 32
movsd xmm2, qword ptr [r9 + rax]
insertps xmm2, dword ptr [r9 + rax + 8], 32
; xmm3 = xmm2 + xmm1, xmm2 = xmm2 * xmm1, then xmm3 = xmm3 - xmm2 (all four lanes at once).
movaps xmm3, xmm2
addps xmm3, xmm1
mulps xmm2, xmm1
subps xmm3, xmm2
; Scale by xmm0, which is set up outside this listing — presumably 0.1f in every lane for "/ 10.0f".
mulps xmm3, xmm0
; Store lanes 0, 1 and 2 individually so only 12 bytes are written.
mov rdx, qword ptr [rcx + 120]
movss dword ptr [rdx + rax], xmm3
extractps dword ptr [rdx + rax + 4], xmm3, 1
extractps dword ptr [rdx + rax + 8], xmm3, 2
; Advance the element counter and the byte offset (12 bytes per element), then compare
; the counter with the value at [rcx]; the conditional branch itself is not shown.
inc r10d
add eax, 12
cmp r10d, dword ptr [rcx]
Next, I used float4 arrays but used swizzles to convert them to float3s for the computations. We get 15 instructions with only one extra blendps:
/// <summary>
/// Mixed variant: data is stored as float4, but the arithmetic is done on the
/// <c>.xyz</c> swizzle as float3. For each of the first <c>dataSize</c> indices it writes
/// ((a + b) - (a * b)) / 10 into xyz of the output element and 0 into w.
/// </summary>
[ComputeJobOptimization]
struct Float4to3Job : IJob
{
    public int dataSize;
    [ReadOnly] public NativeArray<float4> dataA;
    [ReadOnly] public NativeArray<float4> dataB;
    [WriteOnly] public NativeArray<float4> dataOut;

    public void Execute()
    {
        int index = 0;
        while (index < dataSize)
        {
            float3 lhs = dataA[index].xyz;
            float3 rhs = dataB[index].xyz;

            float3 added = lhs + rhs;
            float3 multiplied = lhs * rhs;
            float3 scaled = (added - multiplied) / 10.0f;

            // Widen back to float4 with a zero w component.
            dataOut[index] = new float4(scaled, 0);

            index++;
        }
    }
}
; Per-iteration loop body listed for the float4-storage / float3-math job (16-byte elements).
; Sign-extend eax into rax so it can be used as a 64-bit byte offset.
cdqe
; NOTE(review): rcx presumably points at the job struct; offsets 8 / 64 / 120 look like the
; dataA / dataB / dataOut buffer pointers — confirm against the struct layout.
mov r8, qword ptr [rcx + 8]
mov r9, qword ptr [rcx + 64]
; Each 16-byte element is loaded with a single unaligned load.
movups xmm2, xmmword ptr [r8 + rax]
movups xmm3, xmmword ptr [r9 + rax]
; xmm4 = xmm3 + xmm2, xmm2 = xmm2 * xmm3, then xmm4 = xmm4 - xmm2 (all four lanes at once).
movaps xmm4, xmm3
addps xmm4, xmm2
mulps xmm2, xmm3
subps xmm4, xmm2
; Scale by xmm0, which is set up outside this listing — presumably 0.1f in every lane for "/ 10.0f".
mulps xmm4, xmm0
; Immediate 8 selects lane 3 only: that lane is replaced with the one from xmm1
; (set up outside this listing — presumably zero, matching the 0 passed as w).
blendps xmm4, xmm1, 8
; Store the full 16-byte result with a single unaligned store.
mov rdx, qword ptr [rcx + 120]
movups xmmword ptr [rdx + rax], xmm4
; Advance the element counter and the byte offset (16 bytes per element), then compare
; the counter with the value at [rcx]; the conditional branch itself is not shown.
inc r10d
add eax, 16
cmp r10d, dword ptr [rcx]
Finally, both the data and the computations use float4s. We get 14 instructions, just like the reference scalar code.
/// <summary>
/// float4 variant: both the stored data and the arithmetic use float4.
/// For each of the first <c>dataSize</c> indices it writes
/// ((a + b) - (a * b)) / 10 component-wise to the output array.
/// </summary>
[ComputeJobOptimization]
struct Float4Job : IJob
{
    public int dataSize;
    [ReadOnly] public NativeArray<float4> dataA;
    [ReadOnly] public NativeArray<float4> dataB;
    [WriteOnly] public NativeArray<float4> dataOut;

    public void Execute()
    {
        int index = 0;
        while (index < dataSize)
        {
            float4 lhs = dataA[index];
            float4 rhs = dataB[index];

            float4 added = lhs + rhs;
            float4 multiplied = lhs * rhs;
            dataOut[index] = (added - multiplied) / 10.0f;

            index++;
        }
    }
}
; Per-iteration loop body listed for the float4 job (16-byte elements, packed SSE arithmetic).
; Sign-extend eax into rax so it can be used as a 64-bit byte offset.
cdqe
; NOTE(review): rcx presumably points at the job struct; offsets 8 / 64 / 120 look like the
; dataA / dataB / dataOut buffer pointers — confirm against the struct layout.
mov r8, qword ptr [rcx + 8]
mov r9, qword ptr [rcx + 64]
; Each 16-byte element is loaded with a single unaligned load.
movups xmm1, xmmword ptr [r8 + rax]
movups xmm2, xmmword ptr [r9 + rax]
; xmm3 = xmm2 + xmm1, xmm1 = xmm1 * xmm2, then xmm3 = xmm3 - xmm1 (all four lanes at once).
movaps xmm3, xmm2
addps xmm3, xmm1
mulps xmm1, xmm2
subps xmm3, xmm1
; Scale by xmm0, which is set up outside this listing — presumably 0.1f in every lane for "/ 10.0f".
mulps xmm3, xmm0
; Store the full 16-byte result with a single unaligned store.
mov rdx, qword ptr [rcx + 120]
movups xmmword ptr [rdx + rax], xmm3
; Advance the element counter and the byte offset (16 bytes per element), then compare
; the counter with the value at [rcx]; the conditional branch itself is not shown.
inc r10d
add eax, 16
cmp r10d, dword ptr [rcx]