Hi everyone!
I’ve been doing some work on my new animation system called “Kinemation” this weekend. I wanted to experiment with a technique that involves keeping all bones in the hierarchy since that can be a convenient representation for game logic. However, it was nearly impossible to profile the GPU uploads because LocalToParentSystem.UpdateHierarchy was too slow. I have 29,200 skeletons (the max I stuff into a compute buffer) and the hierarchy update was taking a full 16 ms on my machine.
Oddly though, I got the same performance regardless of whether my skeletons were moving or not. So I checked the change version on LocalToWorld, and it wasn’t being updated when the entities weren’t moving. That meant the system was skipping writes like it was supposed to.
It turns out that the system touches the LocalToWorld of every entity in the hierarchy. For entities that don’t need updates, it only reads from it to propagate it to the children. For entities that do update, it reads from LocalToParent and writes to LocalToWorld. Interestingly enough, my out-of-order processor was loading both LocalToParent and LocalToWorld into cache at the same time when LocalToParent was required, and hence I saw the same random access time penalty for both cases.
My solution was to defer propagating the LocalToWorld to the child subroutine, using ref args so that only the first child needs to update it. Keep in mind that change versions are not deterministic with this system, and I didn’t fix that with this optimization, so the behavior may be slightly different due to the different timings. If you don’t rely on a deterministic change version, the actual hierarchy updates should still be correct.
Anyways, try it out and let me know what kind of speeds you get. I see a speedup between 25-50% when no entities are moving, and about the same if not a slight speedup when all the roots are moving.
/// <summary>
/// Hierarchy update system: for every root (LocalToWorld + Child, no Parent) it walks the
/// Child buffers recursively and writes LocalToWorld = Parent.LocalToWorld * LocalToParent
/// on descendants that need it. A parent's LocalToWorld is only looked up the first time
/// one of its children actually has to be recomputed.
/// </summary>
[UpdateInGroup(typeof(TransformSystemGroup))]
[UpdateAfter(typeof(EndFrameTRSToLocalToParentSystem))]
[UpdateBefore(typeof(EndFrameWorldToLocalSystem))]
//[UpdateBefore(typeof(EndFrameLocalToParentSystem))]
public class LocalToParentSystem2 : JobComponentSystem
{
    // Roots of the hierarchy; this is the query the job is scheduled over.
    private EntityQuery m_RootsQuery;

    // Entities with LocalToWorld, LocalToParent and Parent; only used to build the mask below.
    private EntityQuery m_ChildrenQuery;

    // Mask built from m_ChildrenQuery; the job checks it before writing a child's LocalToWorld.
    private EntityQueryMask m_LocalToWorldWriteGroupMask;

    /// <summary>
    /// Builds the root and child queries (both with FilterWriteGroup) and caches the
    /// query mask of the child query.
    /// </summary>
    protected override void OnCreate()
    {
        var rootsDescription = new EntityQueryDesc
        {
            All = new ComponentType[]
            {
                ComponentType.ReadOnly<LocalToWorld>(),
                ComponentType.ReadOnly<Child>()
            },
            None = new ComponentType[]
            {
                typeof(Parent)
            },
            Options = EntityQueryOptions.FilterWriteGroup
        };
        m_RootsQuery = GetEntityQuery(rootsDescription);

        var childrenDescription = new EntityQueryDesc
        {
            All = new ComponentType[]
            {
                typeof(LocalToWorld),
                ComponentType.ReadOnly<LocalToParent>(),
                ComponentType.ReadOnly<Parent>()
            },
            Options = EntityQueryOptions.FilterWriteGroup
        };
        m_ChildrenQuery = GetEntityQuery(childrenDescription);

        m_LocalToWorldWriteGroupMask = EntityManager.GetEntityQueryMask(m_ChildrenQuery);
    }

    /// <summary>
    /// Fills in the <see cref="UpdateHierarchy"/> job and schedules it in parallel over the
    /// root query with one batch per chunk.
    /// </summary>
    /// <param name="inputDeps">Dependencies the scheduled job must wait on.</param>
    /// <returns>The handle of the scheduled hierarchy job.</returns>
    protected override JobHandle OnUpdate(JobHandle inputDeps)
    {
        var hierarchyJob = new UpdateHierarchy
        {
            LocalToWorldTypeHandle = GetComponentTypeHandle<LocalToWorld>(true),
            ChildTypeHandle = GetBufferTypeHandle<Child>(true),
            ChildFromEntity = GetBufferFromEntity<Child>(true),
            LocalToParentFromEntity = GetComponentDataFromEntity<LocalToParent>(true),
            LocalToWorldFromEntity = GetComponentDataFromEntity<LocalToWorld>(),
            LocalToWorldWriteGroupMask = m_LocalToWorldWriteGroupMask,
            LastSystemVersion = LastSystemVersion
        };

        return hierarchyJob.ScheduleParallel(m_RootsQuery, 1, inputDeps);
    }

    // LocalToWorld = Parent.LocalToWorld * LocalToParent
    [BurstCompile]
    struct UpdateHierarchy : IJobEntityBatch
    {
        [ReadOnly] public ComponentTypeHandle<LocalToWorld> LocalToWorldTypeHandle;
        [ReadOnly] public BufferTypeHandle<Child> ChildTypeHandle;
        [ReadOnly] public BufferFromEntity<Child> ChildFromEntity;
        [ReadOnly] public ComponentDataFromEntity<LocalToParent> LocalToParentFromEntity;
        [ReadOnly] public EntityQueryMask LocalToWorldWriteGroupMask;
        public uint LastSystemVersion;

        // Read for parents and written for children, hence the disabled safety restriction.
        [NativeDisableContainerSafetyRestriction]
        public ComponentDataFromEntity<LocalToWorld> LocalToWorldFromEntity;

        /// <summary>
        /// Processes one batch of roots: reads each root's LocalToWorld from the chunk and
        /// recurses into every entry of its Child buffer.
        /// </summary>
        /// <param name="batchInChunk">Batch of root entities to process.</param>
        /// <param name="batchIndex">Index of the batch; not used here.</param>
        public void Execute(ArchetypeChunk batchInChunk, int batchIndex)
        {
            // A change to either the roots' LocalToWorld or their Child buffers forces the
            // whole subtree below every root in this batch to be recomputed.
            bool forceUpdate =
                batchInChunk.DidChange<LocalToWorld>(LocalToWorldTypeHandle, LastSystemVersion) ||
                batchInChunk.DidChange<Child>(ChildTypeHandle, LastSystemVersion);

            var rootTransforms = batchInChunk.GetNativeArray(LocalToWorldTypeHandle);
            var rootChildBuffers = batchInChunk.GetBufferAccessor(ChildTypeHandle);

            // Root matrices are read straight from the chunk, so they are always "valid";
            // that is why Entity.Null can be passed as the parent and is never looked up.
            bool rootMatrixValid = true;

            for (int rootIndex = 0; rootIndex < batchInChunk.Count; rootIndex++)
            {
                var rootMatrix = rootTransforms[rootIndex].Value;
                var rootChildren = rootChildBuffers[rootIndex];

                for (int childIndex = 0; childIndex < rootChildren.Length; childIndex++)
                {
                    ChildLocalToWorld(ref rootMatrix, rootChildren[childIndex].Value, forceUpdate, Entity.Null, ref rootMatrixValid);
                }
            }
        }

        /// <summary>
        /// Recomputes <paramref name="entity"/>'s LocalToWorld when required, then recurses
        /// into its children.
        /// </summary>
        /// <param name="parentLocalToWorld">
        /// The parent's matrix. Only meaningful when <paramref name="parentLtwValid"/> is true;
        /// otherwise it is filled in here on first use so later siblings can reuse it.
        /// </param>
        /// <param name="entity">The child entity being processed.</param>
        /// <param name="updateChildrenTransform">True when an ancestor already forces an update.</param>
        /// <param name="parent">The parent entity, used to fetch its LocalToWorld lazily.</param>
        /// <param name="parentLtwValid">Whether <paramref name="parentLocalToWorld"/> has been loaded yet.</param>
        void ChildLocalToWorld(ref float4x4 parentLocalToWorld, Entity entity, bool updateChildrenTransform, Entity parent, ref bool parentLtwValid)
        {
            if (!updateChildrenTransform)
            {
                updateChildrenTransform = LocalToParentFromEntity.DidChange(entity, LastSystemVersion);
            }

            // This entity's own matrix is left unloaded unless it is computed below;
            // its children fetch it on demand through the ref arguments.
            float4x4 ownMatrix = default;
            bool ownMatrixValid = false;

            if (updateChildrenTransform && LocalToWorldWriteGroupMask.Matches(entity))
            {
                if (!parentLtwValid)
                {
                    parentLocalToWorld = LocalToWorldFromEntity[parent].Value;
                    parentLtwValid = true;
                }

                ownMatrix = math.mul(parentLocalToWorld, LocalToParentFromEntity[entity].Value);
                ownMatrixValid = true;
                LocalToWorldFromEntity[entity] = new LocalToWorld { Value = ownMatrix };
            }
            else if (!updateChildrenTransform)
            {
                // Nothing upstream changed: the subtree still needs an update if this
                // entity's LocalToWorld changed since the last run. (When an update is
                // pending but the mask does not match, the flag simply stays true.)
                updateChildrenTransform = LocalToWorldFromEntity.DidChange(entity, LastSystemVersion);
            }

            if (!ChildFromEntity.HasComponent(entity))
            {
                return;
            }

            var descendants = ChildFromEntity[entity];
            for (int childIndex = 0; childIndex < descendants.Length; childIndex++)
            {
                ChildLocalToWorld(ref ownMatrix, descendants[childIndex].Value, updateChildrenTransform, entity, ref ownMatrixValid);
            }
        }
    }
}
I have a couple ideas for how to further improve this, but I need some time to test and experiment.



