(AR object detection) Problem with Slice() and Reshape()

Hey guys, I hope you are all doing well.
I tried to implement object detection but I am coming across an error saying

KeyNotFoundException: The given key ‘nms’ was not present in the dictionary.

Here is my current code:

using UnityEngine;
using Unity.Sentis;
using System.Collections;
using System.Collections.Generic;

public class StairDetection : MonoBehaviour
{
    public ModelAsset detectionModel;
    IWorker m_engineDetection;
    WebCamTexture webcamTexture;
    TensorFloat inputTensor;

    int modelLayerCount = 0;
    public int framesToExectute = 2;

    void Start()
    {
        Application.targetFrameRate = 60;
        var model = ModelLoader.Load(detectionModel);

        // Reshape to [8400, 5]
        model.layers.Add(new Unity.Sentis.Layers.Reshape("reshaped", "output", "8400,5"));

        // Slice for bounding boxes [8400, 4]
        model.layers.Add(new Unity.Sentis.Layers.Slice("sliceBoxes", "reshaped", "0,0", "8400,4"));

        // Slice for confidence scores [8400]
        model.layers.Add(new Unity.Sentis.Layers.Slice("sliceScores", "reshaped", "0,4", "8400,5"));

        model.layers.Add(new Unity.Sentis.Layers.NonMaxSuppression(
                    name: "nms",
                    boxes: "sliceBoxes",
                    scores: "sliceScores",
                    maxOutputBoxesPerClass: "10",  // Adjust as needed
                    iouThreshold: "0.5",           // Common value, adjust as needed
                    scoreThreshold: "0.3",         // Adjust as needed
                    centerPointBox: Unity.Sentis.Layers.CenterPointBox.Corners
                ));

        modelLayerCount = model.layers.Count;
        model.outputs = new List<string> { "nms" };

        m_engineDetection = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);

        WebCamDevice[] devices = WebCamTexture.devices;
        webcamTexture = new WebCamTexture(Screen.width, Screen.height)
        {
            deviceName = devices[0].name
        };
        webcamTexture.Play();

        inputTensor = TensorFloat.Zeros(new TensorShape(1, 3, 640, 640));
    }
    bool executionStarted = false;
    IEnumerator executionSchedule;

    private void Update()
    {
        if (!executionStarted)
        {
            TextureConverter.ToTensor(webcamTexture, inputTensor, new TextureTransform());
            executionSchedule = m_engineDetection.StartManualSchedule(inputTensor);
            executionStarted = true;
        }

        bool hasMoreWork = false;
        int layersToRun = (modelLayerCount + framesToExectute - 1) / framesToExectute; // round up
        for (int i = 0; i < layersToRun; i++)
        {
            hasMoreWork = executionSchedule.MoveNext();
            if (!hasMoreWork)
                break;
        }

        if (hasMoreWork)
            return;

        var output = m_engineDetection.PeekOutput() as TensorFloat;
        if (output != null)
        {
            Debug.Log("Output shape: " + output.shape);
            Debug.Log(output.ToReadOnlyArray());
        }
        executionStarted = false;
    }
}

The object detection model I am using is YOLOv8n. Here is the input and output structure:

I mostly just followed the depth detection sample code. This script is intended to be used for an AR app. Any help would be greatly appreciated. Thanks!

EDIT

I made some changes to my code and the error I am getting now is

KeyNotFoundException: The given key ‘8400,5’ was not present in the dictionary.

This is my updated code:

using UnityEngine;
using Unity.Sentis;
using System.Collections;
using System.Collections.Generic;
using Unity.Sentis.Layers;

public class StairDetection : MonoBehaviour
{
    public ModelAsset detectionModel;
    IWorker m_engineDetection;
    WebCamTexture webcamTexture;
    TensorFloat inputTensor;

    int modelLayerCount = 0;
    public int framesToExectute = 2;

    void Start()
    {
        Application.targetFrameRate = 60;
        var model = ModelLoader.Load(detectionModel);

        // Reshape to [8400, 5]
        model.AddLayer(new Reshape("reshaped", "output0", "8400,5"));

        // Slice for bounding boxes [8400, 4]
        model.AddLayer(new Slice("sliceBoxes", "reshaped", "0,0", "8400,4"));

        // Slice for confidence scores [8400]
        model.AddLayer(new Slice("sliceScores", "reshaped", "0,4", "8400,5"));

        model.AddLayer(new NonMaxSuppression(
                    name: "nms",
                    boxes: "sliceBoxes",
                    scores: "sliceScores",
                    maxOutputBoxesPerClass: "10",  // Adjust as needed
                    iouThreshold: "0.5",           // Common value, adjust as needed
                    scoreThreshold: "0.3",         // Adjust as needed
                    centerPointBox: CenterPointBox.Corners
                ));

        modelLayerCount = model.layers.Count;
        model.AddOutput("nms");

        m_engineDetection = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);

        WebCamDevice[] devices = WebCamTexture.devices;
        webcamTexture = new WebCamTexture(Screen.width, Screen.height)
        {
            deviceName = devices[0].name
        };
        webcamTexture.Play();

        inputTensor = TensorFloat.Zeros(new TensorShape(1, 3, 640, 640));
    }
    bool executionStarted = false;
    IEnumerator executionSchedule;

    private void Update()
    {
        if (!executionStarted)
        {
            TextureConverter.ToTensor(webcamTexture, inputTensor, new TextureTransform());
            executionSchedule = m_engineDetection.StartManualSchedule(inputTensor);
            executionStarted = true;
        }

        bool hasMoreWork = false;
        int layersToRun = (modelLayerCount + framesToExectute - 1) / framesToExectute; // round up
        for (int i = 0; i < layersToRun; i++)
        {
            hasMoreWork = executionSchedule.MoveNext();
            if (!hasMoreWork)
                break;
        }

        if (hasMoreWork)
            return;

        var output = m_engineDetection.PeekOutput() as TensorFloat;
        if (output != null)
        {
            Debug.Log("Output shape: " + output.shape);
            output.MakeReadable();
            Debug.Log(output.ToReadOnlyArray());
        }
        executionStarted = false;
    }
}


I believe I am using Slice() and Reshape() incorrectly.

Can someone please help me?

1 Like