I started by obtaining datasets from Kaggle to train a speech emotion recognition model. Training produced seven distinct output files: CNN_model_weights.h5, scaler2.pickle, encoder2.pickle, best_model1_weights.h5, CNN_model.json, emotion.csv, and data_path.csv.
The model then needs to be converted to the ONNX format, because Unity cannot run the Python runtime (or at least I am not aware of a way to do so; I haven't looked into whether it's feasible). I have previously managed to convert the model to ONNX using Colab. The concern currently is that…
Let's shift focus to the Unity side. Since the predictions come out wrong, the issue might lie in the conversion process or in an incorrect encoder model, but those specifics are not crucial at the moment: my primary objective is to assess the capabilities of Sentis.
using UnityEngine;
using Unity.Sentis;
using System.Linq;
using System;
using System.Collections.Generic;
using UnityEngine.UI;
using System.IO;
using Accord.Audio;
public class EmotionAnalyzer : MonoBehaviour
{
// Audio
public AudioClip recordedClip; // Manually assign the clip in the inspector
// Model execution
public ModelAsset modelAsset;
private Model runtimeModel;
IWorker worker;
// UI
public Button analyzeButton;
public Text emotionDisplayText;
// Data
private ScaleMeanData scaleMeanData;
private EncoderData encoderData;
public TextAsset emotionCsv; // Drag and drop your emotion.csv text asset here in the inspector
public List<EmotionData> allEmotionData;
void Start()
{
// Model setup
runtimeModel = ModelLoader.Load(modelAsset);
worker = WorkerFactory.CreateWorker(BackendType.GPUCompute, runtimeModel);
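// Note: GPUCompute requires compute-shader support on the target device; if results
// look wrong, swapping in BackendType.CPU is a quick way to rule the backend out.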
// UI setup
analyzeButton.onClick.AddListener(AnalyzeEmotionFromClip);
// Data loading
scaleMeanData = DataLoader.LoadDataFromJson<ScaleMeanData>("Assets/AI/Data/scale_mean_data.json");
encoderData = DataLoader.LoadDataFromJson<EncoderData>("Assets/AI/Data/encoder_categories_data.json");
if(encoderData == null || encoderData.classes == null || encoderData.classes.Length == 0)
{
Debug.LogError("Failed to load or deserialize encoder data.");
return; // Exit the Start() function early
}
Debug.Log("Encoder Data: Loaded successfully");
// Load emotion data from CSV
LoadCSV();
// Debug to check if the data is loaded correctly
foreach (var data in allEmotionData)
{
Debug.Log($"Path: {data.path}, Emotion: {data.emotion}");
}
}
void AnalyzeEmotionFromClip()
{
if (recordedClip != null)
{
Tensor inputTensor = PreprocessAudioToTensor(recordedClip);
if (inputTensor != null)
{
float[] output = AnalyzeEmotion(inputTensor);
inputTensor.Dispose(); // tensors hold native memory and must be disposed after use
string emotion = InterpretEmotion(output);
emotionDisplayText.text = "Detected Emotion: " + emotion;
}
else
{
Debug.LogError("Failed to preprocess audio data.");
}
}
else
{
Debug.LogError("No audio clip provided.");
}
}
float[] ConvertTo1D(double[][] data)
{
List<float> list = new List<float>();
foreach (var array in data)
{
foreach (var value in array)
{
list.Add((float)value);
}
}
return list.ToArray();
}
float[] ExtractAudioFeatures(AudioClip clip)
{
// Get audio data from clip
float[] audioData = new float[clip.samples * clip.channels];
clip.GetData(audioData, 0);
// Convert AudioClip to Signal format for Accord.NET
int sampleRate = clip.frequency;
Signal signal = Signal.FromArray(audioData, sampleRate);
// Extract MFCCs
MelFrequencyCepstrumCoefficient mfccExtractor = new MelFrequencyCepstrumCoefficient();
var mfccDescriptors = mfccExtractor.Transform(signal);
// Convert the descriptors to a double[][] format
double[][] features = mfccDescriptors.Select(descriptor => descriptor.Descriptor).ToArray();
// Flatten the features into a double[]
double[] flattenedFeatures = features.SelectMany(x => x).ToArray();
// Compute ZCR and RMSE
float zcr = ComputeZCR(audioData);
float rmse = ComputeRMSE(audioData);
// Convert the MFCCs from double to float
float[] finalFeatures = flattenedFeatures.Select(f => (float)f).ToArray();
// Append ZCR and RMSE to the features
List<float> featureList = new List<float>();
featureList.AddRange(finalFeatures); // Add existing MFCCs
featureList.Add(zcr);
featureList.Add(rmse);
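// Note: the order here (MFCCs first, then ZCR, then RMS) must match the feature
// order used when training the Python model; a mismatch keeps the shapes valid
// but silently corrupts every prediction.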
// Log for debugging
Debug.Log($"Extracted features length: {featureList.Count}");
return featureList.ToArray();
}
float[] NormalizeFeatures(float[] features)
{
// 2376 is the per-sample feature length the StandardScaler was fitted on in Python
if (scaleMeanData.scale.Length != 2376 || scaleMeanData.mean.Length != 2376)
{
Debug.LogError("Mismatch in the length of a single sequence and scale/mean data");
return features; // Return original features if there's a mismatch
}
int numberOfSequences = features.Length / 2376;
for (int seq = 0; seq < numberOfSequences; seq++)
{
for (int i = 0; i < 2376; i++)
{
features[seq * 2376 + i] = (features[seq * 2376 + i] - scaleMeanData.mean[i]) / scaleMeanData.scale[i];
}
}
return features;
}
float[] PadFeaturesToLength(float[] features, int targetLength)
{
if (features.Length >= targetLength)
{
return features.Take(targetLength).ToArray();
}
float[] paddedFeatures = new float[targetLength];
System.Array.Copy(features, paddedFeatures, features.Length);
return paddedFeatures;
}
// Compute the Zero-Crossing Rate
private float ComputeZCR(float[] audioData)
{
    if (audioData.Length < 2) return 0f; // guard against division by zero on degenerate clips
    int zeroCrossings = 0;
for (int i = 1; i < audioData.Length; i++)
{
if (audioData[i] * audioData[i - 1] < 0)
{
zeroCrossings++;
}
}
return (float)zeroCrossings / (audioData.Length - 1);
}
float[] AnalyzeEmotion(Tensor inputTensor)
{
Debug.Log($"Input tensor shape: {inputTensor.shape}");
worker.Execute(inputTensor);
var outputTensor = worker.PeekOutput() as TensorFloat;
outputTensor.MakeReadable(); // block until the (possibly GPU-resident) result can be read on the CPU
float[] output = outputTensor.ToReadOnlyArray();
Debug.Log($"Model output length: {output.Length}");
return output;
}
// Compute the Root Mean Square (RMS) energy (named RMSE here to mirror the training notebook)
private float ComputeRMSE(float[] audioData)
{
float sumOfSquares = 0;
foreach (var sample in audioData)
{
sumOfSquares += sample * sample;
}
return Mathf.Sqrt(sumOfSquares / audioData.Length);
}
Tensor ConvertToTensor(float[] features)
{
    int sequenceLength = 2376;
    int totalLength = (int)Mathf.Ceil(features.Length / (float)sequenceLength) * sequenceLength;
    features = PadFeaturesToLength(features, totalLength);
    int numberOfSequences = totalLength / sequenceLength;
    // The padded array is already laid out sequence-by-sequence, so it can be
    // handed to the tensor directly; the shape is (batch, sequenceLength, channels).
    TensorShape shape = new TensorShape(numberOfSequences, sequenceLength, 1);
    return new TensorFloat(shape, features);
}
Tensor PreprocessAudioToTensor(AudioClip clip)
{
float[] features = ExtractAudioFeatures(clip);
features = NormalizeFeatures(features);
return ConvertToTensor(features);
}
string InterpretEmotion(float[] output)
{
if (encoderData == null || encoderData.classes == null)
{
Debug.LogError("encoderData or its classes property is null.");
return "Unknown Emotion";
}
string[] emotions = encoderData.classes;
if (output.Length % emotions.Length != 0)
{
Debug.LogError($"Model output length ({output.Length}) is not a multiple of the number of available classes ({emotions.Length}).");
return "Unknown Emotion";
}
int numSequences = output.Length / emotions.Length;
float[] averagedOutputs = new float[emotions.Length];
for (int i = 0; i < numSequences; i++)
{
for (int j = 0; j < emotions.Length; j++)
{
averagedOutputs[j] += output[i * emotions.Length + j];
}
}
for (int j = 0; j < emotions.Length; j++)
{
averagedOutputs[j] /= numSequences;
}
// Get the index of the maximum value in the averaged outputs.
int maxIndex = System.Array.IndexOf(averagedOutputs, averagedOutputs.Max());
// Bounds check for maxIndex.
if (maxIndex >= 0 && maxIndex < emotions.Length)
{
return emotions[maxIndex];
}
else
{
Debug.LogError("Model produced empty output.");
return "Unknown Emotion";
}
}
[System.Serializable]
public class EmotionData
{
public string path;
public string emotion;
}
void LoadCSV()
{
allEmotionData = new List<EmotionData>();
if (emotionCsv == null)
{
    Debug.LogError("emotion.csv text asset is not assigned in the inspector.");
    return;
}
string[] records = emotionCsv.text.Split('\n');
for (int i = 1; i < records.Length; i++) // Starting from 1 to skip header
{
string[] fields = records[i].Split(',');
if (fields.Length == 2)
{
EmotionData data = new EmotionData
{
path = fields[0].Trim(),
emotion = fields[1].Trim()
};
allEmotionData.Add(data);
}
}
}
void OnDestroy()
{
// Proper cleanup to avoid memory leaks.
if (worker != null)
{
worker.Dispose();
}
}
}
[System.Serializable]
public class ScaleMeanData
{
public float[] scale;
public float[] mean;
}
[System.Serializable]
public class EncoderData
{
public string[] classes;
}
public static class DataLoader
{
public static T LoadDataFromJson<T>(string path)
{
// Note: reading from an "Assets/..." path only works in the Editor; player
// builds need StreamingAssets, Resources, or Addressables instead.
if (File.Exists(path))
{
string jsonString = File.ReadAllText(path);
return JsonUtility.FromJson<T>(jsonString);
}
else
{
Debug.LogError($"File not found at {path}");
return default(T);
}
}
}
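// Since DataLoader above depends on the Editor-only Assets folder, here is a
// hypothetical variant for player builds (it assumes the JSON files are moved
// under a Resources folder and the path is given without the .json extension):
public static class ResourceDataLoader
{
    public static T LoadDataFromJson<T>(string resourcePath)
    {
        TextAsset asset = Resources.Load<TextAsset>(resourcePath);
        if (asset == null)
        {
            Debug.LogError($"Resource not found at {resourcePath}");
            return default(T);
        }
        return JsonUtility.FromJson<T>(asset.text);
    }
}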
public static class EncoderDataLoader
{
public static EncoderData LoadClassesFromJson(string path)
{
if (File.Exists(path))
{
try
{
string jsonString = File.ReadAllText(path);
EncoderData data = JsonUtility.FromJson<EncoderData>(jsonString);
if (data == null || data.classes == null || data.classes.Length == 0)
{
Debug.LogError($"Failed to deserialize encoder data or it's empty. JSON: {jsonString}");
return null;
}
return data;
}
catch (Exception e)
{
Debug.LogError($"Error while reading or deserializing the file at {path}. Error: {e.Message}");
return null;
}
}
else
{
Debug.LogError($"File not found at {path}");
return null;
}
}
}
I composed the code above with the help of ChatGPT. Along the way I used NuGet to pull in Accord.Audio, which I use to extract the audio features for preprocessing.
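One sanity check that helped me reason about the Accord.Audio side (a minimal sketch meant to live inside the EmotionAnalyzer class above; ValidateFeatureLayout is a hypothetical helper name):

void ValidateFeatureLayout(AudioClip clip)
{
    float[] features = ExtractAudioFeatures(clip);
    int expected = scaleMeanData.scale.Length; // 2376 in my training setup
    if (features.Length % expected != 0)
    {
        Debug.LogWarning(
            $"Feature count {features.Length} is not a multiple of {expected}; " +
            "the Accord.Audio MFCC settings likely differ from the librosa " +
            "settings used at training time.");
    }
}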
The model does produce an output, but unfortunately it is incorrect; I am still working toward a reliable, accurate result.
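For debugging, a small sketch along these lines makes the raw per-class scores visible (again meant for the class above; the Softmax helper is only needed if the exported ONNX model ends in raw logits rather than a softmax layer):

void LogClassScores(float[] scores)
{
    // Log every class score so a wrong top-1 prediction can be traced to the numbers.
    for (int j = 0; j < encoderData.classes.Length && j < scores.Length; j++)
    {
        Debug.Log($"{encoderData.classes[j]}: {scores[j]:F4}");
    }
}

static float[] Softmax(float[] logits)
{
    float max = logits.Max(); // subtract the max for numerical stability
    float[] exps = logits.Select(v => Mathf.Exp(v - max)).ToArray();
    float sum = exps.Sum();
    return exps.Select(v => v / sum).ToArray();
}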
My comment for Sentis:
We need to explore the capabilities of Sentis, as it can be immensely helpful for academic research, interactive environments, user analysis, and more. Right now Unity gives us limited insight into Sentis: it should highlight the primary pathways users can take with it, and investigate the lesser-known paths with the help of beta users.