Simple Lipsync with microphone input

Hi, I want to build a very very simple lipsync animation using the microphone input. My idea was to get the volume from the microphone and move the jaw. Pretty much like this demo: http://qlcomp.com/?page_id=33 , but in real time. After some googling, I found people who know how to calculate the volume, so I adapted the code into:

/// <summary>
/// Estimates the current loudness as the median of the absolute values of the
/// most recent output samples read from the attached audio source.
/// </summary>
/// <returns>
/// The median absolute sample value, or 0 when there is no audio source or
/// <c>samples</c> is not positive.
/// </returns>
private float GetVolume()
{
    if (audio == null || samples <= 0)
    {
        return 0;
    }

    float[] data = new float[samples];
    audio.GetOutputData(data, 0);

    // Rectify in place so the sort below orders samples by magnitude.
    for (int i = 0; i < data.Length; i++)
    {
        data[i] = Mathf.Abs(data[i]);
    }

    // Take the median of the recorded samples. Sorting the float[] directly
    // avoids boxing every sample into a non-generic ArrayList.
    System.Array.Sort(data);
    return data[samples / 2];
}

And also, a function to filter noise:

// Highest frequency (Hz) covered by the spectrum; the bin mapping below
// treats bin n as frequency n * fMax / samples.
float fMax = 24000;

/// <summary>
/// Returns the average spectrum magnitude of the attached audio source over
/// the frequency band [fLow, fHigh]. Note that this only measures the band;
/// it does not modify or filter the audio.
/// </summary>
/// <param name="fLow">Lower band edge in Hz; clamped to [20, fMax].</param>
/// <param name="fHigh">Upper band edge in Hz; clamped to [fLow, fMax].</param>
/// <returns>
/// The mean of the spectrum bins covering the band, or 0 when there is no
/// audio source or <c>samples</c> is not positive.
/// </returns>
private float HumanFreq(float fLow, float fHigh)
{
    if (audio == null || samples <= 0)
    {
        return 0;
    }

    float[] data = new float[samples];
    fLow = Mathf.Clamp(fLow, 20, fMax); // limit low...
    fHigh = Mathf.Clamp(fHigh, fLow, fMax); // and high frequencies

    // get spectrum: data[n] = vol of frequency n * fMax / samples
    audio.GetSpectrumData(data, 0, FFTWindow.BlackmanHarris);

    // fHigh == fMax maps to index == samples, one past the end of the array,
    // so cap both bin indices at the last valid bin.
    int lastBin = samples - 1;
    int n1 = Mathf.Min((int)Mathf.Floor(fLow * samples / fMax), lastBin);
    int n2 = Mathf.Min((int)Mathf.Floor(fHigh * samples / fMax), lastBin);

    // average the volumes of frequencies fLow to fHigh
    float sum = 0;
    for (int i = n1; i <= n2; i++)
    {
        sum += data[i];
    }
    return sum / (n2 - n1 + 1);
}
My problem is that GetVolume returns 0 almost all the time (with only occasional, seemingly random non-zero values), and HumanFreq does not filter anything.
Does anyone know what’s wrong? Or even any other way to do what I’m trying? I wouldn’t mind completely changing the approach, but I need something simple. Phoneme recognition and other techniques are much better looking but also much more complex. I also wouldn’t mind buying assets if they were cheap and easy to use. (Yes, simple is the word :slight_smile: I don’t want to spend too much time on this)

using UnityEngine;
using System.Collections.Generic;
using System.Collections;

/// <summary>
/// Very simple lip sync: moves the transform this component is attached to
/// (the "mouth") between its initial local position and <see cref="endPos"/>
/// whenever the average spectrum level between 200 Hz and 800 Hz of the
/// attached AudioSource falls inside a fixed range. The audio can come from a
/// clip or from the microphone.
/// </summary>
public class LipSync : MonoBehaviour
{
	private const int FREQUENCY = 48000;    // Sample rate assumed when mapping spectrum bins to Hz
	private const int SAMPLECOUNT = 1024;   // Size of the sample and spectrum buffers
	private const float REFVALUE = 0.1f;    // RMS value for 0 dB.
	private const float THRESHOLD = 0.02f;  // Minimum amplitude to extract pitch (receive anything)
	private const float OPENDURATION = 0.3f;            // Seconds the mouth takes to reach endPos
	private const float MICROPHONESTARTTIMEOUT = 2f;    // Seconds to wait for the first microphone sample
	private const string PREFERREDMICROPHONE = "Built-in Microphone";

	private Transform mouth;
	private Vector3 initialPos;
	public Vector3 endPos;

	private bool initialized = false;
	private bool microphoneClip = false;
	private AudioClip recordedClip;
	private string microphoneDevice;   // Device used for recording; null selects the default device

	private float[] samples;           // Samples
	private float[] spectrum;          // Spectrum
	private float rmsValue;            // Volume in RMS
	private float dbValue;             // Volume in DB
	private float pitchValue;          // Pitch in Hz (frequency of the strongest spectrum bin)
	public int clamp = 160;            // Used to clamp dB

	// Starts at OPENDURATION so the first open-mouth frame never divides by zero.
	private float time = OPENDURATION;
	private float elapsedTime;

	/// <summary>
	/// Allocates the analysis buffers, remembers the mouth's rest position and
	/// makes sure an AudioSource exists. Does nothing when already initialized.
	/// Leaves the component disabled; the Start* methods enable it.
	/// </summary>
	public void Init ()
	{
		if (initialized)
			return;
		samples = new float[SAMPLECOUNT];
		spectrum = new float[SAMPLECOUNT];

		mouth = transform;
		initialPos = mouth.localPosition;
		if (audio == null)
			gameObject.AddComponent<AudioSource> ();
		audio.playOnAwake = false;
		audio.loop = false;
		enabled = false;
		initialized = true;
	}

	/// <summary>Plays <paramref name="clip"/> and starts animating the mouth from it.</summary>
	/// <param name="clip">Clip to play on the attached AudioSource.</param>
	/// <param name="fromMicrophone">Whether the clip was recorded from the microphone.</param>
	public void StartClipLipSync (AudioClip clip, bool fromMicrophone)
	{
		Init ();
		audio.clip = clip;
		audio.Play ();
		enabled = true;
		microphoneClip = fromMicrophone;
		Debug.Log ("Clip started for " + transform.root.gameObject.name);
	}

	/// <summary>Pauses playback.</summary>
	/// <returns>The playback position (audio.time) at the moment of pausing.</returns>
	public float PauseClip ()
	{
		// Like the other clip controls, make sure the AudioSource exists first.
		Init ();
		audio.Pause ();
		return audio.time;
	}

	/// <summary>Stops playback.</summary>
	public void StopClip ()
	{
		Init ();
		audio.Stop ();
	}

	/// <summary>Assigns <paramref name="clip"/> and plays it from position <paramref name="t"/>.</summary>
	public void RestartClip (AudioClip clip, float t)
	{
		Init ();
		audio.clip = clip;
		audio.time = t;
		audio.Play ();
	}

	/// <summary>Plays the current clip from position <paramref name="t"/>.</summary>
	public void RestartClip (float t)
	{
		Init ();
		audio.time = t;
		audio.Play ();
	}

	/// <summary>Stops the current clip and plays it again.</summary>
	public void RestartClip ()
	{
		Init ();
		audio.Stop ();
		audio.Play ();
	}

	/// <summary>
	/// Starts recording from the microphone into the AudioSource (muted) and
	/// enables the lip sync. The component is left disabled when no microphone
	/// is available or the recording does not deliver samples in time.
	/// </summary>
	public void StartMicrophoneLipSync ()
	{
		Init ();
		microphoneClip = true;

		if (Microphone.devices.Length == 0) {
			enabled = false;
			return;
		}

		microphoneDevice = ResolveMicrophoneDevice ();
		audio.clip = Microphone.Start (microphoneDevice, true, 999, 44100);

		// Wait for the first recorded sample, but give up after a timeout
		// instead of spinning forever when the device never starts.
		float deadline = Time.realtimeSinceStartup + MICROPHONESTARTTIMEOUT;
		while (audio.clip != null && Microphone.GetPosition (microphoneDevice) <= 0) {
			if (Time.realtimeSinceStartup > deadline)
				break;
		}

		if (audio.clip == null || Microphone.GetPosition (microphoneDevice) <= 0) {
			Debug.LogWarning ("Microphone did not start for " + transform.root.gameObject.name);
			if (Microphone.IsRecording (microphoneDevice))
				Microphone.End (microphoneDevice);
			enabled = false;
			return;
		}

		audio.Play ();
		audio.mute = true;
		enabled = true;
		Debug.Log ("Microphone started for " + transform.root.gameObject.name);
	}

	/// <summary>Returns the clip captured by the last call to <see cref="StopMicrophone"/>.</summary>
	public AudioClip GetRecordedClip ()
	{
		return recordedClip;
	}

	void OnDestroy ()
	{
		if (Microphone.IsRecording (microphoneDevice))
			Microphone.End (microphoneDevice);
		if (audio != null)
			Destroy (audio);
	}

	/// <summary>
	/// Stops the microphone recording, keeps the recorded clip for
	/// <see cref="GetRecordedClip"/>, destroys the AudioSource and returns the
	/// mouth to its rest position. The next Start*/Restart* call re-initializes.
	/// </summary>
	public void StopMicrophone ()
	{
		if (audio == null)
			return;
		// End the same device that StartMicrophoneLipSync started.
		if (Microphone.IsRecording (microphoneDevice))
			Microphone.End (microphoneDevice);
		recordedClip = audio.clip;
		DestroyImmediate (audio);

		// No audio is left to analyze: stop animating and restore the rest
		// pose, so a later Init() does not capture an open mouth as initialPos.
		if (mouth != null)
			mouth.localPosition = initialPos;
		elapsedTime = 0;
		enabled = false;
		initialized = false;
	}

	//This could be programmed as InvokeRepeating
	void Update ()
	{
		// Update may run before Init() (components start enabled); the buffers
		// and the mouth reference do not exist yet in that case.
		if (!initialized || mouth == null)
			return;

		AnalyzeSound ();

		// Average level of the 200-800 Hz band, scaled by 1000; the mouth
		// opens only while that value lies strictly between 2 and 50.
		float freq = HumanFreq (200, 800) * 1000;
		if (freq > 2 && freq < 50) {
			float step = Mathf.SmoothStep (0, 1, Mathf.SmoothStep (0, 1, elapsedTime / time));
			mouth.localPosition = Vector3.Lerp (initialPos, endPos, step);
			elapsedTime += Time.deltaTime;
		} else {
			mouth.localPosition = initialPos;
			time = OPENDURATION;
			elapsedTime = 0;
		}
	}

	/// Analyzes the sound, to get volume and pitch values.
	private void AnalyzeSound ()
	{
		if (audio == null || samples == null || spectrum == null)
			return;
		// Get all of our samples from the mic.
		audio.GetOutputData (samples, 0);

		// Sums squared samples
		float sum = 0;
		for (int i = 0; i < SAMPLECOUNT; i++) {
			sum += samples [i] * samples [i];
		}

		// RMS is the square root of the average value of the samples.
		rmsValue = Mathf.Sqrt (sum / SAMPLECOUNT);
		dbValue = 20 * Mathf.Log10 (rmsValue / REFVALUE);

		// Clamp it to {clamp} min
		if (dbValue < -clamp) {
			dbValue = -clamp;
		}

		// Gets the sound spectrum.
		audio.GetSpectrumData (spectrum, 0, FFTWindow.BlackmanHarris);
		float maxV = 0;
		int maxN = 0;

		// Find the highest sample.
		for (int i = 0; i < SAMPLECOUNT; i++) {
			if (spectrum [i] > maxV && spectrum [i] > THRESHOLD) {
				maxV = spectrum [i];
				maxN = i; // maxN is the index of max
			}
		}

		// Pass the index to a float variable
		float freqN = maxN;

		// Interpolate index using neighbours
		if (maxN > 0 && maxN < SAMPLECOUNT - 1) {
			float dL = spectrum [maxN - 1] / spectrum [maxN];
			float dR = spectrum [maxN + 1] / spectrum [maxN];
			freqN += 0.5f * (dR * dR - dL * dL);
		}

		// Convert index to frequency
		pitchValue = freqN * (FREQUENCY / 2) / SAMPLECOUNT;
	}

	/// <summary>
	/// Averages the spectrum captured by the last AnalyzeSound() call over the
	/// bins that cover the band from fLow to fHigh (both in Hz).
	/// </summary>
	/// <returns>The mean bin value, or 0 when no spectrum buffer exists yet.</returns>
	private float HumanFreq (float fLow, float fHigh)
	{
		if (spectrum == null)
			return 0;

		int n1 = (int)Mathf.Floor (fLow * SAMPLECOUNT * 2 / FREQUENCY);
		int n2 = (int)Mathf.Floor (fHigh * SAMPLECOUNT * 2 / FREQUENCY);

		// Keep both bin indices inside the buffer and in order.
		n1 = Mathf.Clamp (n1, 0, SAMPLECOUNT - 1);
		n2 = Mathf.Clamp (n2, n1, SAMPLECOUNT - 1);

		float sum = 0;
		// average the volumes of frequencies fLow to fHigh
		for (int i = n1; i <= n2; i++) {
			sum += spectrum [i];
		}
		return sum / (n2 - n1 + 1);
	}

	/// Picks the preferred microphone when it is listed; otherwise returns
	/// null, which the Microphone API treats as the default device.
	private static string ResolveMicrophoneDevice ()
	{
		foreach (string device in Microphone.devices) {
			if (device == PREFERREDMICROPHONE)
				return device;
		}
		return null;
	}
}

By the way, most of the credit goes to @Riro, as I basically just rewrote his script from http://goo.gl/oLTp5

You take just the center sample, which doesn’t make much sense :wink: The sample rate is much faster than the rate at which you’re scanning the data, so you pick a single sample out of every few hundred samples and use it.

Here’s the refactored script the sample project is using:

// Average the absolute amplitude of the last winWidth output samples.
float[] array = new float[this.winWidth];
audioSource.GetOutputData(array, 0);
float num3 = (float)0;
for (int i = 0; i < this.winWidth; i++)
{
    // Rectify each sample so positive and negative swings both add to the level.
    float num4 = Mathf.Abs(array[i]);
    num3 += num4;
}
// num3 now holds the mean absolute amplitude of the window.
num3 /= (float)this.winWidth;
Note: The original script was written in UnityScript and had additionally a min and max calculation in the forloop, but the values aren’t used anywhere, so i just copied the relevant part :wink:
Sorry for the “bad” variable names, but local variable names don’t exist in CIL; they are just values on the stack :wink:
edit
btw. the winWidth defaults to 512 but it might be adjusted.