I'm using Google Cloud text-to-speech to get a live audioClip. The goal is to move the lips y scale of a simple transform attached to a body. However, the timing of the lips during clip playback seems off. Is there any way to correct this? Thanks!
using UnityEngine;
public class SyncMouthToAudio : MonoBehaviour
{
const float scaleYMin = 0.01f;
const float scaleYMax = 0.05f;
TextToSpeechVoice voice = null;
AudioSource audioSource = null;
float[] clipData = null;
const float updateStep = 0.1f;
float detectedLoudnessMin = Mathf.Infinity;
float detectedLoudnessMax = 0f;
const float updateSeconds = 0.096f;
float updateTime = 0f;
void Start()
{
voice = GetComponentInParent<TextToSpeechVoice>();
voice.onStarts = OnVoiceStarts;
voice.onEnds = OnVoiceEnds;
}
void Update()
{
updateTime = Time.deltaTime;
if (updateTime >= updateSeconds)
{
updateTime = 0f;
CheckLoudness();
}
}
void CheckLoudness()
{
float loudness = 0f;
if (audioSource != null && audioSource.isPlaying && audioSource.timeSamples > 0)
{
audioSource.clip.GetData(clipData, audioSource.timeSamples);
foreach (var sample in clipData)
{
loudness = Mathf.Abs(sample);
}
if (loudness < detectedLoudnessMin) { detectedLoudnessMin = loudness; }
else if (loudness > detectedLoudnessMax) { detectedLoudnessMax = loudness; }
}
SetScaleByLoudness(loudness);
}
void SetScaleByLoudness(float loudness)
{
const float visibilityMultiplier = 15f;
float scaleY = scaleYMin;
bool detectedLoudness = loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax;
if (detectedLoudness)
{
float range = detectedLoudnessMax - detectedLoudnessMin;
float threshold = detectedLoudnessMin range * 0.3f;
bool loudnessIsRelevantEnough = loudness >= threshold;
if (loudnessIsRelevantEnough)
{
float scaleRange = scaleYMax - scaleYMin;
float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;
float scaleToLoudnessRatio = scaleRange / loudnessRange;
scaleY = scaleYMin (loudness - detectedLoudnessMin) * scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
}
}
transform.SetLocalScaleY(scaleY);
}
void OnVoiceStarts(AudioSource audioSource)
{
this.audioSource = audioSource;
clipData = new float[this.audioSource.clip.samples];
}
void OnVoiceEnds()
{
this.audioSource = null;
}
}
Some notes on above:
- I've played with various values for updateSeconds and also tried a RepeatInvoke, to no avail.
- I've played with various threshold values (the goal is to have the lips be closed on near-silence), and also removed the threshold check completely, but it won't help.
- The code tries to automatically determine typical min and max loudness (so as to show the mouth at fullest range independent of the specific audio).
- I've already set the audioSource priority to 0, the highest.
- The contents of the audioClip isn't knowable in advance as it's live-written by the GPT-3 AI based on what the user asked.
CodePudding user response:
With credit to derHugo for helping, the following solves it by providing a constant (and smaller) array size to clipData
, like 512, so that it won't grab too much during getData
:
using UnityEngine;
using System.Linq;
public class SyncMouthToAudio : MonoBehaviour
{
const float scaleYMin = 0.01f;
const float scaleYMax = 0.05f;
TextToSpeechVoice voice = null;
AudioSource audioSource = null;
float[] clipData = null;
int samplesPerCheck = 0;
float detectedLoudnessMin = Mathf.Infinity;
float detectedLoudnessMax = 0f;
const float checkEveryNSeconds = 0.05f;
void Start()
{
voice = GetComponentInParent<TextToSpeechVoice>();
voice.onStarts = OnVoiceStarts;
voice.onEnds = OnVoiceEnds;
}
void CheckLoudness()
{
float loudness = 0f;
if (audioSource != null && audioSource.isPlaying && audioSource.timeSamples > 0)
{
int offset = clipData.Length / 2;
int startSample = (int) Mathf.Clamp(
audioSource.timeSamples - offset, 0, audioSource.clip.samples - offset);
audioSource.clip.GetData(clipData, startSample);
loudness = clipData.Select(x => Mathf.Abs(x)).Average();
if (loudness < detectedLoudnessMin) { detectedLoudnessMin = loudness; }
else if (loudness > detectedLoudnessMax) { detectedLoudnessMax = loudness; }
}
SetScaleByLoudness(loudness);
}
void SetScaleByLoudness(float loudness)
{
const float visibilityMultiplier = 15f;
float scaleY = scaleYMin;
if (loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax)
{
float range = detectedLoudnessMax - detectedLoudnessMin;
float scaleRange = scaleYMax - scaleYMin;
float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;
float scaleToLoudnessRatio = scaleRange / loudnessRange;
scaleY = scaleYMin (loudness - detectedLoudnessMin) *
scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
}
transform.SetLocalScaleY(scaleY);
}
void OnVoiceStarts(AudioSource audioSource)
{
this.audioSource = audioSource;
float numberOfChecks = audioSource.clip.length / checkEveryNSeconds;
samplesPerCheck = (int) Mathf.Round(audioSource.clip.samples / numberOfChecks);
clipData = new float[samplesPerCheck];
CancelInvoke();
InvokeRepeating("CheckLoudness", checkEveryNSeconds * 0.5f, checkEveryNSeconds);
}
void OnVoiceEnds()
{
CancelInvoke();
this.audioSource = null;
}
}