Why is audioClip lip sync off in this Unity try?-CodePudding

I'm using Google Cloud Text-to-Speech to get a live AudioClip. The goal is to animate the lips by changing the Y scale of a simple transform attached to a body. However, the timing of the lips during clip playback seems off. Is there any way to correct this? Thanks!

using UnityEngine;

/// <summary>
/// Scales this transform's local Y (the "mouth") according to the loudness of the
/// audio clip reported by the <c>TextToSpeechVoice</c> found via
/// <c>GetComponentInParent</c>.
/// </summary>
public class SyncMouthToAudio : MonoBehaviour
{
    // Local Y scale of the mouth when closed / fully open.
    const float scaleYMin = 0.01f;
    const float scaleYMax = 0.05f;

    TextToSpeechVoice voice = null;

    // Source currently speaking (null while idle) and the scratch buffer GetData fills.
    AudioSource audioSource = null;
    float[] clipData = null;

    const float updateStep = 0.1f; // not referenced anywhere in this class

    // Running extremes of the measured loudness; used to map loudness onto the scale range.
    float detectedLoudnessMin = Mathf.Infinity;
    float detectedLoudnessMax = 0f;

    // Minimum time between two loudness checks, and the time accumulated since the last one.
    const float updateSeconds = 0.096f;
    float updateTime = 0f;

    /// <summary>Finds the voice component and subscribes to its start/end callbacks.</summary>
    void Start()
    {
        voice = GetComponentInParent<TextToSpeechVoice>();
        // Subscribe (+=) rather than assign, so other listeners are kept.
        voice.onStarts += OnVoiceStarts;
        voice.onEnds += OnVoiceEnds;
    }

    /// <summary>Accumulates frame time and runs a loudness check every <c>updateSeconds</c>.</summary>
    void Update()
    {
        updateTime += Time.deltaTime;
        if (updateTime >= updateSeconds)
        {
            updateTime = 0f;
            CheckLoudness();
        }
    }

    /// <summary>
    /// Measures the loudness at the current playback position, updates the running
    /// min/max, and applies the resulting mouth scale. Loudness is 0 when nothing is playing.
    /// </summary>
    void CheckLoudness()
    {
        float loudness = 0f;
        if (audioSource != null && audioSource.isPlaying && audioSource.timeSamples > 0)
        {
            // NOTE(review): clipData is allocated with clip.samples entries in OnVoiceStarts,
            // so this reads a whole clip's worth of samples starting at the playhead instead of
            // a short window around it (per the Unity docs, GetData wraps to the start of the
            // clip when the read runs past the end). The sum below therefore barely depends on
            // the playback position, which would explain the timing looking off.
            audioSource.clip.GetData(clipData, audioSource.timeSamples);
            foreach (var sample in clipData)
            {
                // Sum of absolute amplitudes (not an average).
                loudness += Mathf.Abs(sample);
            }

            if      (loudness < detectedLoudnessMin) { detectedLoudnessMin = loudness; }
            else if (loudness > detectedLoudnessMax) { detectedLoudnessMax = loudness; }
        }

        SetScaleByLoudness(loudness);
    }

    /// <summary>
    /// Maps <paramref name="loudness"/> onto [scaleYMin, scaleYMax] using the detected
    /// loudness range and applies it as the local Y scale. The mouth stays at
    /// <c>scaleYMin</c> when no usable range has been detected yet or the loudness is
    /// below 30% of that range.
    /// </summary>
    void SetScaleByLoudness(float loudness)
    {
        const float visibilityMultiplier = 15f;
        float scaleY = scaleYMin;

        bool detectedLoudness = loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
            detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax;
        if (detectedLoudness)
        {
            float range = detectedLoudnessMax - detectedLoudnessMin;
            // Treat anything in the lowest 30% of the detected range as silence.
            float threshold = detectedLoudnessMin + range * 0.3f;
            bool loudnessIsRelevantEnough = loudness >= threshold;
            if (loudnessIsRelevantEnough)
            {
                float scaleRange = scaleYMax - scaleYMin;
                float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;

                float scaleToLoudnessRatio = scaleRange / loudnessRange;

                scaleY = scaleYMin + (loudness - detectedLoudnessMin) * scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
                scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
            }
        }
        
        transform.SetLocalScaleY(scaleY);
    }

    /// <summary>Remembers the speaking source and allocates the sample buffer for its clip.</summary>
    void OnVoiceStarts(AudioSource audioSource)
    {
        this.audioSource = audioSource;
        clipData = new float[this.audioSource.clip.samples];
    }

    /// <summary>Forgets the source so subsequent checks report zero loudness.</summary>
    void OnVoiceEnds()
    {
        this.audioSource = null;
    }
}

Some notes on above:

I've played with various values for updateSeconds and also tried InvokeRepeating, to no avail.
I've played with various threshold values (the goal is to have the lips closed on near-silence), and also removed the threshold check completely, but that didn't help.
The code tries to automatically determine typical min and max loudness (so as to show the mouth at fullest range independent of the specific audio).
I've already set the audioSource priority to 0, the highest.
The contents of the audioClip aren't knowable in advance, as they're written live by the GPT-3 AI based on what the user asked.

CodePudding user response：

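With credit to derHugo for helping, the following solves it by giving clipData a small, fixed size (e.g. 512 samples, or, as below, the number of samples in one check interval) so that GetData only reads a short window around the current playback position instead of grabbing far too much: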

using UnityEngine;
using System.Linq;

/// <summary>
/// Scales this transform's local Y (the "mouth") according to the loudness of a short
/// window of samples around the current playback position of the clip reported by the
/// <c>TextToSpeechVoice</c> found via <c>GetComponentInParent</c>.
/// </summary>
public class SyncMouthToAudio : MonoBehaviour
{
    // Local Y scale of the mouth when closed / fully open.
    const float scaleYMin = 0.01f;
    const float scaleYMax = 0.05f;

    TextToSpeechVoice voice = null;

    // Source currently speaking (null while idle) and the scratch buffer GetData fills.
    AudioSource audioSource = null;
    float[] clipData = null;
    // Number of sample frames (per channel) inspected by each loudness check.
    int samplesPerCheck = 0;

    // Running extremes of the measured loudness; used to map loudness onto the scale range.
    float detectedLoudnessMin = Mathf.Infinity;
    float detectedLoudnessMax = 0f;

    // Interval between two loudness checks while a clip is playing.
    const float checkEveryNSeconds = 0.05f;

    /// <summary>Finds the voice component and subscribes to its start/end callbacks.</summary>
    void Start()
    {
        voice = GetComponentInParent<TextToSpeechVoice>();
        // Subscribe (+=) rather than assign, so other listeners are kept.
        voice.onStarts += OnVoiceStarts;
        voice.onEnds += OnVoiceEnds;
    }

    /// <summary>Unsubscribes so the voice does not call back into a destroyed component.</summary>
    void OnDestroy()
    {
        if (voice != null)
        {
            voice.onStarts -= OnVoiceStarts;
            voice.onEnds -= OnVoiceEnds;
        }
    }

    /// <summary>
    /// Measures the average absolute amplitude of a window centred on the playhead,
    /// updates the running min/max, and applies the resulting mouth scale.
    /// Loudness is 0 when nothing is playing.
    /// </summary>
    void CheckLoudness()
    {
        float loudness = 0f;
        bool canSample = audioSource != null && audioSource.clip != null &&
            audioSource.isPlaying && audioSource.timeSamples > 0 &&
            clipData != null && clipData.Length > 0;
        if (canSample)
        {
            AudioClip clip = audioSource.clip;

            // Centre the window on the playhead, but keep the whole window inside the clip:
            // GetData wraps around to the start of the clip if the read runs past the end.
            int halfWindow = samplesPerCheck / 2;
            int lastValidStart = Mathf.Max(0, clip.samples - samplesPerCheck);
            int startSample = Mathf.Clamp(audioSource.timeSamples - halfWindow, 0, lastValidStart);

            clip.GetData(clipData, startSample);
            loudness = AverageAbsolute(clipData);

            // Independent updates: the very first measurement may be both the min and the max.
            detectedLoudnessMin = Mathf.Min(detectedLoudnessMin, loudness);
            detectedLoudnessMax = Mathf.Max(detectedLoudnessMax, loudness);
        }

        SetScaleByLoudness(loudness);
    }

    /// <summary>Mean of the absolute sample values; 0 for an empty buffer.</summary>
    static float AverageAbsolute(float[] samples)
    {
        if (samples.Length == 0)
        {
            return 0f;
        }

        // Plain loop instead of LINQ: this runs many times per second and should not allocate.
        double sum = 0d;
        for (int i = 0; i < samples.Length; i++)
        {
            sum += Mathf.Abs(samples[i]);
        }
        return (float) (sum / samples.Length);
    }

    /// <summary>
    /// Maps <paramref name="loudness"/> onto [scaleYMin, scaleYMax] using the detected
    /// loudness range and applies it as the local Y scale. The mouth stays at
    /// <c>scaleYMin</c> until a non-empty loudness range has been detected.
    /// </summary>
    void SetScaleByLoudness(float loudness)
    {
        const float visibilityMultiplier = 15f;
        float scaleY = scaleYMin;

        if (loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
            detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax)
        {
            float scaleRange = scaleYMax - scaleYMin;
            float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;

            float scaleToLoudnessRatio = scaleRange / loudnessRange;

            // The multiplier exaggerates the movement; the clamp keeps it within bounds.
            scaleY = scaleYMin + (loudness - detectedLoudnessMin) *
                scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
            scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
        }
        
        transform.SetLocalScaleY(scaleY);
    }

    /// <summary>
    /// Remembers the speaking source, sizes the sample window to one check interval of
    /// audio, and (re)starts the repeating loudness check.
    /// </summary>
    void OnVoiceStarts(AudioSource audioSource)
    {
        this.audioSource = audioSource;
        CancelInvoke();

        AudioClip clip = audioSource != null ? audioSource.clip : null;
        if (clip == null)
        {
            return;
        }

        // One interval's worth of sample frames. This equals
        // clip.samples / (clip.length / checkEveryNSeconds) without dividing by clip.length.
        // Clamped so the window is never empty and never longer than the clip.
        int framesPerInterval = Mathf.RoundToInt(clip.frequency * checkEveryNSeconds);
        samplesPerCheck = Mathf.Clamp(framesPerInterval, 1, Mathf.Max(1, clip.samples));

        // GetData returns interleaved channel data, so the buffer needs one float per channel per frame.
        clipData = new float[samplesPerCheck * Mathf.Max(1, clip.channels)];

        // First check half an interval in, so each window is centred on already-played audio.
        InvokeRepeating(nameof(CheckLoudness), checkEveryNSeconds * 0.5f, checkEveryNSeconds);
    }

    /// <summary>Stops the repeating check, forgets the source and closes the mouth.</summary>
    void OnVoiceEnds()
    {
        CancelInvoke();
        this.audioSource = null;

        // No further checks will run, so make sure the mouth does not stay open.
        SetScaleByLoudness(0f);
    }
}