Home > Enterprise >  Why is audioClip lip sync off in this Unity try?
Why is audioClip lip sync off in this Unity try?

Time:10-24

I'm using Google Cloud text-to-speech to get a live audioClip. The goal is to drive the y-scale of the lips (a simple transform attached to a body) from the audio. However, the timing of the lips during clip playback seems off. Is there any way to correct this? Thanks!

using UnityEngine;

/// <summary>
/// Scales this transform along local Y from the loudness of the voice clip that is
/// currently playing, so a simple mouth object opens and closes with the speech.
/// </summary>
public class SyncMouthToAudio : MonoBehaviour
{
    // Local Y scale of the mouth when closed / fully open.
    const float scaleYMin = 0.01f;
    const float scaleYMax = 0.05f;

    TextToSpeechVoice voice = null;

    AudioSource audioSource = null;
    float[] clipData = null;

    // Not referenced anywhere in this class.
    const float updateStep = 0.1f;

    // Running extremes of the measured loudness, used to normalise the mouth range.
    float detectedLoudnessMin = Mathf.Infinity;
    float detectedLoudnessMax = 0f;

    // Minimum time between two loudness checks, and the time accumulated since the last one.
    const float updateSeconds = 0.096f;
    float updateTime = 0f;

    /// <summary>Finds the voice component on a parent and subscribes to its start/end callbacks.</summary>
    void Start()
    {
        voice = GetComponentInParent<TextToSpeechVoice>();
        voice.onStarts += OnVoiceStarts;
        voice.onEnds += OnVoiceEnds;
    }

    /// <summary>Accumulates frame time and runs a loudness check once updateSeconds has elapsed.</summary>
    void Update()
    {
        updateTime += Time.deltaTime;
        if (updateTime >= updateSeconds)
        {
            updateTime = 0f;
            CheckLoudness();
        }
    }

    /// <summary>
    /// Sums the absolute sample values read from the playing clip, updates the running
    /// min/max, and applies the resulting mouth scale. Loudness is 0 when nothing is playing.
    /// </summary>
    void CheckLoudness()
    {
        float loudness = 0f;
        if (audioSource != null && audioSource.isPlaying && audioSource.timeSamples > 0)
        {
            // NOTE(review): clipData is allocated with the full clip length in OnVoiceStarts,
            // so this reads far more audio than the moment around the playhead (and, per the
            // AudioClip.GetData docs, a read running past the end wraps to the start).
            // A short fixed-size window is likely needed for the lips to track playback - confirm.
            audioSource.clip.GetData(clipData, audioSource.timeSamples);
            foreach (var sample in clipData)
            {
                loudness += Mathf.Abs(sample);
            }

            if      (loudness < detectedLoudnessMin) { detectedLoudnessMin = loudness; }
            else if (loudness > detectedLoudnessMax) { detectedLoudnessMax = loudness; }
        }

        SetScaleByLoudness(loudness);
    }

    /// <summary>
    /// Maps a loudness value onto the mouth's Y scale. The mouth stays at scaleYMin until a
    /// usable min/max range has been detected and the loudness clears 30% of that range.
    /// </summary>
    /// <param name="loudness">Loudness measured by CheckLoudness; 0 means silence.</param>
    void SetScaleByLoudness(float loudness)
    {
        const float visibilityMultiplier = 15f;
        float scaleY = scaleYMin;

        bool detectedLoudness = loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
            detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax;
        if (detectedLoudness)
        {
            float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;

            // Treat the quietest 30% of the detected range as silence so the lips close.
            float threshold = detectedLoudnessMin + loudnessRange * 0.3f;
            bool loudnessIsRelevantEnough = loudness >= threshold;
            if (loudnessIsRelevantEnough)
            {
                float scaleRange = scaleYMax - scaleYMin;
                float scaleToLoudnessRatio = scaleRange / loudnessRange;

                scaleY = scaleYMin + (loudness - detectedLoudnessMin) * scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
                scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
            }
        }

        transform.SetLocalScaleY(scaleY);
    }

    /// <summary>Remembers the source that started playing and allocates the sample buffer.</summary>
    /// <param name="audioSource">The source playing the voice clip.</param>
    void OnVoiceStarts(AudioSource audioSource)
    {
        this.audioSource = audioSource;
        clipData = new float[this.audioSource.clip.samples];
    }

    /// <summary>Drops the reference to the source once the voice has finished.</summary>
    void OnVoiceEnds()
    {
        this.audioSource = null;
    }
}

Some notes on above:

  • I've played with various values for updateSeconds and also tried InvokeRepeating, to no avail.
  • I've played with various threshold values (the goal is to have the lips be closed on near-silence), and also removed the threshold check completely, but it didn't help.
  • The code tries to automatically determine typical min and max loudness (so as to show the mouth at fullest range independent of the specific audio).
  • I've already set the audioSource priority to 0, the highest.
  • The contents of the audioClip aren't knowable in advance, as they're written live by the GPT-3 AI based on what the user asked.

CodePudding user response:

With credit to derHugo for helping, the following solves it by giving clipData a small, constant array size (for example 512 samples), so that it doesn't grab too much audio during GetData:

using UnityEngine;
using System.Linq;

/// <summary>
/// Drives the local Y scale of this transform from the loudness of the voice clip being
/// played. Every checkEveryNSeconds it averages a short window of samples centred on the
/// playhead and maps that loudness onto the range [scaleYMin, scaleYMax].
/// </summary>
public class SyncMouthToAudio : MonoBehaviour
{
    // Local Y scale of the mouth when closed / fully open.
    const float scaleYMin = 0.01f;
    const float scaleYMax = 0.05f;

    TextToSpeechVoice voice = null;

    AudioSource audioSource = null;

    // Interleaved sample buffer: samplesPerCheck frames times the clip's channel count.
    float[] clipData = null;

    // Number of sample frames inspected per loudness check.
    int samplesPerCheck = 0;

    // Running extremes of the measured loudness, used to normalise the mouth range.
    float detectedLoudnessMin = Mathf.Infinity;
    float detectedLoudnessMax = 0f;

    const float checkEveryNSeconds = 0.05f;

    /// <summary>Finds the voice component on a parent and subscribes to its start/end callbacks.</summary>
    void Start()
    {
        voice = GetComponentInParent<TextToSpeechVoice>();
        if (voice == null)
        {
            Debug.LogError("SyncMouthToAudio needs a TextToSpeechVoice on a parent object.", this);
            enabled = false;
            return;
        }

        voice.onStarts += OnVoiceStarts;
        voice.onEnds += OnVoiceEnds;
    }

    /// <summary>Stops the repeating check and unsubscribes so the voice does not call a destroyed object.</summary>
    void OnDestroy()
    {
        CancelInvoke();
        if (voice != null)
        {
            voice.onStarts -= OnVoiceStarts;
            voice.onEnds -= OnVoiceEnds;
        }
    }

    /// <summary>
    /// Measures the mean absolute sample value in a window centred on the playhead, updates
    /// the running min/max, and applies the resulting mouth scale. Loudness is 0 when
    /// nothing is playing or the samples could not be read.
    /// </summary>
    void CheckLoudness()
    {
        float loudness = 0f;
        if (audioSource != null && audioSource.clip != null && clipData != null &&
            audioSource.isPlaying && audioSource.timeSamples > 0)
        {
            AudioClip clip = audioSource.clip;

            // Centre the window on the playhead, but keep the whole window inside the clip:
            // GetData wraps to the start of the clip when a read runs past the end.
            int lastValidStart = Mathf.Max(0, clip.samples - samplesPerCheck);
            int startSample = Mathf.Clamp(
                audioSource.timeSamples - samplesPerCheck / 2, 0, lastValidStart);

            if (clip.GetData(clipData, startSample))
            {
                float sum = 0f;
                for (int i = 0; i < clipData.Length; i++)
                {
                    sum += Mathf.Abs(clipData[i]);
                }
                loudness = sum / clipData.Length;

                // Independent checks so a single reading can update both extremes.
                if (loudness < detectedLoudnessMin) { detectedLoudnessMin = loudness; }
                if (loudness > detectedLoudnessMax) { detectedLoudnessMax = loudness; }
            }
        }

        SetScaleByLoudness(loudness);
    }

    /// <summary>
    /// Maps a loudness value onto the mouth's Y scale. The mouth stays at scaleYMin until a
    /// usable min/max range has been detected.
    /// </summary>
    /// <param name="loudness">Loudness measured by CheckLoudness; 0 means silence.</param>
    void SetScaleByLoudness(float loudness)
    {
        const float visibilityMultiplier = 15f;
        float scaleY = scaleYMin;

        if (loudness > 0f && detectedLoudnessMin < Mathf.Infinity &&
            detectedLoudnessMax > 0f && detectedLoudnessMin < detectedLoudnessMax)
        {
            float scaleRange = scaleYMax - scaleYMin;
            float loudnessRange = detectedLoudnessMax - detectedLoudnessMin;

            float scaleToLoudnessRatio = scaleRange / loudnessRange;

            scaleY = scaleYMin + (loudness - detectedLoudnessMin) *
                scaleToLoudnessRatio * scaleYMax * visibilityMultiplier;
            scaleY = Mathf.Clamp(scaleY, scaleYMin, scaleYMax);
        }

        transform.SetLocalScaleY(scaleY);
    }

    /// <summary>
    /// Remembers the source that started playing, sizes the sample window to one check
    /// interval of audio, and starts the repeating loudness check.
    /// </summary>
    /// <param name="audioSource">The source playing the voice clip.</param>
    void OnVoiceStarts(AudioSource audioSource)
    {
        CancelInvoke();

        if (audioSource == null || audioSource.clip == null)
        {
            // Nothing to analyse; leave the mouth closed.
            this.audioSource = null;
            SetScaleByLoudness(0f);
            return;
        }

        this.audioSource = audioSource;

        AudioClip clip = audioSource.clip;

        // One check interval's worth of sample frames (equivalent to
        // samples / (length / interval), without dividing by a zero clip length).
        samplesPerCheck = Mathf.Max(1, Mathf.RoundToInt(clip.frequency * checkEveryNSeconds));

        // GetData fills interleaved samples, so a frame takes one float per channel.
        clipData = new float[samplesPerCheck * Mathf.Max(1, clip.channels)];

        InvokeRepeating(nameof(CheckLoudness), checkEveryNSeconds * 0.5f, checkEveryNSeconds);
    }

    /// <summary>Stops the repeating check, forgets the source, and closes the mouth.</summary>
    void OnVoiceEnds()
    {
        CancelInvoke();
        this.audioSource = null;

        // Without this the mouth would stay at whatever scale the last check applied.
        SetScaleByLoudness(0f);
    }
}
  • Related