AVMutableComposition - How to Merge Multiple Audio Recordings with 1 Video Recording - CodePudding

I have several audio clips that I recorded with an AVAudioRecorder over a video. Using an AVMutableComposition, I want to merge the audio assets with the video at the times the audio was recorded. For example, the video is 1 minute long and I recorded 3 audio clips at 5-10 secs, 20-25 secs, and 30-35 secs. The audio clips should be merged with the video at those specific time frames. When the final video plays, the audio should play over the video at those time frames.

model:

/// Describes one audio clip that was recorded over the video.
///
/// Every property is optional and starts out as `nil`.
class AudioModel {

    /// File location of the recorded audio clip.
    var audioUrl: URL? = nil

    /// Time, in seconds, at which the clip starts.
    var startTime: Double? = nil

    /// Time at which the clip ends (presumably also in seconds — confirm with the recorder code).
    var endTime: Double? = nil

    /// Creates a model with no URL and no times set.
    init() {}
}

mix:

// Builds a composition holding one video track and two audio tracks:
// one for the video's own sound and one for the separately recorded clips.
let mixComposition = AVMutableComposition()

guard let videoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .video, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }
guard let audioFromVideoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }
guard let audioModelCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }

// Source video; bail out if it has no video track.
let videoAsset = AVURLAsset(url: videoURL)
guard let videoTrack = videoAsset.tracks(withMediaType: .video).first else { return }

for audioModel in audioModels {

    // NOTE(review): the AudioModel shown above declares `audioUrl`, not `url` — confirm which name is intended.
    let audioAsset = AVURLAsset(url: audioModel.url!)
    // The clip's start, given in seconds, as a CMTime with a timescale of 1000.
    let startTime = CMTime(seconds: audioModel.startTime!, preferredTimescale: 1000)

    do {

        // Runs on every loop iteration: inserts the whole video track at composition time zero.
        try videoCompositionTrack.insertTimeRange(CMTimeRangeMake(start: .zero, duration: videoAsset.duration), of: videoTrack, at: .zero)
            
        if let audioTrackFromAudioModel = audioAsset.tracks(withMediaType: .audio).first {
                
            // Takes the source range that begins at `startTime` and places it at composition time zero.
            try audioModelCompositionTrack.insertTimeRange(CMTimeRangeMake(start: startTime, duration: audioAsset.duration),
                                                               of: audioTrackFromAudioModel, at: .zero)
        }
            
        if let audioFromVideoTrack = videoAsset.tracks(withMediaType: .audio).first {
            // Also runs on every iteration: inserts the video's own audio at composition time zero.
            try audioFromVideoCompositionTrack.insertTimeRange(CMTimeRangeMake(start: CMTime.zero, duration: videoAsset.duration),
                                                                   of: audioFromVideoTrack, at: .zero)
        }

    } catch {
        // Any error thrown by the insertions above is ignored here.
    }
}

// Export session configured with the finished composition and the highest-quality preset.
let exporter = AVAssetExportSession(asset: mixComposition, presetName: AVAssetExportPresetHighestQuality)
// ... I know what to do from here

CodePudding user response：

Your approach is correct, but you've mixed up the two parameters that you're using for insertTimeRange, and you're adding the video and audio from your video track multiple times.

The first parameter in insertTimeRange refers to the time range within the original audio asset, not the composition; so, assuming that you want to add each audio clip in its entirety, the time range should always start at .zero, not at startTime. The at: parameter should not be .zero, but rather "startTime" - the time within the composition where you want to add the audio.

Regarding your video track and your audioFromVideoTrack, I would not add these as part of the loop, but rather just add them before the loop. Otherwise you are adding them multiple times (once for each audio item), rather than just once, and this can lead to unwanted behavior or the export sessions failing altogether.

I edited your code but wasn't able to actually test it so take it with a grain of salt.

// Continues from the `mixComposition` (an AVMutableComposition) created in the question's code.
// One video track, one audio track for the video's own sound, one audio track for the recorded clips.
guard let videoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .video, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }
guard let audioFromVideoCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }
guard let audioModelCompositionTrack = mixComposition.addMutableTrack(withMediaType: .audio, preferredTrackID: Int32(kCMPersistentTrackID_Invalid)) else { return }

let videoAsset = AVURLAsset(url: videoURL)
guard let videoTrack = videoAsset.tracks(withMediaType: .video).first else { return }

// The video and its own audio are added exactly once, before any recorded clip.
let fullVideoRange = CMTimeRangeMake(start: .zero, duration: videoAsset.duration)

do {
    try videoCompositionTrack.insertTimeRange(fullVideoRange, of: videoTrack, at: .zero)
} catch {
    // Without the video there is nothing worth exporting, so report the failure and stop.
    print("Could not insert the video track into the composition: \(error)")
    return
}

if let audioFromVideoTrack = videoAsset.tracks(withMediaType: .audio).first {
    do {
        try audioFromVideoCompositionTrack.insertTimeRange(fullVideoRange, of: audioFromVideoTrack, at: .zero)
    } catch {
        // The video can still be exported without its original sound; report and carry on.
        print("Could not insert the video's own audio into the composition: \(error)")
    }
}

// Place the clips in ascending start order: insertTimeRange inserts media at the given time,
// so adding an earlier clip after a later one could shift the clip that is already in place.
let clipsInStartOrder = audioModels.sorted { ($0.startTime ?? 0) < ($1.startTime ?? 0) }

for audioModel in clipsInStartOrder {
    // A clip without a file URL or a start time cannot be placed; skip it instead of crashing.
    guard let audioURL = audioModel.audioUrl, let startSeconds = audioModel.startTime else {
        print("Skipping an audio clip that has no file URL or no start time")
        continue
    }

    let audioAsset = AVURLAsset(url: audioURL)
    guard let audioTrackFromAudioModel = audioAsset.tracks(withMediaType: .audio).first else {
        print("Skipping \(audioURL.lastPathComponent): the file contains no audio track")
        continue
    }

    // Position inside the composition (i.e. inside the video) where this clip should start playing.
    let startTime = CMTime(seconds: startSeconds, preferredTimescale: 1000)

    do {
        // The time range refers to the source clip (the whole clip, starting at zero);
        // `at:` is the position in the composition.
        try audioModelCompositionTrack.insertTimeRange(CMTimeRangeMake(start: .zero, duration: audioAsset.duration), of: audioTrackFromAudioModel, at: startTime)
    } catch {
        // One bad clip should not prevent the remaining clips from being merged.
        print("Could not insert \(audioURL.lastPathComponent) at \(startSeconds)s: \(error)")
    }
}

let exporter = AVAssetExportSession(asset: mixComposition, presetName: AVAssetExportPresetHighestQuality)
// ... I know what to do from here