
For some reason, the export session always fails when I add the 'faceTrackInstructions' to the composition. Everything works fine when I don't include the instructions; however, the vertical crop is then just centred in the horizontal video. I want the vertical crop to change location based on where the faces are in the frame.

Result without faceTrackInstructions: [image]

What I'm trying to achieve on a frame-by-frame basis: [image]

import Foundation
import AVFoundation
import Vision
import SwiftUI

func getFaceLocationsForHighlights(in videoAssetURL: URL, highlight: highlight, completion: @escaping ([[CGRect]]?) -> Void) {
    let videoAsset = AVAsset(url: videoAssetURL)
    let videoTrack = videoAsset.tracks(withMediaType: .video).first!
    let videoSize = videoTrack.naturalSize
    
    var faces = [[CGRect]]()
    
    let totalFrames: Double = (Double(highlight.cuts![0].end/1000) - Double(highlight.cuts![0].start/1000)) * 30.0
    print("totalFrames = \(Double(highlight.cuts![0].end/1000) - Double(highlight.cuts![0].start/1000))")
    print("totalFrames = \(totalFrames)")
    
    let request = VNDetectFaceRectanglesRequest(completionHandler: { request, error in
        guard error == nil else {
            print("Error detecting faces: \(error!)")
            faces.append([CGRect()])
            if faces.count == Int(totalFrames) {
                print("All Faces Found")
                completion(faces)
            }
            return
        }
        
        guard let results = request.results as? [VNFaceObservation] else {
            print("No face observations found")
            faces.append([CGRect()])
            if faces.count == Int(totalFrames) {
                print("All Faces Found")
                completion(faces)
            }
            return
        }
        
        // Vision reports bounding boxes in normalized coordinates with a bottom-left origin,
        // so convert them to pixel coordinates (top-left origin) in the video's natural size
        let faceLocations = results.map { observation -> CGRect in
            let x = observation.boundingBox.origin.x * videoSize.width
            let y = (1 - observation.boundingBox.origin.y - observation.boundingBox.size.height) * videoSize.height
            let width = observation.boundingBox.size.width * videoSize.width
            let height = observation.boundingBox.size.height * videoSize.height
            return CGRect(x: x, y: y, width: width, height: height)
        }
        print("\(faceLocations.count) Faces Found")
        faces.append(faceLocations)
        if faces.count == Int(totalFrames) {
            print("All Faces Found")
            completion(faces)
        }
    })
    
    guard let videoAssetTrack = videoAsset.tracks(withMediaType: .video).first else {
        print("No video track found in asset.")
        completion(nil)
        return
    }

    // Create an instance of AVAssetImageGenerator
    let generator = AVAssetImageGenerator(asset: videoAsset)

    let start = CMTime(seconds: Double(highlight.cuts![0].start/1000), preferredTimescale: 1)
    let end = CMTime(seconds: Double(highlight.cuts![0].end/1000), preferredTimescale: 1)
    // Set the time range for which to generate images (the first cut of the highlight)
    let timeRange = CMTimeRange(start: start, end: end)

    // Set the time increment between generated frames (here we generate one image per frame at 30 fps)
    let frameDuration = CMTime(seconds: 1.0/30.0, preferredTimescale: 1)

    
    // Loop through the frames and generate images
    for i in 0..<Int(totalFrames) {
        let time = CMTime(seconds: (1.0 / 30) * Double(i), preferredTimescale: 1)
        let currentTime = CMTimeAdd(start, time)
        do {
            let image = try generator.copyCGImage(at: currentTime, actualTime: nil)
            // Run face detection on the generated frame
            let handler = VNImageRequestHandler(cgImage: image)
            do {
                try handler.perform([request])
            } catch {
                print("Error performing face detection: \(error)")
                faces.append([CGRect()])
                if faces.count == Int(totalFrames) {
                    print("All Faces Found")
                    completion(faces)
                }
            }
        } catch let error {
            print("Failed to generate image at time \(currentTime): \(error)")
            faces.append([CGRect()])
            if faces.count == Int(totalFrames) {
                print("All Faces Found")
                completion(faces)
            }
        }
    }
    
    
}

func exportHighlights(outputPath: String, highlights: [highlight], videoFile: URL, completionHandler: @escaping ([highlight]) -> Void) {
    
    let manager = FileManager.default
    
    var finalHighlights : [highlight] = []
    var completionCount : Int = 0

    guard let documentDirectory = URL(string: outputPath) else {
        print("Error: failed to get document directory")
        return
    }

    guard !highlights.isEmpty else {
        print("Error: highlights array is empty")
        return
    }

    for i in 0..<highlights.count {
        
        let highlight = highlights[i]
        let asset = AVAsset(url: videoFile)
        let length = Float(asset.duration.value) / Float(asset.duration.timescale)
        print("video length: \(length) seconds")
        
        let composition = AVMutableComposition()
        
        let videoTrack = composition.addMutableTrack(withMediaType: .video, preferredTrackID: kCMPersistentTrackID_Invalid)
        let audioTrack = composition.addMutableTrack(withMediaType: .audio, preferredTrackID: kCMPersistentTrackID_Invalid)
        
        for cut in highlight.cuts! {
            print("\(cut.start) - \(cut.end)")
            var currentTime: CMTime = CMTime(seconds: 0.0, preferredTimescale: 1)
            if let track = asset.tracks(withMediaType: .video).first {
                do {
                    try videoTrack?.insertTimeRange(CMTimeRangeMake(start: CMTime(seconds: Double(cut.start / 1000), preferredTimescale: 1), duration: CMTime(seconds: Double((cut.end - cut.start) / 1000), preferredTimescale: 1)), of: track, at: currentTime)
                   
                } catch {
                    print("Error : \(error)")
                }
                if let audioAssetTrack = asset.tracks(withMediaType: .audio).first {
                    do {
                        try audioTrack?.insertTimeRange(CMTimeRangeMake(start: CMTime(seconds: Double(cut.start / 1000), preferredTimescale: 1), duration: CMTime(seconds: Double((cut.end - cut.start) / 1000), preferredTimescale: 1)), of: audioAssetTrack, at: currentTime)
                    } catch {
                        print("Error : \(error)")
                    }
                }
                
                currentTime = composition.duration
            }
        }
        
        let instruction = AVMutableVideoCompositionInstruction()
        instruction.timeRange = CMTimeRangeMake(start: .zero, duration: composition.duration)

        let videoInstruction = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack!)
        videoInstruction.setTransform(videoTrack!.preferredTransform, at: .zero)
        
        instruction.layerInstructions = [videoInstruction]
        // Build one crop instruction per frame so the crop can follow the detected faces
        var faceTrackInstructions = [AVMutableVideoCompositionInstruction]()
        
        getFaceLocationsForHighlights(in: videoFile, highlight: highlight) { faces in
            
            let segmentDuration = CMTime(seconds: 1/30, preferredTimescale: 1)
            var currentTime = CMTime.zero
            
            enum Orientation {
                case up, down, right, left
            }
                    
            func orientation(for track: AVAssetTrack) -> Orientation {
                let t = track.preferredTransform
                
                if(t.a == 0 && t.b == 1.0 && t.c == -1.0 && t.d == 0) {             // Portrait
                    return .up
                } else if(t.a == 0 && t.b == -1.0 && t.c == 1.0 && t.d == 0) {      // PortraitUpsideDown
                    return .down
                } else if(t.a == 1.0 && t.b == 0 && t.c == 0 && t.d == 1.0) {       // LandscapeRight
                    return .right
                } else if(t.a == -1.0 && t.b == 0 && t.c == 0 && t.d == -1.0) {     // LandscapeLeft
                    return .left
                } else {
                    return .up
                }
            }
            
           
            
            let originalSize = videoTrack!.naturalSize
            let trackOrientation = orientation(for: videoTrack!)

            // Loop through each crop rectangle and add the cropped video segment to the composition
            for rects in faces! {
                
                let cropRect = rects[0]
                let cropRectIsPortrait = cropRect.width <= cropRect.height
      
                let instruction = AVMutableVideoCompositionInstruction()
                instruction.timeRange = CMTimeRange(start: currentTime, duration: segmentDuration)
                
                let transformer = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack!)
                
                var finalTransform: CGAffineTransform = CGAffineTransform.identity // setup a transform that grows the video, effectively causing a crop
                
                if trackOrientation == .up {
                    if !cropRectIsPortrait { // center video rect vertically
                        finalTransform = finalTransform
                            .translatedBy(x: originalSize.height, y: -(originalSize.width - cropRect.size.height) / 2)
                            .rotated(by: CGFloat(90.0.radians))
                    } else {
                        finalTransform = finalTransform
                            .rotated(by: CGFloat(90.0.radians))
                            .translatedBy(x: 0, y: -originalSize.height)
                    }
                    
                } else if trackOrientation == .down {
                    if !cropRectIsPortrait { // center video rect vertically (NOTE: did not test this case, since camera doesn't support .portraitUpsideDown in this app)
                        finalTransform = finalTransform
                            .translatedBy(x: -originalSize.height, y: (originalSize.width - cropRect.size.height) / 2)
                            .rotated(by: CGFloat(-90.0.radians))
                    } else {
                        finalTransform = finalTransform
                            .rotated(by: CGFloat(-90.0.radians))
                            .translatedBy(x: -originalSize.width, y: -(originalSize.height - cropRect.size.height) / 2)
                    }
                    
                } else if trackOrientation == .right {
                    if cropRectIsPortrait {
                        finalTransform = finalTransform.translatedBy(x: -(originalSize.width - cropRect.size.width) / 2, y: 0)
                    } else {
                        finalTransform = CGAffineTransform.identity
                    }
                    
                } else if trackOrientation == .left {
                    if cropRectIsPortrait { // center video rect horizontally
                        finalTransform = finalTransform
                            .rotated(by: CGFloat(-180.0.radians))
                            .translatedBy(x: -originalSize.width + (originalSize.width - cropRect.size.width) / 2, y: -originalSize.height)
                    } else {
                        finalTransform = finalTransform
                            .rotated(by: CGFloat(-180.0.radians))
                            .translatedBy(x: -originalSize.width, y: -originalSize.height)
                    }
                }
                
                transformer.setTransform(finalTransform, at: .zero)
                instruction.layerInstructions = [transformer]
                faceTrackInstructions.append(instruction)
                
                // Move the current time forward for the next segment
                currentTime = CMTimeAdd(currentTime, segmentDuration)
            }
            
        }

        print("faceTrackInstructions = \(faceTrackInstructions.count)")
        let videoComposition = AVMutableVideoComposition()
        videoComposition.instructions = faceTrackInstructions
        //videoComposition.instructions.append(contentsOf: faceTrackInstructions)
        videoComposition.renderSize = CGSize(width: videoTrack!.naturalSize.height / 16 * 9, height: videoTrack!.naturalSize.height)
        videoComposition.frameDuration = CMTimeMake(value: 1, timescale: 30)
        
        // Add audio mix
        let audioMix = AVMutableAudioMix()
        let audioMixInputParameters = AVMutableAudioMixInputParameters(track: audioTrack)
        let audioVolume = 1.0 // Set volume to 100%
        audioMixInputParameters.setVolume(Float(audioVolume), at: CMTime.zero)

        var outputURL = documentDirectory.appendingPathComponent("output")
        do {
            try manager.createDirectory(at: outputURL, withIntermediateDirectories: true, attributes: nil)
            let name = highlight.gist?.replacingOccurrences(of: " ", with: "%20")
            let clipName = name
            outputURL = outputURL.appendingPathComponent("\(i+1).mp4")
        } catch let error {
            print(error)
        }

        // Remove existing file
        _ = try? manager.removeItem(at: outputURL)

        guard let exportSession = AVAssetExportSession(asset: composition, presetName: AVAssetExportPresetHighestQuality) else {
            print("Error: failed to create export session")
            return
        }

        exportSession.outputFileType = .mov
        audioMix.inputParameters = [audioMixInputParameters]
        exportSession.audioMix = audioMix

        // Configure export session
        exportSession.outputURL = outputURL
        exportSession.videoComposition = videoComposition

        completionCount += 1
        exportSession.exportAsynchronously {
            switch exportSession.status {
            case .completed:
                print("Exporting Complete at \(outputURL)")
                var newHighlight = highlights[i]
                newHighlight.videoURL = outputURL
                finalHighlights.append(newHighlight)
                if completionCount == highlights.count {
                    completionHandler(finalHighlights)
                }
            case .failed, .cancelled:
                print("Exporting Failed: \(exportSession.error?.localizedDescription ?? "unknown error")")
                if completionCount == highlights.count {
                    completionHandler(finalHighlights)
                }
                if videoTrack == nil {
                    print("Video Track == nil")
                }
            default:
                print("Exporting In Progress...")
            }
        }
        
        
    }

    
    


    
}

extension Double {
    var radians: Double {
        return self * .pi / 180.0
    }
}

Everything works when I remove the faceTrackInstructions; however, the video crop remains centred. I want the crop to move based on the faces in the frame so that a face is always in shot.
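
To make the requirement concrete, this is roughly the per-frame mapping I'm after (a minimal sketch with assumed names — cropTransform is not in my project; it just illustrates the transform each frame's layer instruction should end up with, assuming a landscape source and a 9:16 crop):

import CoreGraphics

// Sketch only: given a face rect in pixel coordinates (top-left origin) and the
// source video size, return the translation that moves a 9:16 crop window,
// centred on the face, to the render origin.
func cropTransform(for faceRect: CGRect, in videoSize: CGSize) -> CGAffineTransform {
    // Width of a 9:16 window spanning the full source height (assumption)
    let cropWidth = videoSize.height * 9.0 / 16.0
    // Centre the window on the face, clamped so it stays inside the frame
    var cropX = faceRect.midX - cropWidth / 2
    cropX = min(max(cropX, 0), videoSize.width - cropWidth)
    // Shifting the layer left by cropX puts that window at the render origin
    return CGAffineTransform(translationX: -cropX, y: 0)
}

i.e. instead of the fixed centring transforms above, each segment's transformer would get something like cropTransform(for: rects[0], in: originalSize).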
