For some reason, the export session always fails when I add the 'faceTrackInstructions' to the composition. Everything else works fine when I don't include the instruction however, the vertical crop is centred in the horizontal video. I want the vertical crop to change locations based on where the faces are in the frame.
Result without faceTrackInstructions enter image description here
What I'm trying to achieve on a frame by frame basis enter image description here
import Foundation
import AVFoundation
import Vision
import SwiftUI
func getFaceLocationsForHighlights(in videoAssetURL: URL, highlight: highlight, completion: @escaping ([[CGRect]]?) -> Void) {
let videoAsset = AVAsset(url: videoAssetURL)
let videoTrack = videoAsset.tracks(withMediaType: .video).first!
let videoSize = videoTrack.naturalSize
var faces = [[CGRect]]()
let totalFrames: Double = (Double(highlight.cuts![0].end/1000) - Double(highlight.cuts![0].start/1000)) * 30.0
print("totalFrames = \(Double(highlight.cuts![0].end/1000) - Double(highlight.cuts![0].start/1000))")
print("totalFrames = \(totalFrames)")
let request = VNDetectFaceRectanglesRequest(completionHandler: { request, error in
guard error == nil else {
print("Error detecting faces: \(error!)")
faces.append([CGRect()])
if faces.count == Int(totalFrames) {
print("All Faces Found")
completion(faces)
}
return
}
guard let results = request.results as? [VNFaceObservation] else {
print("No face observations found")
faces.append([CGRect()])
if faces.count == Int(totalFrames) {
print("All Faces Found")
completion(faces)
}
return
}
let faceLocations = results.map { observation -> CGRect in
let x = observation.boundingBox.origin.x * videoSize.width
let y = (1 - observation.boundingBox.origin.y - observation.boundingBox.size.height) * videoSize.height
let width = observation.boundingBox.size.width * videoSize.width
let height = observation.boundingBox.size.height * videoSize.height
return CGRect(x: x, y: y, width: width, height: height)
}
print("\(faceLocations.count) Faces Found")
faces.append(faceLocations)
if faces.count == Int(totalFrames) {
print("All Faces Found")
completion(faces)
}
})
guard let videoAssetTrack = videoAsset.tracks(withMediaType: .video).first else {
print("No video track found in asset.")
completion(nil)
return
}
// Create an instance of AVAssetImageGenerator
let generator = AVAssetImageGenerator(asset: videoAsset)
let start = CMTime(seconds: Double(highlight.cuts![0].start/1000), preferredTimescale: 1)
let end = CMTime(seconds: Double(highlight.cuts![0].end/1000), preferredTimescale: 1)
// Set the time range for which to generate images (here we generate images for the entire duration of the video)
let timeRange = CMTimeRange(start: start, duration: end)
// Set the time increment between generated frames (here we generate one image per second)
let frameDuration = CMTime(seconds: 1.0/30.0, preferredTimescale: 1)
// Loop through the frames and generate images
for i in 0..<Int(totalFrames) {
let time = CMTime(seconds: (1.0 / 30) * Double(i), preferredTimescale: 1)
let currentTime = CMTimeAdd(start, time)
do {
let image = try generator.copyCGImage(at: currentTime, actualTime: nil)
// Do something with the image (e.g. display it, save it to a file, etc.)
let handler = VNImageRequestHandler(cgImage: image)
do {
try handler.perform([request])
} catch {
print("Error performing face detection: \(error)")
faces.append([CGRect()])
if faces.count == Int(totalFrames) {
print("All Faces Found")
completion(faces)
}
}
} catch let error {
print("Failed to generate image at time \(currentTime): \(error)")
faces.append([CGRect()])
if faces.count == Int(totalFrames) {
print("All Faces Found")
completion(faces)
}
}
}
}
func exportHighlights(outputPath: String, highlights: [highlight], videoFile: URL, completionHandler: @escaping ([highlight]) -> Void) {
let manager = FileManager.default
var finalHighlights : [highlight] = []
var completionCount : Int = 0
guard let documentDirectory = URL(string: outputPath) else {
print("Error: failed to get document directory")
return
}
guard !highlights.isEmpty else {
print("Error: highlights array is empty")
return
}
for i in 0..<highlights.count {
let highlight = highlights[i]
let asset = AVAsset(url: videoFile)
let length = Float(asset.duration.value) / Float(asset.duration.timescale)
print("video length: \(length) seconds")
let composition = AVMutableComposition()
let videoTrack = composition.addMutableTrack(withMediaType: .video, preferredTrackID: kCMPersistentTrackID_Invalid)
let audioTrack = composition.addMutableTrack(withMediaType: .audio, preferredTrackID: kCMPersistentTrackID_Invalid)
for cut in highlight.cuts! {
print("\(cut.start) - \(cut.end)")
var currentTime: CMTime = CMTime(seconds: 0.0, preferredTimescale: 1)
if let track = asset.tracks(withMediaType: .video).first {
do {
try videoTrack?.insertTimeRange(CMTimeRangeMake(start: CMTime(seconds: Double(cut.start / 1000), preferredTimescale: 1), duration: CMTime(seconds: Double((cut.end - cut.start) / 1000), preferredTimescale: 1)), of: track, at: currentTime)
} catch {
print("Error : \(error)")
}
if let audioAssetTrack = asset.tracks(withMediaType: .audio).first {
do {
try audioTrack?.insertTimeRange(CMTimeRangeMake(start: CMTime(seconds: Double(cut.start / 1000), preferredTimescale: 1), duration: CMTime(seconds: Double((cut.end - cut.start) / 1000), preferredTimescale: 1)), of: audioAssetTrack, at: currentTime)
} catch {
print("Error : \(error)")
}
}
currentTime = composition.duration
}
}
let instruction = AVMutableVideoCompositionInstruction()
instruction.timeRange = CMTimeRangeMake(start: .zero, duration: composition.duration)
let videoInstruction = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack!)
videoInstruction.setTransform(videoTrack!.preferredTransform, at: .zero)
instruction.layerInstructions = [videoInstruction]
///
var faceTrackInstructions = [AVMutableVideoCompositionInstruction]()
getFaceLocationsForHighlights(in: videoFile, highlight: highlight) { faces in
let segmentDuration = CMTime(seconds: 1/30, preferredTimescale: 1)
var currentTime = CMTime.zero
enum Orientation {
case up, down, right, left
}
func orientation(for track: AVAssetTrack) -> Orientation {
let t = track.preferredTransform
if(t.a == 0 && t.b == 1.0 && t.c == -1.0 && t.d == 0) { // Portrait
return .up
} else if(t.a == 0 && t.b == -1.0 && t.c == 1.0 && t.d == 0) { // PortraitUpsideDown
return .down
} else if(t.a == 1.0 && t.b == 0 && t.c == 0 && t.d == 1.0) { // LandscapeRight
return .right
} else if(t.a == -1.0 && t.b == 0 && t.c == 0 && t.d == -1.0) { // LandscapeLeft
return .left
} else {
return .up
}
}
let originalSize = videoTrack!.naturalSize
let trackOrientation = orientation(for: videoTrack!)
// Loop through each crop rectangle and add the cropped video segment to the composition
for rects in faces! {
let cropRect = rects[0]
let cropRectIsPortrait = cropRect.width <= cropRect.height
let instruction = AVMutableVideoCompositionInstruction()
instruction.timeRange = CMTimeRange(start: currentTime, duration: segmentDuration)
let transformer = AVMutableVideoCompositionLayerInstruction(assetTrack: videoTrack!)
var finalTransform: CGAffineTransform = CGAffineTransform.identity // setup a transform that grows the video, effectively causing a crop
if trackOrientation == .up {
if !cropRectIsPortrait { // center video rect vertically
finalTransform = finalTransform
.translatedBy(x: originalSize.height, y: -(originalSize.width - cropRect.size.height) / 2)
.rotated(by: CGFloat(90.0.radians))
} else {
finalTransform = finalTransform
.rotated(by: CGFloat(90.0.radians))
.translatedBy(x: 0, y: -originalSize.height)
}
} else if trackOrientation == .down {
if !cropRectIsPortrait { // center video rect vertically (NOTE: did not test this case, since camera doesn't support .portraitUpsideDown in this app)
finalTransform = finalTransform
.translatedBy(x: -originalSize.height, y: (originalSize.width - cropRect.size.height) / 2)
.rotated(by: CGFloat(-90.0.radians))
} else {
finalTransform = finalTransform
.rotated(by: CGFloat(-90.0.radians))
.translatedBy(x: -originalSize.width, y: -(originalSize.height - cropRect.size.height) / 2)
}
} else if trackOrientation == .right {
if cropRectIsPortrait {
finalTransform = finalTransform.translatedBy(x: -(originalSize.width - cropRect.size.width) / 2, y: 0)
} else {
finalTransform = CGAffineTransform.identity
}
} else if trackOrientation == .left {
if cropRectIsPortrait { // center video rect horizontally
finalTransform = finalTransform
.rotated(by: CGFloat(-180.0.radians))
.translatedBy(x: -originalSize.width + (originalSize.width - cropRect.size.width) / 2, y: -originalSize.height)
} else {
finalTransform = finalTransform
.rotated(by: CGFloat(-180.0.radians))
.translatedBy(x: -originalSize.width, y: -originalSize.height)
}
}
transformer.setTransform(finalTransform, at: .zero)
instruction.layerInstructions = [transformer]
faceTrackInstructions.append(instruction)
// Move the current time forward for the next segment
currentTime = CMTimeAdd(currentTime, segmentDuration)
}
}
print("faceTrackInstructions = \(faceTrackInstructions.count)")
let videoComposition = AVMutableVideoComposition()
videoComposition.instructions = faceTrackInstructions
//videoComposition.instructions.append(contentsOf: faceTrackInstructions)
videoComposition.renderSize = CGSize(width: videoTrack!.naturalSize.height / 16 * 9, height: videoTrack!.naturalSize.height)
videoComposition.frameDuration = CMTimeMake(value: 1, timescale: 30)
// Add audio mix
let audioMix = AVMutableAudioMix()
let audioMixInputParameters = AVMutableAudioMixInputParameters(track: audioTrack)
let audioVolume = 1.0 // Set volume to 100%
audioMixInputParameters.setVolume(Float(audioVolume), at: CMTime.zero)
var outputURL = documentDirectory.appendingPathComponent("output")
do {
try manager.createDirectory(at: outputURL, withIntermediateDirectories: true, attributes: nil)
let name = highlight.gist?.replacingOccurrences(of: " ", with: "%20")
let clipName = name
outputURL = outputURL.appendingPathComponent("\(i+1).mp4")
} catch let error {
print(error)
}
// Remove existing file
_ = try? manager.removeItem(at: outputURL)
guard let exportSession = AVAssetExportSession(asset: composition, presetName: AVAssetExportPresetHighestQuality) else {
print("Error: failed to create export session")
return
}
exportSession.outputFileType = .mov
audioMix.inputParameters = [audioMixInputParameters]
exportSession.audioMix = audioMix
// Configure export session
exportSession.outputURL = outputURL
exportSession.videoComposition = videoComposition
completionCount += 1
exportSession.exportAsynchronously {
switch exportSession.status {
case .completed:
print("Exporting Complete at \(outputURL)")
var newHighlight = highlights[i]
newHighlight.videoURL = outputURL
finalHighlights.append(newHighlight)
if completionCount == highlights.count {
completionHandler(finalHighlights)
}
case .failed, .cancelled:
print("Exporting Failed: \(exportSession.error?.localizedDescription ?? "unknown error")")
if completionCount == highlights.count {
completionHandler(finalHighlights)
}
if videoTrack == nil {
print("Video Track == nil")
}
default:
print("Exporting In Progress...")
}
}
}
}
extension Double {
var radians: Double {
return self * .pi / 180.0
}
}
Everything works when I remove the faceTrackInstructions however, the video crop remains centred. I want the crop to move based on the faces in the frame so that a face is always in shot.