I have a function I've implemented naively, using Accelerate, and using MetalKit with performances of 18 seconds, 9 seconds and 14 seconds respectively. Since the function involves modifying pixel values, I assumed MetalKit would perform best. Unfortunately it seems like Accelerate seems to be the winner. Am I doing something wrong?
Naive Implementation:
let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
let adjustedData = adjustedPixelBuffer!.contents().assumingMemoryBound(to: UInt16.self)
for index in 0..<dataPointer.count {
let originalPixelValue = Double(dataPointer[index])
var adjustedPixelValue = (originalPixelValue * rescaleSlope) + rescaleIntercept
if adjustedPixelValue < minWindowValue {
adjustedPixelValue = 0
} else if adjustedPixelValue > maxWindowValue {
adjustedPixelValue = 255
} else {
adjustedPixelValue = 255 * (adjustedPixelValue - minWindowValue) / windowWidth
}
adjustedPixelArray.append(UInt16(adjustedPixelValue))
adjustedPixelArray.append(UInt16(adjustedData[index]))
}
Accelerate:
let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
var inputVector = [Double](repeating: 0, count: dataPointer.count)
dataPointer.enumerated().forEach { index, value in
inputVector[index] = Double(value)
}
var slopeVector = [Double](repeating: rescaleSlope, count: dataPointer.count)
var interceptVector = [Double](repeating: rescaleIntercept, count: dataPointer.count)
vDSP_vmaD(inputVector, 1, slopeVector, 1, interceptVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
var minWindowValueVector = [Double](repeating: minWindowValue, count: dataPointer.count)
var maxWindowValueVector = [Double](repeating: maxWindowValue, count: dataPointer.count)
vDSP_vclipD(inputVector, 1, minWindowValueVector, maxWindowValueVector, &inputVector, 1, vDSP_Length(dataPointer.count))
vDSP_vsubD(minWindowValueVector, 1, inputVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
let scaleVector = [Double](repeating: 255.0/windowWidth, count: dataPointer.count)
vDSP_vmulD(inputVector, 1, scaleVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
adjustedPixelArray = [UInt16](repeating: 0, count: dataPointer.count)
vDSP_vfixu16D(inputVector, 1, &adjustedPixelArray, 1, vDSP_Length(dataPointer.count))
MetalKit: PixelAdjustment.m
#include <metal_stdlib>
using namespace metal;
kernel void adjustPixelValues(constant ushort *inTexture [[ buffer(0) ]],
device ushort *outTexture [[ buffer(1) ]],
constant float *parameters [[ buffer(2) ]],
uint id [[ thread_position_in_grid ]]) {
float originalPixelValue = inTexture[id];
float adjustedPixelValue = (originalPixelValue * parameters[0]) + parameters[1];
float minWindowValue = parameters[2];
float maxWindowValue = parameters[3];
float windowWidth = parameters[4];
if (adjustedPixelValue < minWindowValue) {
adjustedPixelValue = 0;
} else if (adjustedPixelValue > maxWindowValue) {
adjustedPixelValue = 255;
} else {
adjustedPixelValue = 255 * (adjustedPixelValue - minWindowValue) / windowWidth;
}
outTexture[id] = ushort(adjustedPixelValue);
}
let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
let originalPixelBuffer = device.makeBuffer(bytes: dataPointer.baseAddress!, length: dataPointer.count * MemoryLayout<UInt16>.stride, options: [])
let adjustedPixelBuffer = device.makeBuffer(length: dataPointer.count * MemoryLayout<UInt16>.stride, options: [])
let parameters: [Float] = [Float(rescaleSlope), Float(rescaleIntercept), Float(minWindowValue), Float(maxWindowValue), Float(windowWidth)]
let parametersBuffer = device.makeBuffer(bytes: parameters, length: parameters.count * MemoryLayout<Float>.stride, options: [])
let commandBuffer = commandQueue.makeCommandBuffer()!
let commandEncoder = commandBuffer.makeComputeCommandEncoder()!
commandEncoder.setComputePipelineState(computePipelineState)
commandEncoder.setBuffer(originalPixelBuffer, offset: 0, index: 0)
commandEncoder.setBuffer(adjustedPixelBuffer, offset: 0, index: 1)
commandEncoder.setBuffer(parametersBuffer, offset: 0, index: 2)
let threadGroupCount = MTLSizeMake(32, 1, 1)
let threadGroups = MTLSizeMake((dataPointer.count + 31) / 32, 1, 1)
commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupCount)
commandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let adjustedData = adjustedPixelBuffer!.contents().assumingMemoryBound(to: UInt16.self)
for index in 0..<dataPointer.count {
adjustedPixelArray.append(UInt16(adjustedData[index]))
}
Any help would be greatly appreciated!