0

I have a function I've implemented naively, using Accelerate, and using MetalKit with performances of 18 seconds, 9 seconds and 14 seconds respectively. Since the function involves modifying pixel values, I assumed MetalKit would perform best. Unfortunately it seems like Accelerate seems to be the winner. Am I doing something wrong?

Naive Implementation:

let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
let adjustedData = adjustedPixelBuffer!.contents().assumingMemoryBound(to: UInt16.self)
for index in 0..<dataPointer.count {
    let originalPixelValue = Double(dataPointer[index])
    var adjustedPixelValue = (originalPixelValue * rescaleSlope) + rescaleIntercept
    if adjustedPixelValue < minWindowValue {
        adjustedPixelValue = 0
    } else if adjustedPixelValue > maxWindowValue {
        adjustedPixelValue = 255
    } else {
        adjustedPixelValue = 255 * (adjustedPixelValue - minWindowValue) / windowWidth
    }
                                
    adjustedPixelArray.append(UInt16(adjustedPixelValue))
    adjustedPixelArray.append(UInt16(adjustedData[index]))
}

Accelerate:

let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
var inputVector = [Double](repeating: 0, count: dataPointer.count)
dataPointer.enumerated().forEach { index, value in
    inputVector[index] = Double(value)
}
var slopeVector = [Double](repeating: rescaleSlope, count: dataPointer.count)
var interceptVector = [Double](repeating: rescaleIntercept, count: dataPointer.count)
vDSP_vmaD(inputVector, 1, slopeVector, 1, interceptVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
var minWindowValueVector = [Double](repeating: minWindowValue, count: dataPointer.count)
var maxWindowValueVector = [Double](repeating: maxWindowValue, count: dataPointer.count)
vDSP_vclipD(inputVector, 1, minWindowValueVector, maxWindowValueVector, &inputVector, 1, vDSP_Length(dataPointer.count))
vDSP_vsubD(minWindowValueVector, 1, inputVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
let scaleVector = [Double](repeating: 255.0/windowWidth, count: dataPointer.count)
vDSP_vmulD(inputVector, 1, scaleVector, 1, &inputVector, 1, vDSP_Length(dataPointer.count))
adjustedPixelArray = [UInt16](repeating: 0, count: dataPointer.count)
vDSP_vfixu16D(inputVector, 1, &adjustedPixelArray, 1, vDSP_Length(dataPointer.count))

MetalKit: PixelAdjustment.m

#include <metal_stdlib>
using namespace metal;

kernel void adjustPixelValues(constant ushort *inTexture [[ buffer(0) ]],
                              device ushort *outTexture [[ buffer(1) ]],
                              constant float *parameters [[ buffer(2) ]],
                              uint id [[ thread_position_in_grid ]]) {

    float originalPixelValue = inTexture[id];
    float adjustedPixelValue = (originalPixelValue * parameters[0]) + parameters[1];
    float minWindowValue = parameters[2];
    float maxWindowValue = parameters[3];
    float windowWidth = parameters[4];
    if (adjustedPixelValue < minWindowValue) {
        adjustedPixelValue = 0;
    } else if (adjustedPixelValue > maxWindowValue) {
        adjustedPixelValue = 255;
    } else {
        adjustedPixelValue = 255 * (adjustedPixelValue - minWindowValue) / windowWidth;
    }
    outTexture[id] = ushort(adjustedPixelValue);
}
let dataPointer = imageData.withUnsafeBytes { $0.bindMemory(to: UInt16.self) }
let originalPixelBuffer = device.makeBuffer(bytes: dataPointer.baseAddress!, length: dataPointer.count * MemoryLayout<UInt16>.stride, options: [])
let adjustedPixelBuffer = device.makeBuffer(length: dataPointer.count * MemoryLayout<UInt16>.stride, options: [])

let parameters: [Float] = [Float(rescaleSlope), Float(rescaleIntercept), Float(minWindowValue), Float(maxWindowValue), Float(windowWidth)]
let parametersBuffer = device.makeBuffer(bytes: parameters, length: parameters.count * MemoryLayout<Float>.stride, options: [])

let commandBuffer = commandQueue.makeCommandBuffer()!
let commandEncoder = commandBuffer.makeComputeCommandEncoder()!
commandEncoder.setComputePipelineState(computePipelineState)
commandEncoder.setBuffer(originalPixelBuffer, offset: 0, index: 0)
commandEncoder.setBuffer(adjustedPixelBuffer, offset: 0, index: 1)
commandEncoder.setBuffer(parametersBuffer, offset: 0, index: 2)

let threadGroupCount = MTLSizeMake(32, 1, 1)
let threadGroups = MTLSizeMake((dataPointer.count + 31) / 32, 1, 1)
commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupCount)

commandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()

let adjustedData = adjustedPixelBuffer!.contents().assumingMemoryBound(to: UInt16.self)
for index in 0..<dataPointer.count {
    adjustedPixelArray.append(UInt16(adjustedData[index]))
}

Any help would be greatly appreciated!

cyril
  • 3,020
  • 6
  • 36
  • 61
  • How did you determine to use a thread group count of 32? It's also not clear to me why you are setting thread groups I guess to be between 0 and 1? What size image does `imageData` represent? – Jeshua Lacock Jul 11 '23 at 03:29
  • @JeshuaLacock still in the process of learning and reading documentation but I set it arbitrarily. ImageData can be any size as it's a DICOM image which can also have multiple frames. After spending the day on it, I was able to get the processing time down to 1 sec for Metal. Haven't yet figured out the best way to set threadGroupCount or threadGroups yet though but yes, trying to have it between 0 and 1. – cyril Jul 11 '23 at 03:49
  • 1
    Glad to hear you got it running faster. For an overview of thread groups see: https://developer.apple.com/documentation/metal/compute_passes/calculating_threadgroup_and_grid_sizes (scroll down about half for an image example). – Jeshua Lacock Jul 11 '23 at 04:46

0 Answers0