4

I need your help today! I am starting to work with compute shaders on a really simple use case: I have a depth camera and I want to calculate the bounding box of an object near the camera.

But I have too many pixels to process, so I want to use GPGPU (a compute shader and parallelization) to compute this.

My current problem is that when I run my program, I get the same min and max coordinates. So I think that all my groups and threads write to my StructuredBuffers at the same time.

Do you have any idea how to do that correctly?

Here is the code of my compute shader :

#pragma kernel ComputeBoundingBox

// Thread-group size in the x, y and z directions. y and z are 1 because the
// depth data is addressed as a flat 1D array.
#define thread_group_size_x 1024
#define thread_group_size_y 1
#define thread_group_size_z 1
// Size of the depth image.
// NOTE(review): the trailing ';' is part of each macro's replacement text, so
// it is pasted wherever width/height are used. That is harmless at the end of
// a statement but would break a use in the middle of an expression.
#define width 512;
#define height 424;

// dataBuffer           = depth data of the camera, one float per pixel
// minBuffer, maxBuffer = arrays of size 3 holding the min/max x, y and z
// mask                 = image area to process (a pixel is used when mask > 0.49)
RWStructuredBuffer<float> dataBuffer;
globallycoherent RWStructuredBuffer<float>minBuffer;
globallycoherent RWStructuredBuffer<float> maxBuffer;
RWStructuredBuffer<float> mask;


// Scratch values for the current pixel, declared at global scope.
// NOTE(review): in HLSL, non-static globals are normally treated as uniform
// inputs rather than per-thread variables -- confirm whether these were meant
// to be locals inside the kernel.
float xValue = 0, yValue = 0, zValue = 0;

// Kernel entry point: one thread per depth pixel, indexed by id.x.
// For every pixel that passes the mask and depth-range tests, the thread
// compares its x, y and z against the shared min/max buffers and overwrites
// them when it finds a new extreme.
[numthreads(thread_group_size_x, thread_group_size_y, thread_group_size_z)]
void ComputeBoundingBox(uint3 id : SV_DispatchThreadID)
{
    // xValue and yValue = [X,Y] index in 2D, derived from the flat index
    //                     (note the +1 offset applied before % and /)
    // zValue            = depth value at that index
    xValue = (id.x + 1) % width;
    yValue = (id.x + 1) / width;
    zValue = dataBuffer[id.x];

    // Only pixels inside the mask are considered.
    if (mask[id.x] > 0.49)
    {
        // Only depths strictly between 500 and 1500 are considered.
        if (zValue > 500 && zValue < 1500)
        {
            // NOTE(review): each comparison below reads the shared buffer and
            // then conditionally writes it using plain, non-atomic accesses.
            // Threads running concurrently can interleave between the read and
            // the write, so updates can be lost (this is the data race the
            // question is about).
            // Also, because of the 'else if', a value that lowers the minimum
            // is never tested against the maximum in the same invocation.
            if (xValue < minBuffer[0])
                minBuffer[0] = xValue;
            else if (xValue > maxBuffer[0])
                maxBuffer[0] = xValue;
            if (yValue < minBuffer[1])
                minBuffer[1] = yValue;
            else if (yValue > maxBuffer[1])
                maxBuffer[1] = yValue;
            if (zValue < minBuffer[2])
                minBuffer[2] = zValue;
            else if (zValue > maxBuffer[2])
                maxBuffer[2] = zValue;
        }
    }
}

Here is the part of the code that calls the compute shader:

void RunShader()
    {
        // Depth samples: upload the current frame, then bind it to the kernel.
        dataBuffer.SetData(depthDataFloat);
        computeShader.SetBuffer(_kernel, "dataBuffer", dataBuffer);

        // Min accumulator: reset to its initial contents, then bind.
        minDataBuffer.SetData(reinitialiseMinBuffer);
        computeShader.SetBuffer(_kernel, "minBuffer", minDataBuffer);

        // Max accumulator: reset to its initial contents, then bind.
        maxDataBuffer.SetData(reinitialiseMaxBuffer);
        computeShader.SetBuffer(_kernel, "maxBuffer", maxDataBuffer);

        // Mask selecting which pixels take part in the bounding box.
        maskBuffer.SetData(mask);
        computeShader.SetBuffer(_kernel, "mask", maskBuffer);

        // 212 groups in x; with the 1024 threads per group declared in the
        // shader this is 217088 threads, one per pixel of the 512 x 424 image.
        computeShader.Dispatch(_kernel, 212, 1, 1);
    }

1 Answer

1

In your case you do not handle data races, so multiple threads can write to the same location at the same time.

To make sure that your writes are atomic, you need to use the interlocked functions. Those only work on int/uint values, but in your case (assuming the depth data is always >= 0) that is fine: for non-negative floats, comparing the raw bit patterns as unsigned integers gives the same ordering as comparing the float values themselves.

Here is the modified shader :

#pragma kernel ComputeBoundingBox

// Thread-group dimensions. The depth image is processed as a flat 1D array,
// so only the x dimension is larger than one.
#define thread_group_size_x 1024
#define thread_group_size_y 1
#define thread_group_size_z 1

// Size of the depth image in pixels. No trailing ';' so the macros can be
// used safely inside expressions.
#define width 512
#define height 424
#define pixel_count (width * height)

// Only depths strictly inside this range contribute to the bounding box.
static const float kMinDepth = 500.0;
static const float kMaxDepth = 1500.0;

// dataBuffer = depth data of the camera, one float per pixel (read-only).
// mask       = image area to process; a pixel is used when mask > 0.49 (read-only).
// minBuffer, maxBuffer = 3 elements each, holding min/max of x, y and z.
//   They are declared as uint because the Interlocked* intrinsics only accept
//   int/uint destinations. Every value is stored as the bit pattern of a
//   non-negative float: for such floats, unsigned ordering of the bit patterns
//   equals numeric ordering, so InterlockedMin/Max yield the float min/max.
//   NOTE(review): this assumes the host reads the results back as float[3];
//   confirm against the calling code.
StructuredBuffer<float> dataBuffer;
RWStructuredBuffer<uint> minBuffer;
RWStructuredBuffer<uint> maxBuffer;
StructuredBuffer<float> mask;

// Kernel entry point: one thread per depth pixel, indexed by id.x.
// Each qualifying pixel atomically merges its (x, y, depth) into the shared
// min/max buffers. The buffers must have been reset beforehand (min to a
// large value, max to zero).
[numthreads(thread_group_size_x, thread_group_size_y, thread_group_size_z)]
void ComputeBoundingBox(uint3 id : SV_DispatchThreadID)
{
    const uint pixelIndex = id.x;

    // Guard against dispatches that launch more threads than there are pixels.
    if (pixelIndex >= (uint)pixel_count)
        return;

    // Skip pixels outside the mask. Written as !(a > b) so a NaN mask value
    // is rejected exactly as the original 'if (mask > 0.49)' test rejects it.
    if (!(mask[pixelIndex] > 0.49))
        return;

    // The range test must be done on the float depth itself; comparing the
    // raw bit pattern against 500/1500 would reject every real depth value.
    const float depth = dataBuffer[pixelIndex];
    if (!(depth > kMinDepth && depth < kMaxDepth))
        return;

    // Row-major flat index -> 2D pixel coordinates (no +1 offset, which would
    // map the last column of each row to x = 0 of the next row).
    const uint column = pixelIndex % (uint)width;
    const uint row = pixelIndex / (uint)width;

    // Convert everything to float bit patterns so all three components share
    // one representation in the min/max buffers.
    const uint xBits = asuint((float)column);
    const uint yBits = asuint((float)row);
    const uint zBits = asuint(depth);

    InterlockedMin(minBuffer[0], xBits);
    InterlockedMax(maxBuffer[0], xBits);

    InterlockedMin(minBuffer[1], yBits);
    InterlockedMax(maxBuffer[1], yBits);

    InterlockedMin(minBuffer[2], zBits);
    InterlockedMax(maxBuffer[2], zBits);
}

I also declared dataBuffer and mask as StructuredBuffer (since you only read from them, it is generally faster to bind them as read-only).

Also, you need to ensure that your min/max buffers are first cleared with suitable values (that is, before calling that shader).

This can be done with a simple compute shader (dispatch a single thread):

#pragma kernel ClearBuffers

// Min/max accumulators, 3 elements each (x, y, z). Declared as uint so the
// sentinel bit patterns are stored exactly as written; routing 0xffffffff
// through a float (it is a NaN pattern) does not guarantee the bits survive.
// uint is also the element type the Interlocked* intrinsics operate on.
RWStructuredBuffer<uint> minBuffer;
RWStructuredBuffer<uint> maxBuffer;

// Resets the accumulators before a bounding-box pass. Dispatch a single
// thread group (1, 1, 1).
//   minBuffer[i] = largest uint, so any later InterlockedMin replaces it.
//   maxBuffer[i] = 0, so any later InterlockedMax replaces it.
[numthreads(1, 1, 1)]
void ClearBuffers(uint3 id : SV_DispatchThreadID)
{
    const uint maxUint = 0xffffffff;
    const uint minUint = 0;

    [unroll]
    for (uint axis = 0; axis < 3; ++axis)
    {
        minBuffer[axis] = maxUint;
        maxBuffer[axis] = minUint;
    }
}

Please note that aliasing the same 32 bits as uint or float works in this case, so you don't need to perform any numeric conversion on the stored values.

mrvux
  • 8,523
  • 1
  • 27
  • 61