After the label markers operation, if we then compress the label markers, we can realize a fairly simple approach for identifying bounding boxes, using atomicMax and atomicMin in a simple CUDA kernel.
Here is a worked example:
$ cat t1461.cu
#include <stdio.h>
#include <nppi_filtering_functions.h>
#include <assert.h>
#define WIDTH 16
#define HEIGHT 16
// print a label image, rendering the 255 marker value as an asterisk
void my_print(Npp16u *data, int w, int h){
  for (int i = 0; i < h; i++)
  {
    for (int j = 0; j < w; j++)
    {
      if (data[i*w+j] == 255) printf(" *");
      else printf("%3hu", data[i * w + j]);
    }
    printf("\n");
  }
}
// one thread per pixel: every nonzero pixel carries a (compressed) label
// in 1..max, so it atomically widens that label's bounding-box extents
template <typename T>
__global__ void bb(const T * __restrict__ i, int * __restrict__ maxh, int * __restrict__ minh, int * __restrict__ maxw, int * __restrict__ minw, int height, int width){
  int idx = threadIdx.x+blockDim.x*blockIdx.x;
  int idy = threadIdx.y+blockDim.y*blockIdx.y;
  if ((idx < width) && (idy < height)){
    T myval = i[idy*width+idx];
    if (myval > 0){
      atomicMax(maxw+myval-1, idx);   // labels are 1-based, arrays 0-based
      atomicMin(minw+myval-1, idx);
      atomicMax(maxh+myval-1, idy);
      atomicMin(minh+myval-1, idy);}
  }
}
int main(){
  // source image: 255 marks foreground pixels, 0 is background
  Npp16u host_src[WIDTH * HEIGHT] =
  {
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0,255,255,255,255, 0, 0,255, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0,255,255,255, 0, 0, 0,255,255,255,
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
    0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255, 0, 0, 0, 0,255, 0, 0, 0, 0, 0, 0, 0,
    0, 255,255,255,255, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
    0, 0,255,255,255, 0,255,255,255,255,255, 0, 0, 0, 0, 0,
    0, 0, 0,255, 0, 0, 0,255,255,255, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,255, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0,255,255,255,
    0, 255,255,255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };
  Npp16u * device_src;
  cudaMalloc((void**)&device_src, sizeof(Npp16u) * WIDTH * HEIGHT);
  cudaMemcpy(device_src, host_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyHostToDevice);
  int buffer_size;
  NppiSize source_roi = { WIDTH, HEIGHT };
  NppStatus e = nppiLabelMarkersGetBufferSize_16u_C1R(source_roi, &buffer_size);
  assert(e == NPP_NO_ERROR);
  Npp8u * buffer;
  cudaMalloc((void**)&buffer, buffer_size);
  int max;
  // label the connected regions; max receives the largest label value used
  e = nppiLabelMarkers_16u_C1IR(device_src, sizeof(Npp16u) * WIDTH, source_roi, (Npp16u)1, nppiNormInf, &max, buffer);
  assert(e == NPP_NO_ERROR);
  printf("initial max: %d\n", max);
  int bs;
  // the compress step is sized from the label count that nppiLabelMarkers
  // just returned, and may need a larger scratch buffer
  e = nppiCompressMarkerLabelsGetBufferSize_16u_C1R(max, &bs);
  assert(e == NPP_NO_ERROR);
  if (bs > buffer_size){
    buffer_size = bs;
    cudaFree(buffer);
    cudaMalloc(&buffer, buffer_size);}
  // renumber the surviving labels into a dense 1..max range
  e = nppiCompressMarkerLabels_16u_C1IR(device_src, sizeof(Npp16u)*WIDTH, source_roi, max, &max, buffer);
  assert(e == NPP_NO_ERROR);
  int *maxw, *maxh, *minw, *minh, *d_maxw, *d_maxh, *d_minw, *d_minh;
  maxw = new int[max];
  maxh = new int[max];
  minw = new int[max];
  minh = new int[max];
  cudaMalloc(&d_maxw, max*sizeof(int));
  cudaMalloc(&d_maxh, max*sizeof(int));
  cudaMalloc(&d_minw, max*sizeof(int));
  cudaMalloc(&d_minh, max*sizeof(int));
  // sentinel initialization: max extents start at 0 and min extents start
  // past the image edge, so the first pixel of each label overwrites them
  for (int i = 0; i < max; i++){
    maxw[i] = 0;
    maxh[i] = 0;
    minw[i] = WIDTH;
    minh[i] = HEIGHT;}
  cudaMemcpy(d_maxw, maxw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_maxh, maxh, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minw, minw, max*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_minh, minh, max*sizeof(int), cudaMemcpyHostToDevice);
  dim3 block(32,32);
  dim3 grid((WIDTH+block.x-1)/block.x, (HEIGHT+block.y-1)/block.y);
  bb<<<grid, block>>>(device_src, d_maxh, d_minh, d_maxw, d_minw, HEIGHT, WIDTH);
  cudaMemcpy(maxw, d_maxw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(maxh, d_maxh, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minw, d_minw, max*sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(minh, d_minh, max*sizeof(int), cudaMemcpyDeviceToHost);
  Npp16u *dst = new Npp16u[WIDTH * HEIGHT];
  cudaMemcpy(dst, device_src, sizeof(Npp16u) * WIDTH * HEIGHT, cudaMemcpyDeviceToHost);
  printf("*******INPUT************\n");
  my_print(host_src, WIDTH, HEIGHT);
  printf("******OUTPUT************\n");
  my_print(dst, WIDTH, HEIGHT);
  printf("compressed max: %d\n", max);
  printf("bounding boxes:\n");
  for (int i = 0; i < max; i++)
    printf("label %d, maxh: %d, minh: %d, maxw: %d, minw: %d\n", i+1, maxh[i], minh[i], maxw[i], minw[i]);
}
$ nvcc -o t1461 t1461.cu -lnppif
$ cuda-memcheck ./t1461
========= CUDA-MEMCHECK
initial max: 10
*******INPUT************
0 0 0 0 0 0 0 0 * 0 0 0 0 0 0 0
0 * * * 0 0 * * * 0 0 0 0 0 0 0
0 * * * 0 0 * * * 0 0 0 0 0 0 0
0 * * * 0 0 * * * * 0 0 * 0 0 0
0 0 0 0 0 0 0 * * * 0 0 0 * * *
0 0 0 0 0 0 0 0 * 0 0 0 0 * * *
0 0 0 * 0 0 0 0 0 0 0 0 0 * * *
0 * * 0 0 0 0 0 0 0 0 0 0 0 0 0
0 * * * 0 0 0 0 * 0 0 0 0 0 0 0
0 * * * * 0 0 * * * 0 0 0 0 0 0
0 0 * * * 0 * * * * * 0 0 0 0 0
0 0 0 * 0 0 0 * * * 0 0 0 0 0 0
0 0 0 0 0 0 0 0 * 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 * * *
0 * * * 0 0 0 0 0 0 0 0 0 0 0 0
******OUTPUT************
0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 0 0 0 0 0 0 0
0 2 2 2 0 0 1 1 1 1 0 0 3 0 0 0
0 0 0 0 0 0 0 1 1 1 0 0 0 3 3 3
0 0 0 0 0 0 0 0 1 0 0 0 0 3 3 3
0 0 0 4 0 0 0 0 0 0 0 0 0 3 3 3
0 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0
0 4 4 4 0 0 0 0 5 0 0 0 0 0 0 0
0 4 4 4 4 0 0 5 5 5 0 0 0 0 0 0
0 0 4 4 4 0 5 5 5 5 5 0 0 0 0 0
0 0 0 4 0 0 0 5 5 5 0 0 0 0 0 0
0 0 0 0 0 0 0 0 5 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 6 6 6
0 7 7 7 0 0 0 0 0 0 0 0 0 0 0 0
compressed max: 7
bounding boxes:
label 1, maxh: 5, minh: 0, maxw: 9, minw: 6
label 2, maxh: 3, minh: 1, maxw: 3, minw: 1
label 3, maxh: 6, minh: 3, maxw: 15, minw: 12
label 4, maxh: 11, minh: 6, maxw: 4, minw: 1
label 5, maxh: 12, minh: 8, maxw: 10, minw: 6
label 6, maxh: 14, minh: 12, maxw: 15, minw: 13
label 7, maxh: 15, minh: 13, maxw: 3, minw: 1
========= ERROR SUMMARY: 0 errors
$
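The kernel records inclusive pixel extents per label, so converting to a corner-plus-size form is a one-liner per coordinate. Here is a minimal host-side sketch; the box struct and make_box helper are illustrative names I've made up, not part of NPP:

struct box { int x, y, w, h; };  // hypothetical container, not from NPP

// convert the inclusive per-label extents gathered by the bb kernel
// into top-left corner + width/height form
box make_box(int minw, int minh, int maxw, int maxh){
  box b;
  b.x = minw;
  b.y = minh;
  b.w = maxw - minw + 1;  // +1 because the extents are inclusive indices
  b.h = maxh - minh + 1;
  return b;
}

For example, label 2 above (maxh: 3, minh: 1, maxw: 3, minw: 1) becomes a 3x3 box with its top-left corner at (1,1).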
Note that if you're going to do this repetitively (for example, identifying bounding boxes on video frames), you'll want to get the cudaMalloc operations mostly out of the performance loop. A typical approach would be to use the methodology that I have already shown for allocation of buffer in the code above: only free and reallocate a buffer if the previous size is too small. Likewise for the max and min buffers; a sketch of that pattern follows.
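A minimal sketch of that grow-only reuse pattern, with a hypothetical ensure_capacity helper (not part of NPP or CUDA) called once per frame for the scratch buffer and once per min/max array:

#include <cuda_runtime.h>

// reallocate only when the existing allocation is too small, so
// steady-state frames perform no cudaMalloc at all
void ensure_capacity(void **d_ptr, size_t *current_bytes, size_t needed_bytes){
  if (needed_bytes > *current_bytes){
    if (*d_ptr) cudaFree(*d_ptr);
    cudaMalloc(d_ptr, needed_bytes);
    *current_bytes = needed_bytes;
  }
}

// per-frame usage (buffer_bytes and maxw_bytes persist across frames):
//   ensure_capacity((void **)&buffer, &buffer_bytes, (size_t)bs);
//   ensure_capacity((void **)&d_maxw, &maxw_bytes, max*sizeof(int));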