
I'm writing a CUDA kernel in which each thread has to complete the following task: given an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, find the array index i such that a[i] <= threadIdx.x and a[i + 1] > threadIdx.x. For example, with a = {0, 3, 7, 9} and threadIdx.x == 5, the answer is i == 1, since a[1] <= 5 and a[2] > 5.

A naive solution could be:

for (i = 0; i < n - 1; i++)
    if (a[i + 1] > threadIdx.x) break;

but I suppose this is not the optimal way to do it... can anyone suggest anything better?

Filippo Bistaffa

3 Answers


Like Robert, I was thinking that a binary search has got to be faster than a naïve loop -- the upper bound of the operation count for a binary search is O(log n), compared to O(n) for the loop.

My extremely simple implementation:

#include <iostream>
#include <climits>
#include <assert.h>

// overflow-safe midpoint calculation
__device__ __host__
int midpoint(int a, int b)
{
    return a + (b-a)/2;
}

// returns 0 if i is the wanted index, -1 if the wanted index lies to
// the right of i, and 1 if it lies to the left of i
__device__ __host__
int eval(int A[], int i, int val)
{
    int low = (A[i] <= val);
    int high = (A[i+1] > val);

    if (low && high) {
        return 0;
    } else if (low) {
        return -1;
    } else {
        return 1;
    }
}

__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
    while (imax >= imin) {
        int imid = midpoint(imin, imax);
        int e = eval(A, imid, val);
        if (e == 0) {
            return imid;
        } else if (e < 0) {
            imin = imid + 1;  // A[imid+1] <= val: the answer lies to the right
        } else {
            imax = imid - 1;  // A[imid] > val: the answer lies to the left
        }
    }

    return -1;
}


// reference implementation: the same naive loop as in the question
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
    int res = -1;
    for (int i = imin; i < (imax - 1); i++) {
        if (A[i+1] > val) {
            res = i;
            break;
        }
    }

    return res;
}

template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
    extern __shared__ int buff[];
    int tid = threadIdx.x + blockIdx.x*blockDim.x;

    // stage one value per thread into shared memory, padding
    // out-of-range threads with INT_MAX
    int val = INT_MAX;
    if (tid < Nin) val = source[tid];
    buff[threadIdx.x] = val;
    __syncthreads();

    int res;
    switch(version) {

        case 0:
        // pass blockDim.x - 2 as imax: the largest index whose
        // successor is still within the shared buffer
        res = binary_search(buff, threadIdx.x, 0, blockDim.x - 2);
        break;

        case 1:
        res = linear_search(buff, threadIdx.x, 0, blockDim.x);
        break;
    }

    if (tid < Nout) result[tid] = res; 
}

int main(void)
{
    const int inputLength = 128000;
    const int isize = inputLength * sizeof(int);
    const int outputLength = 256;
    const int osize = outputLength * sizeof(int);

    int * hostInput = new int[inputLength];
    int * hostOutput = new int[outputLength];
    int * deviceInput;
    int * deviceOutput;

    for(int i=0; i<inputLength; i++) {
        hostInput[i] = -200 + 5*i;
    }

    cudaMalloc((void**)&deviceInput, isize);
    cudaMalloc((void**)&deviceOutput, osize);

    cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);

    dim3 DimBlock(256, 1, 1);
    dim3 DimGrid(1, 1, 1);
    DimGrid.x = (outputLength / DimBlock.x) + 
                ((outputLength % DimBlock.x > 0) ? 1 : 0); 
    size_t shmsz = DimBlock.x * sizeof(int);

    for(int i=0; i<5; i++) {
        search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput, 
                inputLength, outputLength);
    }

    for(int i=0; i<5; i++) {
        search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
                inputLength, outputLength);
    }

    cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);

    for(int i=0; i<outputLength; i++) {
        int idx = hostOutput[i];
        int tidx = i % DimBlock.x;
        assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
    } 
    cudaFree(deviceInput);
    cudaFree(deviceOutput);
    delete[] hostInput;
    delete[] hostOutput;

    cudaDeviceReset();

    return 0;
}

gave about a five times speed up compared to the loop:

>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
 Time(%)      Time   Calls       Avg       Min       Max  Name
   60.11  157.85us       1  157.85us  157.85us  157.85us  [CUDA memcpy HtoD]
   32.58   85.55us       5   17.11us   16.63us   19.04us  void search<int=1>(int*, int*, int, int)
    6.52   17.13us       5    3.42us    3.35us    3.73us  void search<int=0>(int*, int*, int, int)
    0.79    2.08us       1    2.08us    2.08us    2.08us  [CUDA memcpy DtoH]

I'm sure that someone clever could do a lot better than that. But perhaps this gives you at least a few ideas.

talonmies
  • Upvoting, this is a better answer than mine, for at least 2 reasons: it's a complete worked example that makes a timing comparison, and it more accurately tracks the problem statement, which is limited to `threadIdx.x` scope and can therefore take advantage of shared memory. – Robert Crovella Feb 09 '14 at 21:53
  • In Programming Pearls, Jon Bentley describes a sweet binary search optimization that might pertain here. It involves careful initialization of the midpoint, then conditional updates using progressively-decreasing powers of 2. See "binarysearch4" in http://www.cs.bell-labs.com/cm/cs/pearls/search.c – ArchaeaSoftware Feb 10 '14 at 23:22
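For anyone curious, here is a rough sketch of the idea from the comment above, adapted to this problem rather than to Bentley's exact-match search (my adaptation, untested): when the search length is a known power of two (256 below, matching the block size used in the answer), the midpoint can start at 0 and the updates become fully-unrolled conditional steps of decreasing powers of 2, with no loop bookkeeping left at all.

__device__ int unrolled_search_256(const int *a, int val)
{
    // invariant: a[i] <= val, which holds initially because a[0] == 0;
    // each step advances i by a power of 2 only if that preserves it
    int i = 0;
    if (a[i + 128] <= val) i += 128;
    if (a[i +  64] <= val) i +=  64;
    if (a[i +  32] <= val) i +=  32;
    if (a[i +  16] <= val) i +=  16;
    if (a[i +   8] <= val) i +=   8;
    if (a[i +   4] <= val) i +=   4;
    if (a[i +   2] <= val) i +=   2;
    if (a[i +   1] <= val) i +=   1;
    return i;    // the largest i in [0, 255] with a[i] <= val
}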

can anyone suggest anything better?

A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).

// sets idx to the index of the first element in a that is 
// equal to or larger than key

__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx)
{
  unsigned lower = 0;
  unsigned upper = len_a;
  unsigned midpt;
  while (lower < upper) {
    midpt = (lower + upper) >> 1;
    if (a[midpt] < key) lower = midpt + 1;
    else upper = midpt;
  }
  *idx = lower;
}

__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx)
{
  unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  unsigned sp_a;
  int val = idx + 1;
  bsearch_range(a, val, len_a, &sp_a);
  // sp_a is the first index with a[sp_a] > idx; since a[0] == 0 <= idx,
  // sp_a >= 1 and a[sp_a - 1] <= idx, so the wanted index is sp_a - 1
  my_idx[idx] = (sp_a > 0) ? (int)(sp_a - 1) : -1;
}

This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).

This actually searches on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1). You can change val to be anything you like.

You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
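For example, a minimal sketch of that check (len_out is an assumed extra parameter giving the length of my_idx; it is not part of the kernel above):

__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx, const unsigned len_out)
{
  unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  if (idx >= len_out) return;   // excess threads simply exit
  unsigned sp_a;
  bsearch_range(a, (int)idx + 1, len_a, &sp_a);
  my_idx[idx] = (sp_a > 0) ? (int)(sp_a - 1) : -1;
}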

I imagine there is a more clever approach that may use something akin to prefix sums.

Robert Crovella

This is the best algorithm so far. It's called LPW Indexed Search:

__global__ void find_position_lpw(int *a, int n)
{
    int idx = threadIdx.x;

    __shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];

    // build a histogram of the input values in shared memory
    aux[idx] = 0;
    __syncthreads(); // every slot must be zeroed before counting starts

    if (idx < n)
        atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates

    __syncthreads();

    // inclusive prefix sum (Hillis-Steele scan) over the counts, so that
    // aux[v] becomes the number of elements <= v
    int tmp;

    for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
    {
        if( idx >= j ) tmp = aux[idx - j];
        __syncthreads();
        if( idx >= j ) aux[idx] += tmp;
        __syncthreads();
    }

    // result in "i": the index of the last element <= idx
    int i = aux[idx] - 1;

    // use "i" here...
    // ...
}
Drout
  • This code doesn't actually do what the original question is asking about -- the inputs are assumed to already be in shared memory. It also introduces a limitation which isn't in the original question: you are assuming all input values lie on the interval [0, MAX_THREADS_PER_BLOCK], and the code will fail if they do not. If I turn this into a device function and benchmark it against the naïve binary search from one of the other answers, it is twice as slow, as well as requiring double the amount of shared memory per block. I don't see much "best" here. – talonmies Jun 02 '20 at 14:25