I would concur with the answer (now deleted) and comment by @harrism. Since I put some effort into the non-thrust approach, I'll present my findings.
I tried to naively implement a binary search at the warp-level using __shfl()
, and then repeat that binary search across the data set, passing the data set through each 32-element group.
It's embarrassing, but my code is around 20x slower than thrust (in fact it may be worse than that if you do careful timing with nvprof
).
I made the data sizes a little larger than what was proposed in the question, because the data sizes in the question are so small that the timing is in the dust.
Here's a fully worked example of 2 approaches:
What is approximately outlined in the question, i.e. create a binary search using warp shuffle that can search up to 32 elements against a 32-element ordered array. Repeat this process for as many 32-element ordered arrays as there are, passing the entire data set through each ordered array (hopefully you can start to see some of the inefficiency now.)
Use thrust, essentially the same as what is outlined by @harrism, i.e. sort the grouped data set, and then run a vectorized thrust::binary_search on that.
Here's the example:
$ cat t1030.cu
#include <stdio.h>
#include <assert.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
typedef long mytype;
const int gsize = 32;
const int nGRP = 512;
const int dsize = nGRP*gsize;//gsize*nGRP;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
template <typename T>
__device__ T my_shfl32(T val, unsigned lane){
return __shfl(val, lane);
}
template <typename T>
__device__ T my_shfl64(T val, unsigned lane){
T retval = val;
int2 t1 = *(reinterpret_cast<int2 *>(&retval));
t1.x = __shfl(t1.x, lane);
t1.y = __shfl(t1.y, lane);
retval = *(reinterpret_cast<T *>(&t1));
return retval;
}
template <typename T>
__device__ bool bsearch_shfl(T grp_val, T my_val){
int src_lane = gsize>>1;
bool return_val = false;
T test_val;
int shift = gsize>>2;
for (int i = 0; i <= gsize>>3; i++){
if (sizeof(T)==4){
test_val = my_shfl32(grp_val, src_lane);}
else if (sizeof(T)==8){
test_val = my_shfl64(grp_val, src_lane);}
else assert(0);
if (test_val == my_val) return_val = true;
src_lane += (((test_val<my_val)*2)-1)*shift;
shift>>=1;
assert ((src_lane < gsize)&&(src_lane > 0));}
if (sizeof(T)==4){
test_val = my_shfl32(grp_val, 0);}
else if (sizeof(T)==8){
test_val = my_shfl64(grp_val, 0);}
else assert(0);
if (test_val == my_val) return_val = true;
return return_val;
}
template <typename T>
__global__ void bsearch_grp(const T * __restrict__ search_grps, T *data){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int tid = threadIdx.x;
if (idx < gsize*nGRP){
T grp_val = search_grps[idx];
while (tid < dsize){
T my_val = data[tid];
if (bsearch_shfl(grp_val, my_val)) data[tid] = -1;
tid += blockDim.x;}
}
}
int main(){
// data setup
assert(gsize == 32); //mandatory (warp size)
assert((dsize % 32)==0); //needed to preserve shfl capability
thrust::host_vector<mytype> grps(gsize*nGRP);
thrust::host_vector<mytype> data(dsize);
thrust::host_vector<mytype> result(dsize);
for (int i = 0; i < gsize*nGRP; i++) grps[i] = i;
for (int i = 0; i < dsize; i++) data[i] = i;
// method 1: individual shfl-based binary searches on each group
mytype *d_grps, *d_data;
cudaMalloc(&d_grps, gsize*nGRP*sizeof(mytype));
cudaMalloc(&d_data, dsize*sizeof(mytype));
cudaMemcpy(d_grps, &(grps[0]), gsize*nGRP*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_data, &(data[0]), dsize*sizeof(mytype), cudaMemcpyHostToDevice);
unsigned long long my_time = dtime_usec(0);
bsearch_grp<<<nGRP, gsize>>>(d_grps, d_data);
cudaDeviceSynchronize();
my_time = dtime_usec(my_time);
cudaMemcpy(&(result[0]), d_data, dsize*sizeof(mytype), cudaMemcpyDeviceToHost);
for (int i = 0; i < dsize; i++) if (result[i] != -1) {printf("method 1 mismatch at %d, was %d, should be -1\n", i, (int)(result[i])); return 1;}
printf("method 1 time: %fs\n", my_time/(float)USECPSEC);
// method 2: thrust sort, followed by thrust binary search
thrust::device_vector<mytype> t_grps = grps;
thrust::device_vector<mytype> t_data = data;
thrust::device_vector<bool> t_rslt(t_data.size());
my_time = dtime_usec(0);
thrust::sort(t_grps.begin(), t_grps.end());
thrust::binary_search(t_grps.begin(), t_grps.end(), t_data.begin(), t_data.end(), t_rslt.begin());
cudaDeviceSynchronize();
my_time = dtime_usec(my_time);
thrust::host_vector<bool> rslt = t_rslt;
for (int i = 0; i < dsize; i++) if (rslt[i] != true) {printf("method 2 mismatch at %d, was %d, should be 1\n", i, (int)(rslt[i])); return 1;}
printf("method 2 time: %fs\n", my_time/(float)USECPSEC);
// method 3: multiple thrust merges, followed by thrust binary search
return 0;
}
$ nvcc -O3 -arch=sm_35 t1030.cu -o t1030
$ ./t1030
method 1 time: 0.009075s
method 2 time: 0.000516s
$
I was running this on linux, CUDA 7.5, GT640 GPU. Obviously the performance will be different on different GPUs, but I'd be surprised if any GPU significantly closed the gap.
In short, you'd be well advised to use a well-tuned library like thrust or cub. If you don't like the monolithic nature of thrust, you could try cub. I don't know if cub has a binary search, but a single binary search against the whole sorted data set is not a difficult thing to write, and it's the smaller part of the time involved (for method 2 -- identifiable using nvprof
or additional timing code).
Since your 32-element grouped ranges are already sorted, I also pondered the idea of using multiple thrust::merge
operations rather than a single sort. I'm not sure which would be faster, but since the thrust method is already so much faster than the 32-element shuffle search method, I think thrust (or cub) is the obvious choice.