If you read the CUDA 9RC programming guide (section B.15), installed with your copy of CUDA 9RC, you will see that the new __shfl_sync()
function has an additional mask
parameter which you are not accounting for:
CUDA 8:
int __shfl(int var, int srcLane, int width=warpSize);
CUDA 9:
T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
^^^^^^^^^^^^^
The expectation for this mask parameter is also indicated:
The new *_sync shfl intrinsics take in a mask indicating the threads participating in the
call. A bit, representing the thread's lane id, must be set for each participating thread to
ensure they are properly converged before the intrinsic is executed by the hardware. All
non-exited threads named in mask must execute the same intrinsic with the same mask,
or the result is undefined.
Therefore, if we modify your code to conform with this, we get the expected result:
$ cat t419.cu
#include <stdio.h>
__global__
static void shflTest(int lid){
int tid = threadIdx.x;
float value = tid + 0.1f;
int* ivalue = reinterpret_cast<int*>(&value);
//use the integer shfl
int ix = __shfl(ivalue[0],5,32);
int iy = __shfl_sync(0xFFFFFFFF, ivalue[0],5,32);
float x = reinterpret_cast<float*>(&ix)[0];
float y = reinterpret_cast<float*>(&iy)[0];
if(tid == lid){
printf("shfl tmp %d %d\n",ix,iy);
printf("shfl final %f %f\n",x,y);
}
}
int main()
{
shflTest<<<1,32>>>(0);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_61 -o t419 t419.cu
t419.cu(10): warning: function "__shfl(int, int, int)"
/usr/local/cuda/bin/..//include/sm_30_intrinsics.hpp(152): here was declared deprecated ("__shfl() is deprecated in favor of __shfl_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning).")
$ cuda-memcheck ./t419
========= CUDA-MEMCHECK
shfl tmp 1084437299 1084437299
shfl final 5.100000 5.100000
========= ERROR SUMMARY: 0 errors
$