How to use CUB and Thrust in one CUDA code

Question

I'm trying to introduce some CUB into my "old" Thrust code, and so have started with a small example to compare thrust::reduce_by_key with cub::DeviceReduce::ReduceByKey, both applied to thrust::device_vectors.

The thrust part of the code is fine, but the CUB part, which naively uses raw pointers obtained via thrust::raw_pointer_cast, crashes after the CUB calls. I put in a cudaDeviceSynchronize() to try to solve this problem, but it didn't help. The CUB part of the code was cribbed from the CUB web pages.

On OSX the runtime error is:

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6

On Linux the runtime error is:

terminate called after throwing an instance of 'thrust::system::system_error'
what():  an illegal memory access was encountered

The first few lines of cuda-memcheck are:

========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
=========     at 0x00127010 in /home/sdettrick/codes/MCthrust/tests/../cub-1.3.2/cub/device/dispatch/../../block_range/block_range_reduce_by_key.cuh:1017:void cub::ReduceByKeyRegionKernel<cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, unsigned int*, float*, float*, int*, cub::ReduceByKeyScanTileState<float, int, bool=1>, cub::Equality, CustomSum, int>(unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int, cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, int, cub::GridQueue<int>)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7fff7dbb3e88 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time

Unfortunately I'm not too sure what to do about that.

Any help would be greatly appreciated. I tried this on the NVIDIA developer zone but didn't get any responses. The complete example code is below. It should compile with CUDA 6.5 and cub 1.3.2:

#include <iostream>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>

//========================================
// for CUB:
struct CustomSum
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __host__ __device__ __forceinline__
    //__host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return b+a;
    }
};
//========================================

int main()
{
  const int Nkey=20;
  int Nseg=9;
  int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8}; 

  thrust::device_vector<unsigned int> key(ikey,ikey+Nkey);
  thrust::device_vector<unsigned int> keysout(Nkey);

  // Let's reduce x, by key:

  float xval[Nkey];
  for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;

  thrust::device_vector<float> x(xval,xval+Nkey);

  // First, sort x by key:

  thrust::sort_by_key(key.begin(),key.end(),x.begin());

  //---------------------------------------------------------------------
  std::cout<<"=================================================================="<<std::endl
       <<" THRUST reduce_by_key:"<<std::endl
       <<"=================================================================="<<std::endl;

  thrust::device_vector<float> output(Nseg,0.0f);

  thrust::reduce_by_key(key.begin(),
            key.end(),
            x.begin(),
            keysout.begin(),
            output.begin());

  for (int i=0;i<Nkey;i++) std::cout << x[i] <<" ";  std::cout<<std::endl;
  for (int i=0;i<Nkey;i++) std::cout << key[i] <<" ";  std::cout<<std::endl;
  for (int i=0;i<Nseg;i++) std::cout << output[i] <<" ";  std::cout<<std::endl;

  float ototal=thrust::reduce(output.begin(),output.end());
  float xtotal=thrust::reduce(x.begin(),x.end());
  std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;

  //---------------------------------------------------------------------
  std::cout<<"=================================================================="<<std::endl
       <<" CUB ReduceByKey:"<<std::endl
       <<"=================================================================="<<std::endl;


  unsigned int *d_keys_in   =thrust::raw_pointer_cast(&key[0]);
  float        *d_values_in =thrust::raw_pointer_cast(&x[0]);  
  unsigned int *d_keys_out  =thrust::raw_pointer_cast(&keysout[0]);
  float        *d_values_out=thrust::raw_pointer_cast(&output[0]);
  int          *d_num_segments=&Nseg;
  CustomSum   reduction_op;

  std::cout << "CUB input" << std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << key[i]  << " ";  std::cout<<std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << x[i] << " ";  std::cout<< std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " ";  std::cout<< std::endl;
  for (int i=0; i<Nseg; ++i) std::cout << output[i] << " ";  std::cout<< std::endl;

  // Determine temporary device storage requirements
  void     *d_temp_storage = NULL;
  size_t   temp_storage_bytes = 0;
  cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);

  // Allocate temporary storage
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;

  // Run reduce-by-key
  cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
  cudaDeviceSynchronize();

  std::cout << "CUB output" << std::endl;

  std::cout<<Nkey<<" "<<Nseg<<std::endl;
  std::cout<<key.size() << " "<<x.size() << " "<<keysout.size() << " "<<output.size() << std::endl;

  // At this point onward it dies:
  //libc++abi.dylib: terminate called throwing an exception
  //Abort trap: 6  

  // If the next line is uncommented, it crashes the Mac!
  for (int i=0; i<Nkey; ++i) std::cout << key[i]  << " ";  std::cout<<std::endl;
  // for (int i=0; i<Nkey; ++i) std::cout << x[i] << " ";  std::cout<< std::endl;
  // for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " ";  std::cout<< std::endl;
  // for (int i=0; i<Nseg; ++i) std::cout << output[i] << " ";  std::cout<< std::endl;
  cudaFree(d_temp_storage);

  ototal=thrust::reduce(output.begin(),output.end());
  xtotal=thrust::reduce(x.begin(),x.end());
  std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
  return 1;
}

score 2 · Answer 1 · answered Nov 08 '14 at 02:44

2

This is not appropriate:

 int          *d_num_segments=&Nseg;

You cannot take the address of a host variable and use it as a device pointer.

Instead do this:

int *d_num_segments;
cudaMalloc(&d_num_segments, sizeof(int));

This allocates space on the device for the size of data (a single integer that cub will write to), and assigns the address of that allocation to your d_num_segments variable. This then becomes a valid device pointer.

In (*ordinary, non-UM) CUDA, it is illegal dereference a host address in device code, or a device address in host code.

answered Nov 08 '14 at 02:44

Robert Crovella

143,785
11
213
257

Thanks! At the risk of stating the obvious, it was also necessary to add `cudaMemcpy(d_num_segments,&Nseg,sizeof(int),cudaMemcpyHostToDevice);` to achieve what I was trying to accomplish. – Sean Dettrick Nov 10 '14 at 13:23
I'm not sure why. CUB writes to that location/value. It is not an input to CUB, it is an output from CUB. Whatever you write there will get overwritten by CUB, as it counts up the number of segments that it finds. Take a look at [the documentation for that function and value](http://nvlabs.github.io/cub/structcub_1_1_device_reduce.html#a4822e04d8701b10ac3f2d28effb454d3) (NumSegmentsIterator) – Robert Crovella Nov 10 '14 at 13:42
Hmm, I thought I had an error without that statement, but now I've removed it and there is no error after all. – Sean Dettrick Nov 10 '14 at 16:39

How to use CUB and Thrust in one CUDA code

1 Answers1

Linked