-1

I am trying to do parallel sum scan on a test vector. I am using both Thrust and CUB library for this purpose

struct CustomSum
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
        T operator()(const T &a, const T &b) const {
            return a + b;
        }
};
    // 2d array stored in row-major order [(0,0), (0,1), (0,2), ... ]
    thrust::host_vector<int> hVec_I1(SIZE_IMG, 1);
    thrust::host_vector<int> hVec_I2(SIZE_IMG, 1);
    thrust::host_vector<int> h_out(SIZE_IMG, 1);

    CustomSum sum_op;
    // Innitialize vector with synthetic image:
    initialize(N, N, hVec_I1, hVec_I2);

    // Compute Integral Image M1 and M2
    thrust::device_vector<int> dVec_M1 = hVec_I1;
    thrust::device_vector<int> dVec_M2 = hVec_I2;
    thrust::device_vector<int> d_o = h_out;

    //thrust::device_ptr<double> d_in = dVec_M1.data();
    //thrust::device_ptr<double> d_out1 = d_out.data();
    int* d_in = thrust::raw_pointer_cast(&dVec_M1[0]);
    int *d_out = thrust::raw_pointer_cast(&d_o[0]);
    //d_in = thrust::raw_pointer_cast(dVec_M2.data());

    //thrust::device_vector<int> d_out;
    //int *d_out = thrust::raw_pointer_cast(dVec_M1.data());
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    // Run inclusive prefix sum-scan
    cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, sum_op, SIZE_IMG);
    // Allocate temporary storage for inclusive prefix scan
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Run inclusive prefix sum-scan
    cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, sum_op, SIZE_IMG);

The error I am getting is

Error   43  error : calling a __host__ function("CustomSum::operator ()<int> ") from a __device__ function("cub::TilePrefixCallbackOp<int, CustomSum, cub::ScanTileState<int, (bool)1> > ::operator ()") is not allowed c:\users\asu_cuda_laptop\documents\visual studio 2013\projects\stats_kernel\cub\agent\single_pass_scan_operators.cuh    747 1   stats_kernel

I could not interpret the error correctly and I am sure there is a problem with the way I am handling raw pointers. Any help is appreciated.

Related link: How to use CUB and Thrust in one CUDA code

Community
  • 1
  • 1
Gaara
  • 695
  • 3
  • 8
  • 23

1 Answers1

2

Try defining CustomSum::operator() as a __device__ function. More on __host__ vs __device__ functions in the CUDA C programming guide.

  • The macro `CUB_RUNTIME_FUNCTION` is defined as `__host__ __device__` if the correct compilation trajectory is used – talonmies May 13 '16 at 08:44
  • @talonmies so if Gaara adds \_\_device__ and it still doesn't work, we can at least preclude that CUB_RUNTIME_FUNCTION is defined wrongly. –  May 13 '16 at 10:39
  • I don't believe the principal intent of `CUB_RUNTIME_FUNCTION` is for providing `__host__ __device__` decorations for the general use case of functors which must *always* be decorated so, but is instead intended to provide the `__device__` marking on functions that may additionally be called from device code in a CDP setting. It's evident that [typical CUB functor usage](https://nvlabs.github.io/cub/thread__operators_8cuh_source.html) expects explicit decoration of the functor. Therefore I believe this answer is correct. – Robert Crovella May 13 '16 at 13:01
  • Even the [related link](http://stackoverflow.com/questions/26812038/how-to-use-cub-and-thrust-in-one-cuda-code) provided in OP's posting shows `CustomSum` explicitly decorated, so it's a mystery why OP would drop that decoration. – Robert Crovella May 13 '16 at 13:03