0

Basically, is it possible to implement the 1D stencil kernel shown below using pure Thrust? I want this implementation to be as efficient as possible, which implies that Thrust should somehow know that the same elements are accessed multiple times and that shared memory needs to be used.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#define BLOCK_SIZE 128      // threads per block (1D launch)
#define RADIUS 8            // stencil half-width: window is 2*RADIUS+1 elements
#define SIZE 1024*1024*8    // number of input elements (8M ints)
// SIZE is an exact multiple of BLOCK_SIZE, so this grid covers every
// element with no partial tail block.
const dim3 DimBlock(BLOCK_SIZE);
const dim3 DimGrid(SIZE/BLOCK_SIZE);

// 1D sum stencil: out[i] = sum of in[i-RADIUS .. i+RADIUS], with
// out-of-range neighbours treated as 0.
// Expected launch: 1D grid, blockDim.x == BLOCK_SIZE, grid covering SIZE
// elements. Uses BLOCK_SIZE + 2*RADIUS ints of static shared memory per
// block: one tile plus a halo of RADIUS elements on each side.
__global__ void stencil_1d(const int * __restrict__ in, int * __restrict__ out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Stage this thread's element into the tile; zero-pad past the end.
    temp[lindex] = (gindex < SIZE) ? in[gindex] : 0;
    // The first RADIUS threads additionally fill the left and right halos,
    // zero-padding where the halo falls outside [0, SIZE).
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS]     = (gindex - RADIUS >= 0)       ? in[gindex - RADIUS]     : 0;
        temp[lindex + BLOCK_SIZE] = (gindex + BLOCK_SIZE < SIZE) ? in[gindex + BLOCK_SIZE] : 0;
    }

    // Barrier reached by every thread in the block (no divergent path
    // around it): the full tile, halos included, is visible before reads.
    __syncthreads();

    // Guard once here instead of re-testing gindex on every loop iteration
    // as the original did; threads past the end simply do nothing.
    if (gindex < SIZE) {
        int result = 0;
        #pragma unroll
        for (int offset = -RADIUS; offset <= RADIUS; offset++)
            result += temp[lindex + offset];
        out[gindex] = result;
    }
}

int main()
{
    cudaError_t cudaStat;
    // device_vector owns the allocations (freed automatically on scope
    // exit); input is initialized to all 1s, so every interior output
    // element should be 2*RADIUS+1.
    thrust::device_vector<int> dev_vec_inp(SIZE,1);
    thrust::device_vector<int> dev_vec_out(SIZE);
    try 
    {
        stencil_1d<<< DimGrid, DimBlock >>>(thrust::raw_pointer_cast(dev_vec_inp.data()) , thrust::raw_pointer_cast(dev_vec_out.data()));
        // cudaGetLastError only reports *launch* failures (bad config,
        // invalid grid dimensions, ...). It does not wait for the kernel.
        cudaStat =  cudaGetLastError(); 
        if (cudaStat != cudaSuccess)
        throw cudaGetErrorString(cudaStat);
        // Kernel launches are asynchronous: synchronize so that errors
        // raised *during* execution (e.g. an illegal address) are also
        // detected before we report success.
        cudaStat = cudaDeviceSynchronize();
        if (cudaStat != cudaSuccess)
        throw cudaGetErrorString(cudaStat);
        std::cout<<"1D stencil has been executed successfully" << std:: endl;
    }   
    catch(const char* e)
    {
        std::cout<< e;
    }
}
paleonix
  • 2,293
  • 1
  • 13
  • 29
user3786219
  • 177
  • 1
  • 2
  • 11
  • 1
    It's generally not possible to get thrust to use shared memory for work like this, unless you choose to create your own functor that essentially bypasses thrust. Even with your own functor, it may be difficult to construct. – Robert Crovella Aug 08 '20 at 15:11
  • @RobertCrovella Thank you. Then I will just use cuda kernels for stencil computations and thrust vectors for memory management. – user3786219 Aug 08 '20 at 15:23
  • 2
    Thrust has an adjacent difference that can calculate a difference table from which you could construct whatever stencil you require – talonmies Aug 08 '20 at 15:48

1 Answer

3

For this particular type of stencil op (+) you could use a prefix sum method. You would perform a prefix sum first, then use the stencil radius to subtract the left end of the stencil window from the right end of the stencil window.

Here is a rough sketch:

$ cat t1777.cu
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/copy.h>
#include <thrust/transform.h>
#include <thrust/host_vector.h>
#include <iostream>

const int ds = 20;                      // data size (number of input elements)
const int stencil_radius = 7;           // stencil half-width
using namespace thrust::placeholders;   // enables _1, _2 lambdas in transform
typedef int mt;                         // element ("math") type

// "Pure thrust" stencil built from a prefix sum: scan the input in place,
// then subtract the scan value at the left edge of each window from the
// value at the right edge.
// NOTE(review): as written the windowed sum spans 2*stencil_radius
// elements (hence 14 for all-ones input), matching the answer's stated
// "rough sketch"; a properly centered 2*radius+1 window would need the
// slight modification the author mentions.
int main(){

  const int window = 2 * stencil_radius;
  thrust::device_vector<mt> data(ds, 1);
  thrust::device_vector<mt> result(ds - window);
  // In-place inclusive prefix sum over the input.
  thrust::inclusive_scan(data.begin(), data.end(), data.begin());
  // result[i] = scan[i + window] - scan[i]  (difference of window edges).
  thrust::transform(data.begin(), data.end() - window,
                    data.begin() + window,
                    result.begin(),
                    _2 - _1);
  // Copy back to the host and print comma-separated.
  thrust::host_vector<mt> h_result(result);
  thrust::copy(h_result.begin(), h_result.end(),
               std::ostream_iterator<mt>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -o t1777 t1777.cu
$ ./t1777
14,14,14,14,14,14,
$

If you run the above code under a profiler, you can determine that one of the kernels thrust launches for the prefix sum op does use shared memory.

I really haven't tried to create a proper stencil window consisting of a radius on the left, radius on the right, and the stencil position in the center, but that should only require slight modification to what I have shown above.

I'm not making any claims that this is better or faster than the "pure CUDA" method, but it seems to be one way to perform it using a "pure thrust" method, with thrust making use of shared memory.

I'm not suggesting this code is defect-free or suitable for any particular purpose. It is provided merely to demonstrate an idea. Use it at your own risk.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257