Basically, is it possible to implement 1D stencil kernel shown below using pure Thrust? I want this implementation to be as efficient as possible which implies that Thrust should somehow know that there is a multiple access to the same elements and shared memory access needs to be used.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#define BLOCK_SIZE 128
#define RADIUS 8
#define SIZE 1024*1024*8
const dim3 DimBlock(BLOCK_SIZE);
const dim3 DimGrid(SIZE/BLOCK_SIZE);
__global__ void stencil_1d(const int * in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
if( gindex < SIZE ) temp[lindex] = in[gindex]; else temp[lindex] = 0;
if (threadIdx.x < RADIUS) {
if(gindex - RADIUS>=0 )temp[lindex - RADIUS] = in[gindex - RADIUS]; else temp[lindex - RADIUS] = 0;
if(gindex + BLOCK_SIZE < SIZE ) temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; else temp[lindex + BLOCK_SIZE] = 0;
}
// Synchronize (ensure all the data is available)
__syncthreads();
// Apply the stencil
int result = 0;
for (int offset = -RADIUS; offset <= RADIUS; offset++)
if( gindex < SIZE ) result += temp[lindex + offset];
// Store the result
if( gindex < SIZE ) out[gindex] = result;
}
int main()
{
cudaError_t cudaStat;
thrust::device_vector<int> dev_vec_inp(SIZE,1);
thrust::device_vector<int> dev_vec_out(SIZE);
try
{
stencil_1d<<< DimGrid, DimBlock >>>(thrust::raw_pointer_cast(dev_vec_inp.data()) , thrust::raw_pointer_cast(dev_vec_out.data()));
cudaStat = cudaGetLastError();
if (cudaStat != cudaSuccess)
throw cudaGetErrorString(cudaStat);
else std::cout<<"1D stencil has been executed successfully" << std:: endl;
}
catch(const char* e)
{
std::cout<< e;
}
}