3D boxcar filter in CUDA using shared memory

Question

I am trying to see if the usage of shared memory for the problem in object can improve the execution time and result in some speedup:

Kernel function without using shared memory:

__global__ void  3dc(const int nx, const int ny, const int nz, const float* in1, 
    const float* in2, const float* in3, const float* in4, float* out)
{
    int i, j, k;

    int tidx = threadIdx.x + blockIdx.x*blockDim.x;

    if(tidx < (nx)*(ny)*(nz)){
        k = tidx/((nx)*(ny));
        j = (tidx - k*(nx)*(ny))/(nx);
        i = tidx - k*(nx)*(ny) - j*(nx);

        out[i + nx*j + nx*ny*k] = 
            in1[i     + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*j     + nx*ny*k    ]+
            in1[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*(j+1) + nx*ny*k    ]+
            in1[i     + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*j     + nx*ny*(k+1)]+
            in1[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in1[i     + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*j     + nx*ny*k    ]+
            in2[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*(j+1) + nx*ny*k    ]+
            in2[i     + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*j     + nx*ny*(k+1)]+
            in2[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in2[i     + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*j     + nx*ny*k    ]+
            in3[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*(j+1) + nx*ny*k    ]+
            in3[i     + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*j     + nx*ny*(k+1)]+
            in3[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in3[i     + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*j     + nx*ny*k    ]+
            in4[(i+1) + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*(j+1) + nx*ny*k    ]+
            in4[i     + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*j     + nx*ny*(k+1)]+
            in4[(i+1) + nx*(j+1) + nx*ny*(k+1)]+
            in4[i     + nx*(j+1) + nx*ny*(k+1)];
    } 
} // 3dc

Kernel function using shared memory:

__global__ void 3d_shared_memory(const int nx, const int ny, const int nz, const float* in1, const float* in2, const float* in3, const float* in4, float* out){
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    __shared__ float smem1[16][16][4];
    __shared__ float smem2[16][16][4];
    __shared__ float smem3[16][16][4];
    __shared__ float smem4[16][16][4];

    if ((idx < nx) && (idy < ny) && (idz < nz)){
        smem1[threadIdx.x][threadIdx.y][threadIdx.z] = in1[idz * nx * ny + idy * nx + idx];
        smem2[threadIdx.x][threadIdx.y][threadIdx.z] = in2[idz * nx * ny + idy * nx + idx];
        smem3[threadIdx.x][threadIdx.y][threadIdx.z] = in3[idz * nx * ny + idy * nx + idx];
        smem4[threadIdx.x][threadIdx.y][threadIdx.z] = in4[idz * nx * ny + idy * nx + idx];                        
        __syncthreads();

        for(int k = 0; k < 3; k++){
            for(int j = 0; j < 15; j++){
                for(int i = 0; i < 15; i++){
                    out[idz * nx * ny + idy * nx + idx] = smem1[i][j][k] + smem1[i+1][j][k] + smem1[i+1][j+1][k] + smem1[i][j+1][k] + smem1[i][j][k+1] + smem1[i+1][j][k+1] + smem1[i+1][j+1][k+1] + smem1[i][j+1][k+1] +
                        smem2[i][j][k] + smem2[i+1][j][k] + smem2[i+1][j+1][k] + smem2[i][j+1][k] + smem2[i][j][k+1] + smem2[i+1][j][k+1] + smem2[i+1][j+1][k+1] + smem2[i][j+1][k+1] +
                        smem3[i][j][k] + smem3[i+1][j][k] + smem3[i+1][j+1][k] + smem3[i][j+1][k] + smem3[i][j][k+1] + smem3[i+1][j][k+1] + smem3[i+1][j+1][k+1] + smem3[i][j+1][k+1] +
                        smem4[i][j][k] + smem4[i+1][j][k] + smem4[i+1][j+1][k] + smem4[i][j+1][k] + smem4[i][j][k+1] + smem4[i+1][j][k+1] + smem4[i+1][j+1][k+1] + smem4[i][j+1][k+1];
                }
            }
        }

    }

} //3d_shared_memory example

The shared memory code is always slower. Is there a better way to exploit shared memory for this problem? Thanks in advance for suggestions.

Are you sure it isn't occupancy that causes the slow down? The shared memory version is using 16kb per block. Depending on what GPU you have, that might limited you to as little as 1 block per MP. — talonmies, Jun 28 '12 at 15:54
@talonmies I am using a Tesla C2070. for a 2 million case, with nx = 256, ny = 256 and nz = 32, and thread combinations of 16x16x4, the shared memory example is slower by 10x. Although I can use upto 48KB, I am limiting the use of shared memory to 16KB so that I can use {-Xptxas -dlcm=ca} to switch the size of shared memory to 16KB and L1 cache to 48KB. — cuda_hpc80, Jun 28 '12 at 19:54
First, I think you need to configure the L1 configuration using the runtime API, not a compiler flag. -dlcm=ca just forces all global accesses to go through L1, but I don't think it configures the cache size. If you were configuring the cache to 48KB and SMEM to 16KB, and you are using 16KB of SMEM per block, then I agree with @talonmies you are probably latency bound due to maxing out occupancy with a single thread block. — harrism, Jun 29 '12 at 04:12
@harrism I thought `-Xptxas -dlcm=ca` configures SMEM size as 16KB. Is this a wrong assumption? How is this configuration done during runtime? Does this mean that every kernel can be configured to run with different SMEM sizes (16 or 48 KB)?? — cuda_hpc80, Jun 29 '12 at 12:14
`-dlcm=ca` just specifies that all global accesses should be cached in L1 and L2 ("cache all"). `-dlcm=cg` means cache in L2 only. I'm pretty sure you can't configure the cache *size* at compile time. Note you can use inline PTX to generate specific types of cached accesses on a per-load/store basis, if you are so inclined. Yes, you can configure the cache size per kernel at run time, using `cudaFuncSetCacheConfig()`. See the CUDA Programming guide, Appendix F. — harrism, Jun 29 '12 at 12:53

score 3 · Answer 1 · edited May 23 '17 at 11:44

I'm providing a late answer to this post to remove it from the unanswered list.

You are basically implementing a boxcar filter in 3D using shared memory. Besides those already mentioned in the comments above, I see two possible reasons why you are not experiencing a speedup when using shared memory:

The shared memory loads and stores are not coalesced;
You are not considering the case when significant thread collaboration is needed, as the boxcar size is 2.

Below, I'm providing a code to compare the case of use of global memory only and of shared memory. The code is a modification of the code posted by Robert Crovella at 3d CUDA kernel indexing for image filtering?.

Results from this code, for DATASIZE_X x DATASIZE_Y x DATASIZE_Z = 1024 x 1024 x 64:

GT 540M case

BOXCAR_SIZE            GLOBAL            SHARED
     2                  360ms             342ms
     4                 1292ms             583ms
     6                 3675ms            1166ms

Kepler K20c case

BOXCAR_SIZE            GLOBAL            SHARED
     2                    8ms              16ms
     4                   40ms              33ms
     6                  142ms             102ms

The code:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BOXCAR_SIZE 6

#define DATASIZE_X 1024
#define DATASIZE_Y 1024
#define DATASIZE_Z 64

#define BLOCKSIZE_X 8
#define BLOCKSIZE_Y 8
#define BLOCKSIZE_Z 8

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*****************************/
/* BOXCAR WITH SHARED MEMORY */
/*****************************/
__global__ void boxcar_shared(int* __restrict__ output, const int* __restrict__ input)
{
    __shared__ int smem[(BLOCKSIZE_Z + (BOXCAR_SIZE-1))][(BLOCKSIZE_Y + (BOXCAR_SIZE-1))][(BLOCKSIZE_X + (BOXCAR_SIZE-1))];

    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    if ((idx < (DATASIZE_X+BOXCAR_SIZE-1)) && (idy < (DATASIZE_Y+BOXCAR_SIZE-1)) && (idz < (DATASIZE_Z+BOXCAR_SIZE-1))){

        smem[threadIdx.z][threadIdx.y][threadIdx.x]=input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (idz < DATASIZE_Z))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y][threadIdx.x] = input[(idz + (BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (idy < DATASIZE_Y))
        smem[threadIdx.z][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idx < DATASIZE_X))
        smem[threadIdx.z][threadIdx.y][threadIdx.x + (BOXCAR_SIZE-1)] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idy < DATASIZE_Y))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + idx];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idx < DATASIZE_X))
        smem[threadIdx.z + (BOXCAR_SIZE-1)][threadIdx.y][threadIdx.x + (BOXCAR_SIZE-1)] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + idy*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idy < DATASIZE_Y) && (idx < DATASIZE_X))
        smem[threadIdx.z][threadIdx.y + (BOXCAR_SIZE-1)][threadIdx.x + (BOXCAR_SIZE-1)] = input[idz*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];

    if ((threadIdx.z > (BLOCKSIZE_Z - BOXCAR_SIZE)) && (threadIdx.y > (BLOCKSIZE_Y - BOXCAR_SIZE)) && (threadIdx.x > (BLOCKSIZE_X - BOXCAR_SIZE)) && (idz < DATASIZE_Z) && (idy < DATASIZE_Y) && (idx < DATASIZE_X))
        smem[threadIdx.z+(BOXCAR_SIZE-1)][threadIdx.y+(BOXCAR_SIZE-1)][threadIdx.x+(BOXCAR_SIZE-1)] = input[(idz+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (idy+(BOXCAR_SIZE-1))*(DATASIZE_X+BOXCAR_SIZE-1) + (idx+(BOXCAR_SIZE-1))];
}

    __syncthreads();

    if ((idx < DATASIZE_X) && (idy < DATASIZE_Y) && (idz < DATASIZE_Z)){

        int temp = 0;

        for (int i=0; i<BOXCAR_SIZE; i++)
            for (int j=0; j<BOXCAR_SIZE; j++)
                for (int k=0; k<BOXCAR_SIZE; k++)
                    temp = temp + smem[threadIdx.z + i][threadIdx.y + j][threadIdx.x + k];

        output[idz*DATASIZE_X*DATASIZE_Y + idy*DATASIZE_X + idx] = temp;
    }
}

/********************************/
/* BOXCAR WITHOUT SHARED MEMORY */
/********************************/
__global__ void boxcar(int* __restrict__ output, const int* __restrict__ input)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    int idy = blockIdx.y*blockDim.y + threadIdx.y;
    int idz = blockIdx.z*blockDim.z + threadIdx.z;

    if ((idx < DATASIZE_X) && (idy < DATASIZE_Y) && (idz < DATASIZE_Z)){

        int temp = 0;
        for (int i=0; i<BOXCAR_SIZE; i++)
            for (int j=0; j<BOXCAR_SIZE; j++)
                for (int k=0; k<BOXCAR_SIZE; k++)
                    temp = temp + input[(k+idz)*(DATASIZE_X+BOXCAR_SIZE-1)*(DATASIZE_Y+BOXCAR_SIZE-1) + (j+idy)*(DATASIZE_X+BOXCAR_SIZE-1) + (i+idx)];

        output[idz*DATASIZE_X*DATASIZE_Y + idy*DATASIZE_X + idx] = temp;
    }
}

/********/
/* MAIN */
/********/
int main(void)
{
    int i, j, k, u, v, w, temp;

    // --- these are just for timing
    clock_t t0, t1, t2, t3;
    double t1sum=0.0f;
    double t2sum=0.0f;
    double t3sum=0.0f;

    const int nx = DATASIZE_X;
    const int ny = DATASIZE_Y;
    const int nz = DATASIZE_Z;

    const int wx = BOXCAR_SIZE;
    const int wy = BOXCAR_SIZE;
    const int wz = BOXCAR_SIZE;

    // --- start timing
    t0 = clock();

    // --- CPU memory allocations
    int *input, *output, *ref_output; 
    if ((input  = (int*)malloc(((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int))) == 0)    { fprintf(stderr, "malloc Fail \n"); return 1; }
    if ((output = (int*)malloc((nx*ny*nz)*sizeof(int))) == 0)                               { fprintf(stderr, "malloc Fail \n"); return 1; }
    if ((ref_output = (int*)malloc((nx*ny*nz)*sizeof(int))) == 0)                               { fprintf(stderr, "malloc Fail \n"); return 1; }

    // --- Data generation
    srand(time(NULL));
    for(int i=0; i<(nz+(wz-1)); i++)
        for(int j=0; j<(ny+(wy-1)); j++)
            for (int k=0; k<(nx+(wx-1)); k++)
                input[i*(ny+(wy-1))*(nx+(wx-1))+j*(nx+(wx-1))+k] = rand(); 

    t1 = clock();

    // --- Allocate GPU space for data and results
    int *d_output, *d_input;  // storage for input
    gpuErrchk(cudaMalloc((void**)&d_input, (((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int))));
    gpuErrchk(cudaMalloc((void**)&d_output, ((nx*ny*nz)*sizeof(int))));

    // --- Copy data from GPU to CPU
    gpuErrchk(cudaMemcpy(d_input, input, (((nx+(wx-1))*(ny+(wy-1))*(nz+(wz-1)))*sizeof(int)), cudaMemcpyHostToDevice));

    const dim3 blockSize(BLOCKSIZE_X, BLOCKSIZE_Y, BLOCKSIZE_Z);
    const dim3 gridSize(((DATASIZE_X+BLOCKSIZE_X-1)/BLOCKSIZE_X), ((DATASIZE_Y+BLOCKSIZE_Y-1)/BLOCKSIZE_Y), ((DATASIZE_Z+BLOCKSIZE_Z-1)/BLOCKSIZE_Z));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    boxcar_shared<<<gridSize,blockSize>>>(d_output, d_input);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time:  %3.4f ms \n", time);

    // --- Copy result from GPU to CPU
    gpuErrchk(cudaMemcpy(output, d_output, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost));

    t2 = clock();
    t2sum = ((double)(t2-t1))/CLOCKS_PER_SEC;
    printf(" Device compute took %3.2f seconds.  Beginning host compute.\n", t2sum);

    // --- Host-side computations
    for (int u=0; u<nz; u++)
        for (int v=0; v<ny; v++)
            for (int w=0; w<nx; w++){
                temp = 0;
                for (int i=0; i<wz; i++)
                    for (int j=0; j<wy; j++)
                        for (int k=0; k<wx; k++)
                            temp = temp + input[(i+u)*(ny+(wy-1))*(nx+(wx-1))+(j+v)*(nx+(wx-1))+(k+w)];
                ref_output[u*ny*nx + v*nx + w] = temp;
            }

    t3 = clock();
    t3sum = ((double)(t3-t2))/CLOCKS_PER_SEC;
    printf(" Host compute took %3.2f seconds.  Comparing results.\n", t3sum);

    // --- Check CPU and GPU results
    for (int i=0; i<nz; i++)
        for (int j=0; j<ny; j++)
            for (int k=0; k<nx; k++)
                if (ref_output[i*ny*nx + j*nx + k] != output[i*ny*nx + j*nx + k]) {
                    printf("Mismatch at x= %d, y= %d, z= %d  Host= %d, Device = %d\n", i, j, k, ref_output[i*ny*nx + j*nx + k], output[i*ny*nx + j*nx + k]);
                    return 1;
                }
    printf("Results match!\n");

    // --- Freeing memory
    free(input);
    free(output);
    gpuErrchk(cudaFree(d_input));
    gpuErrchk(cudaFree(d_output));

    cudaDeviceReset();

    return 0;
}

3D boxcar filter in CUDA using shared memory

1 Answers1