
Following the padding trick described here, I tested the code below and got the corresponding profiling results. Bank conflicts were notably reduced, but some still remain.

// column-wise read: bank conflicts expected
__global__ void setRowReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}
// should be conflict free
__global__ void setRowReadColPad(int *out){
    __shared__ int tile[BDIMY][BDIMX + 1];  // BDIMX = BDIMY = 32
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

NCU profiling results:

  setRowReadCol(int*),  Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                                          994
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                                                            0
    ---------------------------------------------------------------------- --------------- ------------------------------

  setRowReadColPad(int*),  Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                                            2
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                                                            0
    ---------------------------------------------------------------------- --------------- ------------------------------

It seems that two conflicting load transactions still remain. Interestingly, when the padding was increased to 31, the bank conflicts disappeared completely:

  setRowReadColPad31(int*), Context 1, Stream 7
    Section: Command line profiler metrics
    ---------------------------------------------------------------------- --------------- ------------------------------
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum                                                            0
    l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum                                                            0
    ---------------------------------------------------------------------- --------------- ------------------------------
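
To sanity-check my expectation, here is a small host-side sketch of how I understand the bank mapping (assuming 32 banks of 4 bytes, so an int at linear offset i falls into bank i % 32). For one warp of the column-wise read, threadIdx.y is fixed and threadIdx.x runs 0..31, so thread tx loads tile[tx][ty], i.e. linear offset tx * (BDIMX + PAD) + ty. By this reasoning both the +1 and the +31 padding should be conflict free, which is why the two remaining conflicts surprise me:

#include <stdio.h>

#define BDIMX     32
#define NUM_BANKS 32
#define WARP_SIZE 32

// Worst-case number of threads in one warp that hit the same bank during the
// column-wise read (1 = conflict free, 32 = fully serialized).
static int worstCaseAccessesPerBank(int pad, int ty)
{
    int hits[NUM_BANKS] = {0};
    for (int tx = 0; tx < WARP_SIZE; ++tx) {
        int bank = (tx * (BDIMX + pad) + ty) % NUM_BANKS;
        ++hits[bank];
    }
    int worst = 0;
    for (int b = 0; b < NUM_BANKS; ++b)
        if (hits[b] > worst) worst = hits[b];
    return worst;
}

int main(void)
{
    int pads[] = {0, 1, 31};
    for (int i = 0; i < 3; ++i)
        printf("pad = %2d -> worst case %2d thread(s) per bank\n",
               pads[i], worstCaseAccessesPerBank(pads[i], /*ty=*/0));
    // prints: pad 0 -> 32, pad 1 -> 1, pad 31 -> 1
    return 0;
}
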

Could anyone explain this?

Full code:

#include <cuda_runtime.h>
#include <stdio.h>
#include <cstdlib>
#include <chrono>
#include <ctime>
#include <iostream>
#include <iomanip>
#include <stdarg.h>

#define CUDA_CHECK(call)                                      \
    do {                                                      \
        cudaError_t err = call;                               \
        if (err != cudaSuccess) {                             \
            std::cerr << "CUDA error: " << cudaGetErrorString(err) \
                      << " at " << __FILE__ << ":" << __LINE__ \
                      << std::endl;                           \
            exit(EXIT_FAILURE);                               \
        }                                                     \
    } while (0)


#define BDIMX 32
#define BDIMY 32

// conflict free
__global__ void setRowReadRow (int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.y][threadIdx.x] ;
}

// column-wise read: bank conflicts expected
__global__ void setRowReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

// should be zero?
__global__ void setRowReadColPad(int *out){
    __shared__ int tile[BDIMY][BDIMX + 1];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

__global__ void setRowReadColPad31(int *out) {
    __shared__ int tile[BDIMY][BDIMX + 31];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}

int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CUDA_CHECK(cudaSetDevice(dev));

    cudaSharedMemConfig pConfig;
    CUDA_CHECK(cudaDeviceGetSharedMemConfig(&pConfig));
    printf("with Bank Mode: %s ",
           pConfig == cudaSharedMemBankSizeFourByte ? "4-Byte" : "8-Byte");

    // set up array size (BDIMX x BDIMY = 32 x 32)
    int nx = BDIMX;
    int ny = BDIMY;

    bool iprintf = 0;

    if (argc > 1) iprintf = atoi(argv[1]);

    size_t nBytes = nx * ny * sizeof(int);

    // execution configuration
    dim3 block (BDIMX, BDIMY);
    dim3 grid  (1, 1);
    printf("<<< grid (%d,%d) block (%d,%d)>>>\n", grid.x, grid.y, block.x,
           block.y);

    // allocate device memory
    int *d_C;
    CUDA_CHECK(cudaMalloc((int**)&d_C, nBytes));
    int *gpuRef  = (int *)malloc(nBytes);

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadRow<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadCol<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadColPad<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadColPad31<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_C));
    free(gpuRef);
    return EXIT_SUCCESS;
}
    [This](https://forums.developer.nvidia.com/t/shared-memory-bank-conflicts-and-nsight-metric/115731/15) may be of interest: "There is not currently a hardware counter that only counts bank conflicts. Other arbitration conflicts that result in a replayed wavefront are included. Summing L1 Wavefronts Shared Excessive on the Source View page is the best method to only count bank conflicts. " – Robert Crovella Aug 13 '23 at 21:22
