Following the padding trick described here, I tested the code below and collected the corresponding profiling results. The conflicts were noticeably reduced, but a few still persist.
// load conflicts expected (column-major read)
__global__ void setRowReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;      // row-major store
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y]; // column-major load
}
// should be conflict free
__global__ void setRowReadColPad(int *out){
    __shared__ int tile[BDIMY][BDIMX + 1]; // BDIMX = BDIMY = 32
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}
Nsight Compute (ncu) profiling results:
setRowReadCol(int*), Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 994
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
---------------------------------------------------------------------- --------------- ------------------------------
setRowReadColPad(int*), Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 2
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
---------------------------------------------------------------------- --------------- ------------------------------
So two conflicting load transactions still remain. Interestingly, when I increased the padding to 31, the bank conflicts disappeared entirely:
setRowReadColPad31(int*), Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_ld.sum 0
l1tex__data_bank_conflicts_pipe_lsu_mem_shared_op_st.sum 0
---------------------------------------------------------------------- --------------- ------------------------------
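For reference, here is the bank mapping I am assuming: 32 banks of 4-byte words, so word i of the shared array lives in bank i % 32, and for the column read tile[threadIdx.x][threadIdx.y] thread x of a warp touches word x * (BDIMX + PAD) + y. A quick host-side sanity check (a throwaway helper written just for this post, not part of the benchmark above) suggests that a pad of 1 should already spread a warp across all 32 banks, exactly like a pad of 31:

// Host-only check (throwaway helper): expected bank indices for the column read.
#include <cstdio>
#include <set>

static int distinctBanks(int pad, int y)
{
    std::set<int> banks;
    for (int x = 0; x < 32; ++x)                 // one warp: threadIdx.x = 0..31
        banks.insert((x * (32 + pad) + y) % 32); // word index mod 32 = bank
    return (int)banks.size();                    // 32 means conflict free
}

int main()
{
    for (int pad : {0, 1, 31})
        printf("pad %2d -> %2d distinct banks\n", pad, distinctBanks(pad, 0));
    // prints: pad 0 -> 1, pad 1 -> 32, pad 31 -> 32
    return 0;
}

By that reasoning, both padded kernels should report zero load conflicts, so the remaining count of 2 for the pad-1 version is what puzzles me.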
Could anyone explain this?
Full code:
#include <cuda_runtime.h>
#include <stdio.h>
#include <cstdlib>
#include <chrono>
#include <ctime>
#include <iostream>
#include <iomanip>
#include <stdarg.h>
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = call;                                       \
        if (err != cudaSuccess) {                                     \
            std::cerr << "CUDA error: " << cudaGetErrorString(err)    \
                      << " at " << __FILE__ << ":" << __LINE__        \
                      << std::endl;                                   \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)
#define BDIMX 32
#define BDIMY 32
// conflict free
__global__ void setRowReadRow(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.y][threadIdx.x];
}
// load conflicts (column-major read)
__global__ void setRowReadCol(int *out){
    __shared__ int tile[BDIMY][BDIMX];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}
// should be zero?
__global__ void setRowReadColPad(int *out){
    __shared__ int tile[BDIMY][BDIMX + 1];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}
// padded by 31
__global__ void setRowReadColPad31(int *out) {
    __shared__ int tile[BDIMY][BDIMX + 31];
    unsigned int idx = threadIdx.y * blockDim.x + threadIdx.x;
    tile[threadIdx.y][threadIdx.x] = idx;
    __syncthreads();
    out[idx] = tile[threadIdx.x][threadIdx.y];
}
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CUDA_CHECK(cudaSetDevice(dev));

    cudaSharedMemConfig pConfig;
    CUDA_CHECK(cudaDeviceGetSharedMemConfig(&pConfig));
    printf("with Bank Mode:%s ", pConfig == cudaSharedMemBankSizeFourByte ? "4-Byte" : "8-Byte");

    // set up array size (32 x 32 = 1024 ints)
    int nx = BDIMX;
    int ny = BDIMY;

    bool iprintf = 0;
    if (argc > 1) iprintf = atoi(argv[1]);

    size_t nBytes = nx * ny * sizeof(int);

    // execution configuration
    dim3 block(BDIMX, BDIMY);
    dim3 grid(1, 1);
    printf("<<< grid (%d,%d) block (%d,%d)>>>\n", grid.x, grid.y, block.x, block.y);

    // allocate device memory
    int *d_C;
    CUDA_CHECK(cudaMalloc((int**)&d_C, nBytes));
    int *gpuRef = (int *)malloc(nBytes);

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadRow<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadCol<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadColPad<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_C, 0, nBytes));
    setRowReadColPad31<<<grid, block>>>(d_C);
    CUDA_CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_C));
    free(gpuRef);
    return EXIT_SUCCESS;
}