1

I'm having trouble coalescing reads when using the float2 datatype in CUDA.

I've tried to make a simple example to run in the Visual Profiler, but it always reports non-coalesced reads. If anyone could shed some light on this I would be really grateful, thanks.

#include <stdio.h>
#include <cuda_runtime_api.h>

/*
 * Copies one float2 per thread from `in` to `out`, replacing the .x
 * component with 100.f and leaving .y as read from `in`.
 *
 * Launch layout: 1D grid of 1D blocks; each thread handles the element at
 * its flat global index. There is no bounds check, so both buffers must hold
 * at least gridDim.x * blockDim.x elements.
 */
__global__ void kernel(float2 *in, float2 *out) {
        const int gid = threadIdx.x + blockDim.x * blockIdx.x;

        // Read the whole float2 element, then patch only the first component.
        float2 element = in[gid];
        element.x = 100.f;

        // Write the whole element back in a single assignment.
        out[gid] = element;
}

/* Reports a failed CUDA runtime call on stderr; returns true on success. */
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

/*
 * Allocates two device buffers of dataSize float2 elements, runs `kernel`
 * once with one thread per element, and waits for it to finish.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel itself failed.
 */
int main() {
  const int dataSize=32;
  const size_t bytes=dataSize*sizeof(float2);
  float2 *in=NULL;
  float2 *out=NULL;

  // Zero the input so the kernel does not read uninitialized device memory.
  bool ok = cudaOk(cudaMalloc((void**)&in,bytes), "cudaMalloc(in)")
         && cudaOk(cudaMalloc((void**)&out,bytes), "cudaMalloc(out)")
         && cudaOk(cudaMemset(in,0,bytes), "cudaMemset(in)");

  if (ok) {
    // One thread per element: the kernel has no bounds check of its own.
    kernel<<<1,dataSize>>>(in,out);

    // Launch-configuration errors show up in cudaGetLastError(); faults during
    // execution only surface at the next synchronizing call. Synchronizing
    // also ensures the kernel has finished before the process exits.
    ok = cudaOk(cudaGetLastError(), "kernel launch")
      && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  }

  // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
  cudaFree(out);
  cudaFree(in);
  return ok ? 0 : 1;
}
Ljdawson
  • 12,091
  • 11
  • 45
  • 60

1 Answers1

1

I asked this question on the NVIDIA forums. It turns out that vector loads are not optimized in debug mode, which is why the profiler reports the reads as non-coalesced. (Source: NVIDIA forums.)

Ljdawson
  • 12,091
  • 11
  • 45
  • 60