Is it possible to read from a CUDA texture using a floating point index directly, e.g. can I perform a texture fetch using tex.1d.v4.f32.f32
.
This appears to save two instructions when looking at the .ptx
files and this is reflected in an increased performance when benchmarking. However, the rather critical downside is that, while this appears to run without issue, it does not produce the desired results.
The code below demonstrates the issue:
#include "cuda.h"
#include <thrust/device_vector.h>
//create a global 1D texture of type float
texture<float, cudaTextureType1D, cudaReadModeElementType> tex;
//below is a hand rolled ptx texture lookup using tex.1d.v4.f32.f32
__device__
float tex_load(float idx)
{
float4 temp;
asm("tex.1d.v4.f32.f32 {%0, %1, %2, %3}, [tex, {%4}];" :
"=f"(temp.x), "=f"(temp.y), "=f"(temp.z), "=f"(temp.w) : "f"(idx));
return temp.x;
}
//Try to read from the texture using tex1Dfetch and the custom tex_load
__global__ void read(){
float x = tex1Dfetch(tex,0.0f);
float y = tex_load(0.0f);
printf("tex1Dfetch: %f tex_load: %f\n",x,y);
}
int main()
{
//create a vector of size 1 with the x[0]=3.14
thrust::device_vector<float> x(1,3.14);
float* x_ptr = thrust::raw_pointer_cast(&x[0]);
//bind the texture
cudaBindTexture(0, tex, x_ptr, sizeof(float));
//launch a single thread single block kernel
read<<<1,1>>>();
cudaUnbindTexture(tex);
return 0;
}
I've tried this on a couple of cards (K40, C2070) and with a couple of CUDA versions (6.0,7.0), but on all I get the same output:
tex1Dfetch: 3.140000 tex_load: 0.000000
Is this possible or am I barking up the wrong tree?