0

I wrote the following code to see how to use texture memory for a 1D array, but the tex1D function is not fetching the value from the array for the corresponding thread id. Please correct this code and tell me how to use texture memory for a 1D array efficiently and effectively.

// Kernel: each thread samples the 1D texture `texObj` with tex1D<float> at the
// coordinate threadIdx.x and stores the fetched value in b[threadIdx.x].
//   b      - output buffer, written at index threadIdx.x (one element per thread)
//   texObj - texture object to sample
// NOTE(review): main() creates the texture with normalizedCoords = 1, yet the raw
// integer thread index is passed as the coordinate here — confirm which is intended.
__global__ void sum(float *b,cudaTextureObject_t texObj)

    {
    // Fetch at coordinate threadIdx.x and write to the matching output slot.
    b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
    // Disabled debug print of the value fetched with tex1Dfetch:
    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
    }
    // Host driver: fills a 5-element host array with 0..4, uploads it into a
    // CUDA array, wraps that array in a texture object, launches sum<<<1,5>>>
    // to sample the texture, copies the result back and prints 5 values.
    // Returns 0. None of the CUDA API return codes are checked.
    int main()
    {
    float *a,*b;
    // NOTE(review): d_a is declared but never used in this function.
    float *d_a,*d_b;
    int i;
    // Host buffers: a holds the input samples, b receives the kernel output.
    a=(float*)malloc(sizeof(float)*5);
    b=(float*)malloc(sizeof(float)*5);

    // Input data is simply 0, 1, 2, 3, 4.
    for(i=0;i<5;i++)
        a[i]=i;

    // Channel format: one 32-bit float component per element.
    cudaChannelFormatDesc channelDesc =cudaCreateChannelDesc(32, 0, 0, 0,cudaChannelFormatKindFloat);

    // Allocate a CUDA array of width 5 (height argument is 0).
    cudaArray* cuArray;
    cudaMallocArray(&cuArray, &channelDesc, 5, 0);

    // Upload the 5 host floats into the CUDA array starting at offset (0, 0).
    cudaMemcpyToArray(cuArray, 0, 0, a,sizeof(float)*5,cudaMemcpyHostToDevice);


    // Resource descriptor: the texture is backed by cuArray.
    struct cudaResourceDesc resDesc;
        memset(&resDesc, 0, sizeof(resDesc));
        resDesc.resType = cudaResourceTypeArray;
        resDesc.res.array.array = cuArray;


      // Texture descriptor: wrap addressing, linear filtering, element-type
      // reads, and normalized coordinates enabled.
      struct cudaTextureDesc texDesc;
        memset(&texDesc, 0, sizeof(texDesc));
        texDesc.addressMode[0]   = cudaAddressModeWrap;
        texDesc.addressMode[1]   = cudaAddressModeWrap;
        texDesc.filterMode       = cudaFilterModeLinear;
        texDesc.readMode         = cudaReadModeElementType;
        texDesc.normalizedCoords = 1;

        // Create texture object
        cudaTextureObject_t texObj = 0;
        cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);


    // Device output buffer with room for 5 floats.
    cudaMalloc(&d_b, 5* sizeof(float));

    // Launch one block of 5 threads; each thread writes one element of d_b.
    sum<<<1,5>>>(d_b,texObj);



        // Copy the result back to the host.
        // NOTE(review): the size argument is sizeof(float), i.e. a single element,
        // while the loop below prints 5 elements of b.
    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);

     // Print the 5 host-side result values, tab separated.
     for(i=0;i<5;i++)
        printf("%f\t",b[i]);
      // Release the texture object, the CUDA array and the device buffer.
      // NOTE(review): the host buffers a and b are not freed before returning.
      cudaDestroyTextureObject(texObj); 
    cudaFreeArray(cuArray);
    cudaFree(d_b);

        return 0;

    }
einpoklum
  • 118,144
  • 57
  • 340
  • 684
  • 1
    Welcome to Stack Overflow! While we can potentially point out obvious errors in your code, we are not a debugging service. Please consider reading some [basic debugging techniques](https://ericlippert.com/2014/03/05/how-to-debug-small-programs/) to help you to either solve the problem yourself, or narrow your problem down to something specific enough for this site. – Joe C Oct 01 '16 at 19:14

1 Answer

3

There are at least 2 issues:

  1. You are only copying back one float quantity from device to host at the end:

    cudaMemcpy(b,d_b,sizeof(float),cudaMemcpyDeviceToHost);
                     ^^^^^^^^^^^^^
    

    if you want to print 5 values, you should copy 5 values back:

    cudaMemcpy(b,d_b,5*sizeof(float),cudaMemcpyDeviceToHost);
    
  2. You have selected normalized coordinates:

    texDesc.normalizedCoords = 1;
    

    This means you should pass a floating-point coordinate between 0 and 1 as your index, not an integer coordinate from 0 to 4:

     b[threadIdx.x]=tex1D<float>(texObj,threadIdx.x);
                                        ^^^^^^^^^^^
    

    use something like this instead:

     b[threadIdx.x]=tex1D<float>(texObj, ((float)threadIdx.x/5.0f));
    

With those changes, I get sensible results. Here's a fully worked example:

$ cat t3.cu
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Samples the 1D texture at the normalized coordinate (threadIdx.x + 1) / 5
// and stores the fetched value in b[threadIdx.x].
//   b      - output buffer; must hold at least blockDim.x floats
//   texObj - texture object to sample with tex1D<float>
// Launch layout: a single 1D block; only threadIdx.x is used for indexing.
// The divisor 5.0f is hard-coded, so the texture is expected to hold 5 texels.
__global__ void sum(float *b,cudaTextureObject_t texObj)
{
    const unsigned int lane = threadIdx.x;

    // Normalized coordinate of this thread's sample point.
    const float coord = (float)(lane + 1) / 5.0f;

    b[lane] = tex1D<float>(texObj, coord);

    //printf("\n%f\n",tex1Dfetch<float>(texObj,threadIdx.x));
}


// Prints a readable message and terminates if a CUDA runtime call failed.
//   status - return code of the CUDA call
//   what   - short label identifying the failing call in the message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: uploads the values 0..4 into a 1D CUDA array, exposes it as a
// linearly filtered texture with normalized coordinates, launches sum() with
// one thread per midpoint between neighbouring texels, and prints the sampled
// values. Returns 0 on success; exits with EXIT_FAILURE on any error.
int main()
{
    // Must stay 5: the sum() kernel divides by a hard-coded 5.0f.
    const int numPoints  = 5;
    // One sample between each adjacent pair of data points.
    const int numSamples = numPoints - 1;

    float *a = (float*)malloc(sizeof(float) * numPoints);
    float *b = (float*)malloc(sizeof(float) * numSamples);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < numPoints; i++)
        a[i] = (float)i;

    // One 32-bit float component per texel.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    // Height 0 requests a 1D CUDA array.
    cudaArray *cuArray = NULL;
    checkCuda(cudaMallocArray(&cuArray, &channelDesc, numPoints, 0), "cudaMallocArray");

    // cudaMemcpyToArray is deprecated; copy the data as a single row instead.
    const size_t rowBytes = sizeof(float) * numPoints;
    checkCuda(cudaMemcpy2DToArray(cuArray, 0, 0, a, rowBytes, rowBytes, 1,
                                  cudaMemcpyHostToDevice),
              "cudaMemcpy2DToArray");

    // The texture is backed by the CUDA array.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // Wrap addressing, linear filtering, raw element reads, normalized coordinates.
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0]   = cudaAddressModeWrap;
    texDesc.addressMode[1]   = cudaAddressModeWrap;
    texDesc.filterMode       = cudaFilterModeLinear;
    texDesc.readMode         = cudaReadModeElementType;
    texDesc.normalizedCoords = 1;

    // Create texture object
    cudaTextureObject_t texObj = 0;
    checkCuda(cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL),
              "cudaCreateTextureObject");

    // Output buffer sized to exactly what the kernel writes.
    float *d_b = NULL;
    checkCuda(cudaMalloc(&d_b, numSamples * sizeof(float)), "cudaMalloc");

    sum<<<1, numSamples>>>(d_b, texObj);
    // A kernel launch returns no status; launch errors are picked up here.
    checkCuda(cudaGetLastError(), "sum kernel launch");

    // Blocking copy: also surfaces any error raised while the kernel ran.
    checkCuda(cudaMemcpy(b, d_b, numSamples * sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");

    for (int i = 0; i < numSamples; i++)
        printf("%f\t", b[i]);
    printf("\n");

    // Release device resources, then the host buffers.
    checkCuda(cudaDestroyTextureObject(texObj), "cudaDestroyTextureObject");
    checkCuda(cudaFreeArray(cuArray), "cudaFreeArray");
    checkCuda(cudaFree(d_b), "cudaFree");
    free(a);
    free(b);

    return 0;
}
$ nvcc -arch=sm_61 -o t3 t3.cu
$ cuda-memcheck ./t3
========= CUDA-MEMCHECK
0.500000        1.500000        2.500000        3.500000
========= ERROR SUMMARY: 0 errors
$

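Note that I did make some other changes. In particular, I've adjusted your sample points as well as the number of samples, choosing sample points that are linearly interpolated halfway between each adjacent pair of your 5 data points (0, 1, 2, 3, 4). This yields a total of 4 output values (0.5, 1.5, 2.5, 3.5), the midpoints between your 5 data points. (With linear filtering and normalized coordinates, texel i of an N-texel texture is centered at (i + 0.5)/N, so the coordinate (i + 1)/N falls exactly halfway between texels i and i + 1.)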

If you want to learn more about normalized coordinate indexing, this is covered in the programming guide as are other concepts such as border modes and the like. Furthermore, there are various CUDA sample codes that demonstrate proper use of textures.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257