5

I have a linear array of unsigned chars representing a 2D array. I would like to place it into a CUDA 2D texture and perform (floating point) linear interpolation on it, i.e., have the texture call fetch the 4 nearest unsigned char neighbors, internally convert them to float, interpolate between them, and return the resulting floating point value.

I am having some difficulty setting up the texture and binding it to a texture reference. I have been through the CUDA reference manual & appendices, but I'm just not having any luck.

Below is runnable code to set up and bind 1) a floating point texture and 2) an unsigned char texture. The floating point code runs just fine. However, if you uncomment the two commented unsigned char lines toward the bottom, an "invalid argument" error is thrown.

#include <cstdio>
#include <cuda_runtime.h>

typedef unsigned char uchar;

// Define (global) texture references; must use "cudaReadModeNormalizedFloat"
// for ordinal textures
texture<float, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefFloat;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> texRefUChar;

// Define size of (row major) textures
size_t const WIDTH  = 1000;
size_t const HEIGHT = 1000;
size_t const TOT_PIX = WIDTH*HEIGHT;

int main(void)
{
   // Set texel formats
   cudaChannelFormatDesc descFloat = cudaCreateChannelDesc<float>();
   cudaChannelFormatDesc descUChar = cudaCreateChannelDesc<uchar>();

   // Choose to perform texture 2D linear interpolation
   texRefFloat.filterMode = cudaFilterModeLinear;
   texRefUChar.filterMode = cudaFilterModeLinear;

   // Allocate texture device memory
   float * d_buffFloat; cudaMalloc(&d_buffFloat, sizeof(float)*TOT_PIX);
   uchar * d_buffUChar; cudaMalloc(&d_buffUChar, sizeof(uchar)*TOT_PIX);

   // Bind texture references to textures
   cudaError_t errFloat = cudaSuccess;
   cudaError_t errUChar = cudaSuccess;

   errFloat = cudaBindTexture2D(0, texRefFloat, d_buffFloat, descFloat,
                  WIDTH, HEIGHT, sizeof(float)*WIDTH);
   // Uncomment the following two lines for an error
   //errUChar = cudaBindTexture2D(0, texRefUChar, d_buffUChar, descUChar,
   //               WIDTH, HEIGHT, sizeof(uchar)*WIDTH);

   // Check for errors during binding
   if (errFloat != cudaSuccess)
   {
      printf("Error binding float texture reference: %s\n",
          cudaGetErrorString(errFloat));
      exit(-1);
   }

   if (errUChar != cudaSuccess)
   {
      printf("Error binding unsigned char texture reference: %s\n",
          cudaGetErrorString(errUChar));
      exit(-1);
   }

   return 0;
}

Any help/insight would be most appreciated!

Aaron

Jammy
  • 303
  • 2
  • 14
  • 3
    Each row of a texture must be properly aligned. This cannot be guaranteed in general if you bind the texture to a plain array (as opposed to a CUDA array). To bind plain memory to a 2D texture, you would want to allocate the memory with cudaMallocPitch(). This sets the row pitch such that it is suitable for binding to a texture. Note that it is not good practice to pass `0` as the first argument to a texture binding API call. This argument is for CUDA to return an offset to the app. If the offset is non-zero you will need to add it to the texture coordinate during texture access. – njuffa Jun 12 '13 at 21:54

1 Answers1

8

Each row of a texture must be properly aligned. This cannot be guaranteed in general if you bind the texture to a plain array (as opposed to a CUDA array). To bind plain memory to a 2D texture, you would want to allocate the memory with cudaMallocPitch(). This sets the row pitch such that it is suitable for binding to a texture. Note that it is not good practice to pass 0 as the first argument to a texture binding API call. This argument is for CUDA to return an offset to the app. If the offset is non-zero you will need to add it to the texture coordinate during texture access.

Here is a quick example that shows how to read interpolated values from a texture whose elements are unsigned char.

#include <stdlib.h>
#include <stdio.h>

// Macro to catch CUDA errors in CUDA runtime calls
#define CUDA_SAFE_CALL(call)                                          \
do {                                                                  \
    cudaError_t err = call;                                           \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)
// Macro to catch CUDA errors in kernel launches
#define CHECK_LAUNCH_ERROR()                                          \
do {                                                                  \
    /* Check synchronous errors, i.e. pre-launch */                   \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString(err) );       \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
    /* Check asynchronous errors, i.e. kernel failed (ULF) */         \
    err = cudaThreadSynchronize();                                    \
    if (cudaSuccess != err) {                                         \
        fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                 __FILE__, __LINE__, cudaGetErrorString( err) );      \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex;

__global__ void kernel (int m, int n, float shift_x, float shift_y) 
{
    float val;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            val = tex2D (tex, col+0.5f+shift_x, row+0.5f+shift_y);
            printf ("%.2f  ", val);
        }
        printf ("\n");
    }
}

int main (void)
{
    int m = 4; // height = #rows
    int n = 3; // width  = #columns
    size_t pitch, tex_ofs;
    unsigned char arr[4][3]= {{11,12,13},{21,22,23},{31,32,33},{251,252,253}};
    unsigned char *arr_d = 0;

    CUDA_SAFE_CALL(cudaMallocPitch((void**)&arr_d,&pitch,n*sizeof(*arr_d),m));
    CUDA_SAFE_CALL(cudaMemcpy2D(arr_d, pitch, arr, n*sizeof(arr[0][0]),
                                n*sizeof(arr[0][0]),m,cudaMemcpyHostToDevice));
    tex.normalized = false;
    tex.filterMode = cudaFilterModeLinear;
    CUDA_SAFE_CALL (cudaBindTexture2D (&tex_ofs, &tex, arr_d, &tex.channelDesc,
                                       n, m, pitch));
    if (tex_ofs !=0) {
        printf ("tex_ofs = %zu\n", tex_ofs);
        return EXIT_FAILURE;
    }
    printf ("reading array straight\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in x-direction\n");
    kernel<<<1,1>>>(m, n, 0.5f, 0.0f);
    CHECK_LAUNCH_ERROR();
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    printf ("reading array shifted in y-direction\n");
    kernel<<<1,1>>>(m, n, 0.0f, 0.5f);
    CUDA_SAFE_CALL (cudaDeviceSynchronize());
    CUDA_SAFE_CALL (cudaFree (arr_d));
    return EXIT_SUCCESS;
}

The output of this program is as follows:

reading array straight
0.04  0.05  0.05
0.08  0.09  0.09
0.12  0.13  0.13
0.98  0.99  0.99
reading array shifted in x-direction
0.05  0.05  0.05
0.08  0.09  0.09
0.12  0.13  0.13
0.99  0.99  0.99
reading array shifted in y-direction
0.06  0.07  0.07
0.10  0.11  0.11
0.55  0.56  0.56
0.98  0.99  0.99
njuffa
  • 23,970
  • 4
  • 78
  • 130
  • Wow! Full working code to illustrate your answer! Thanks so much! So is it just a fluke that one can setup float textures without cudaMallocPitch()? – Jammy Jun 13 '13 at 12:04
  • 1
    I don't know what the alignment requirement is for the rows of a texture, but it is possible to satisfy it by accident when simply using `cudaMalloc()`. Assume for the sake of argument that the required row aligment is 32 bytes. Then 1000 `float` per row = 4000 bytes which divides evenly by 32, since 32 * 125 = 4000. But 1000 `unsigned char` = 1000 bytes, which doesn't divide evenly by 32. – njuffa Jun 13 '13 at 16:12
  • On Pascal architecture, texture row alignment requirement is 512-byte boundary. That number is smaller on some earlier architectures, but never as small as 32-bytes. (The 32-byte number is the smallest alignment where coalesced memory accesses are possible.) – Jeff Benshetler Mar 14 '19 at 15:28