How does CUB's TexRefInputIterator work?

Question

CUB provides an iterator for texture references, the implementation of which is readily accessible.

Since I couldn't figure out how to implement template-able texture references myself - they "can only be declared as a static global variable" - I am now trying to understand how it's done in CUB. But some of it is beyond my C++ knowledge, and I haven't been able to find the answers elsewhere (then again, I don't really know what to search for).

Specifically:

Is the unnamed namespace surrounding IteratorTexRef significant? I can only think that it is to limit IteratorTexRef::TexId::ref to file/translation unit scope.

What is the purpose of IteratorTexRef? It only wraps TexId, but removing it results in unintelligible (to me) compile-time errors.

This code, a stripped-down version of the linked-to implementation, compiles and runs:

#include <thrust/device_vector.h>

namespace {

template <typename T>
struct IteratorTexRef
{
    template <int UNIQUE_ID>
    struct TexId
    {
        // Assume T is a valid texture word size.
        typedef texture<T> TexRef;

        static TexRef ref;

        static __device__ T fetch(ptrdiff_t offset)
        {
            return tex1Dfetch(ref, offset);
        }
    };
};

template <typename  T>
template <int UNIQUE_ID>
typename IteratorTexRef<T>:: template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>:: template TexId<UNIQUE_ID>::ref;

} // Anomymous namespace

template <typename T, int UNIQUE_ID = 0>
class TextureRefIterator
{
private:
    typedef typename IteratorTexRef<T>:: template TexId<UNIQUE_ID> TexId;
    ptrdiff_t tex_offset;

public:
    __device__ T operator[](int i) const
    {
        return TexId::fetch(this->tex_offset + i);
    }

    cudaError_t bind(
        const T* const ptr,
        size_t bytes = size_t(-1))
    {
        size_t offset;
        cudaError_t state = cudaBindTexture(&offset, TexId::ref, ptr, bytes);
        this->tex_offset = (ptrdiff_t) (offset / sizeof(T));
        return state;
    }
};

template <typename TexIter>
__global__ void kernel(TexIter iter)
{
    int a = iter[threadIdx.x];
    printf("tid %d, a %d\n", threadIdx.x, a);
}

template <typename T>
void launch_kernel(T* d_in)
{
    TextureRefIterator<T> tex_iter;
    tex_iter.bind(d_in);

    kernel<<<1, 32>>>(tex_iter);
}

int main()
{
    thrust::device_vector<float> d_in(32, 1);
    launch_kernel(thrust::raw_pointer_cast(d_in.data()));
}

The closest I got was something similar to the below, based on how one would normally access a static template member. For clarity, the below simply eliminates IteratorTexRef from the above:

#include <thrust/device_vector.h>

namespace {

template <typename T, int UNIQUE_ID>
struct TexId
{
    // Assume T is a valid texture word size.
    typedef texture<T> TexRef;

    static TexRef ref;

    static __device__ T fetch(ptrdiff_t offset)
    {
        return tex1Dfetch(ref, offset);
    }
};

template <typename  T, int UNIQUE_ID>
typename TexId<T, UNIQUE_ID>::TexRef TexId<T, UNIQUE_ID>::ref;


} // Anonymous namespace

template <typename T, int UNIQUE_ID = 0>
class TextureRefIterator
{
private:
    typedef TexId<T, UNIQUE_ID> TexId;
    ptrdiff_t tex_offset;

public:
    __device__ T operator[](int i) const
    {
        return TexId::fetch(this->tex_offset + i);
    }

    cudaError_t bind(
        const T* const ptr,
        size_t bytes = size_t(-1))
    {
        size_t offset;
        cudaError_t state = cudaBindTexture(&offset, TexId::ref, ptr, bytes);
        this->tex_offset = (ptrdiff_t) (offset / sizeof(T));
        return state;
    }
};

template <typename TexIter>
__global__ void kernel(TexIter iter)
{
    int a = iter[0];
    printf("tid %d, a %d\n", threadIdx.x, a);
}

template <typename T>
void launch_kernel(T* d_in)
{
    TextureRefIterator<T> tex_iter;
    tex_iter.bind(d_in);

    kernel<<<1, 32>>>(tex_iter);
}

int main()
{
    thrust::device_vector<float> d_in(32, 1);
    launch_kernel(thrust::raw_pointer_cast(d_in.data()));
}

It gives these somewhat esoteric compile-time errors. (Compiled with nvcc iter.cu and CUDA 7.0):

In file included from tmpxft_000057d4_00000000-4_test2.cudafe1.stub.c:1:0:
/tmp/tmpxft_000057d4_00000000-4_test2.cudafe1.stub.c:30:3737: error: macro "__text_var" passed 3 arguments, but takes just 2
 dIfLi0EE3refE,::_NV_ANON_NAMESPACE::TexId<float, (int)0> ::ref), 1, 0, 0);__cudaReg
                                                                         ^
/tmp/tmpxft_000057d4_00000000-4_test2.cudafe1.stub.c:30:1: error: macro "__device__text_var" passed 3 arguments, but takes just 2
 static void __nv_cudaEntityRegisterCallback(void **__T2202){__nv_dummy_param_ref(__
 ^
/tmp/tmpxft_000057d4_00000000-4_test2.cudafe1.stub.c:30:1: error: macro "__name__text_var" passed 3 arguments, but takes just 2

stk · Accepted Answer · 2021-02-11T22:47:18.080

1

That compile error is due to the generated code using macros that contain template types so the commas in the templates make the preprocessor think that that they are more arguments. I fixed this by patching the crt/host_runtime header and making the cpp parameter of those macros (__text_var, __device__text_var, and __name__text_var) variadic. In other words, replace cpp with cpp....

edited Feb 11 '21 at 22:47

answered Feb 22 '16 at 19:32

stk

26
3

So I'm clear I understand this: the compile-time error is _not_ generated when implementing `IteratorTexRef` as CUB does (my first example), because both `IteratorTexRef` and `TexId` take only one template parameter, and hence there are no extra commas for the preprocessor to interpret as argument separators? – Sam Feb 23 '16 at 13:29
Yes that is the case. – stk Feb 23 '16 at 15:51

How does CUB's TexRefInputIterator work?

1 Answers1