
I have a kernel that takes several arrays as input. To improve readability it would be nice to group them into a struct and, after proper memory allocation and copying for each input, pass the struct to the kernel instead of the long list of pointers.

Is it going to be the same in the two cases, memory-wise, when accessing the arrays inside the kernel?

Can anyone recommend some documentation on this topic? (I couldn't find it in the programming guide.)
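
To make it concrete, here is a rough sketch of the two variants I have in mind (the names are just placeholders):

// Current version: a long list of device pointers.
__global__ void process(const float* in1, const float* in2, float* out, int n);

// What I would like instead: group the pointers in a struct
// and pass the struct to the kernel by value.
struct KernelArgs
{
    const float* in1;
    const float* in2;
    float* out;
    int n;
};

__global__ void process_packed(KernelArgs args);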

  • I think passing kernel arguments as structs is quite common, I personally cannot think of any downside... – geebert Nov 05 '21 at 11:12
  • [this](https://stackoverflow.com/questions/36655853/do-nvcc-gcc-clang-and-msvc-respect-the-restrict-keyword-within-structs) may be of interest – Robert Crovella Nov 11 '21 at 02:17

1 Answer


No, there should be no difference. You can read the PTX output to make sure. Here is a simple example:


// Plain struct of device pointers.
struct Foo
{
    int* a, *b, *c;
};

// Struct passed to the kernel by value.
__global__ void bar(Foo f)
{ f.a[0] = f.b[0] + f.c[0]; }

// Same operation with the pointers passed individually.
__global__ void baz(int* a, int* b, int* c)
{ a[0] = b[0] + c[0]; }

// Same struct, but with a user-provided (empty) destructor,
// to check whether a non-trivial type changes anything.
struct Quz
{
    int* a, *b, *c;

    ~Quz() {}
};

__global__ void quuz(Quz f)
{ f.a[0] = f.b[0] + f.c[0]; }
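
If you want to reproduce this yourself, the PTX can be dumped with something along these lines (the file name is just an example):

nvcc -ptx kernels.cu -o kernels.ptx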

And here is the PTX assembly. Note how there is basically no difference between the three functions.

.visible .entry _Z3bar3Foo(
    .param .align 8 .b8 _Z3bar3Foo_param_0[24]
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;

    ld.param.u64    %rd1, [_Z3bar3Foo_param_0+16];
    ld.param.u64    %rd2, [_Z3bar3Foo_param_0+8];
    ld.param.u64    %rd3, [_Z3bar3Foo_param_0];
    cvta.to.global.u64  %rd4, %rd3;
    cvta.to.global.u64  %rd5, %rd2;
    cvta.to.global.u64  %rd6, %rd1;
    ld.global.u32   %r1, [%rd5];
    ld.global.u32   %r2, [%rd6];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

.visible .entry _Z3bazPiS_S_(
    .param .u64 _Z3bazPiS_S__param_0,
    .param .u64 _Z3bazPiS_S__param_1,
    .param .u64 _Z3bazPiS_S__param_2
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;


    ld.param.u64    %rd1, [_Z3bazPiS_S__param_0];
    ld.param.u64    %rd2, [_Z3bazPiS_S__param_1];
    ld.param.u64    %rd3, [_Z3bazPiS_S__param_2];
    cvta.to.global.u64  %rd4, %rd1;
    cvta.to.global.u64  %rd5, %rd3;
    cvta.to.global.u64  %rd6, %rd2;
    ld.global.u32   %r1, [%rd6];
    ld.global.u32   %r2, [%rd5];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

.visible .entry _Z4quuz3Quz(
    .param .align 8 .b8 _Z4quuz3Quz_param_0[24]
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;


    ld.param.u64    %rd1, [_Z4quuz3Quz_param_0+16];
    ld.param.u64    %rd2, [_Z4quuz3Quz_param_0+8];
    ld.param.u64    %rd3, [_Z4quuz3Quz_param_0];
    cvta.to.global.u64  %rd4, %rd3;
    cvta.to.global.u64  %rd5, %rd2;
    cvta.to.global.u64  %rd6, %rd1;
    ld.global.u32   %r1, [%rd5];
    ld.global.u32   %r2, [%rd6];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

It all works the same because CUDA places all kernel arguments in constant memory and accesses them through dedicated parameter loads (the ld.param instructions above) that go through the constant cache.
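
For completeness, a minimal host-side sketch (error checking omitted) of how the Foo/bar pair above might be used; at launch, the three device pointers are simply copied into the 24-byte parameter buffer that the ld.param instructions read from:

int main()
{
    Foo f;
    cudaMalloc((void**)&f.a, sizeof(int));
    cudaMalloc((void**)&f.b, sizeof(int));
    cudaMalloc((void**)&f.c, sizeof(int));
    cudaMemset(f.b, 0, sizeof(int));
    cudaMemset(f.c, 0, sizeof(int));

    bar<<<1, 1>>>(f);   // struct passed by value, like any other argument
    cudaDeviceSynchronize();

    cudaFree(f.a);
    cudaFree(f.b);
    cudaFree(f.c);
    return 0;
}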
