
I have a kernel that takes several arrays as input. To improve readability it would be nice to group them into a struct and, after proper memory allocation and copying for each input, pass the struct to the kernel instead of the long list of pointers.

Is it going to be the same in the two cases, memory-wise, when accessing the arrays inside the kernel?

Can anyone recommend some documentation on this topic? (I couldn't find it in the programming guide.)
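
To make it concrete, here is a rough sketch of the two variants I have in mind (the names are just placeholders):

// Current version: a long list of device pointers.
__global__ void process(const float* in1, const float* in2, float* out, int n);

// What I would like instead: group the pointers in a struct
// and pass the struct to the kernel by value.
struct KernelArgs
{
    const float* in1;
    const float* in2;
    float* out;
    int n;
};

__global__ void process_packed(KernelArgs args);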

  • I think passing kernel arguments as structs is quite common, I personally cannot think of any downside... – geebert Nov 05 '21 at 11:12
  • [this](https://stackoverflow.com/questions/36655853/do-nvcc-gcc-clang-and-msvc-respect-the-restrict-keyword-within-structs) may be of interest – Robert Crovella Nov 11 '21 at 02:17

1 Answer


No, there should be no difference. You can read the PTX output to make sure. Here is a simple example:


// Plain struct of device pointers.
struct Foo
{
    int* a, *b, *c;
};

// Struct passed to the kernel by value.
__global__ void bar(Foo f)
{ f.a[0] = f.b[0] + f.c[0]; }

// Same operation with the pointers passed individually.
__global__ void baz(int* a, int* b, int* c)
{ a[0] = b[0] + c[0]; }

// Same struct, but with a user-provided (empty) destructor,
// to check whether a non-trivial type changes anything.
struct Quz
{
    int* a, *b, *c;

    ~Quz() {}
};

__global__ void quuz(Quz f)
{ f.a[0] = f.b[0] + f.c[0]; }
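
If you want to reproduce this yourself, the PTX can be dumped with something along these lines (the file name is just an example):

nvcc -ptx kernels.cu -o kernels.ptx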

And here is the PTX assembly. Note how there is basically no difference between the three functions.

.visible .entry _Z3bar3Foo(
    .param .align 8 .b8 _Z3bar3Foo_param_0[24]
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;

    ld.param.u64    %rd1, [_Z3bar3Foo_param_0+16];
    ld.param.u64    %rd2, [_Z3bar3Foo_param_0+8];
    ld.param.u64    %rd3, [_Z3bar3Foo_param_0];
    cvta.to.global.u64  %rd4, %rd3;
    cvta.to.global.u64  %rd5, %rd2;
    cvta.to.global.u64  %rd6, %rd1;
    ld.global.u32   %r1, [%rd5];
    ld.global.u32   %r2, [%rd6];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

.visible .entry _Z3bazPiS_S_(
    .param .u64 _Z3bazPiS_S__param_0,
    .param .u64 _Z3bazPiS_S__param_1,
    .param .u64 _Z3bazPiS_S__param_2
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;


    ld.param.u64    %rd1, [_Z3bazPiS_S__param_0];
    ld.param.u64    %rd2, [_Z3bazPiS_S__param_1];
    ld.param.u64    %rd3, [_Z3bazPiS_S__param_2];
    cvta.to.global.u64  %rd4, %rd1;
    cvta.to.global.u64  %rd5, %rd3;
    cvta.to.global.u64  %rd6, %rd2;
    ld.global.u32   %r1, [%rd6];
    ld.global.u32   %r2, [%rd5];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

.visible .entry _Z4quuz3Quz(
    .param .align 8 .b8 _Z4quuz3Quz_param_0[24]
)
{
    .reg .b32   %r<4>;
    .reg .b64   %rd<7>;


    ld.param.u64    %rd1, [_Z4quuz3Quz_param_0+16];
    ld.param.u64    %rd2, [_Z4quuz3Quz_param_0+8];
    ld.param.u64    %rd3, [_Z4quuz3Quz_param_0];
    cvta.to.global.u64  %rd4, %rd3;
    cvta.to.global.u64  %rd5, %rd2;
    cvta.to.global.u64  %rd6, %rd1;
    ld.global.u32   %r1, [%rd5];
    ld.global.u32   %r2, [%rd6];
    add.s32     %r3, %r2, %r1;
    st.global.u32   [%rd4], %r3;
    ret;
}

It all works the same because CUDA places all kernel arguments in constant memory and accesses them through dedicated parameter loads (the ld.param instructions above) that go through the constant cache.
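
For completeness, a minimal host-side sketch (error checking omitted) of how the Foo/bar pair above might be used; at launch, the three device pointers are simply copied into the 24-byte parameter buffer that the ld.param instructions read from:

int main()
{
    Foo f;
    cudaMalloc((void**)&f.a, sizeof(int));
    cudaMalloc((void**)&f.b, sizeof(int));
    cudaMalloc((void**)&f.c, sizeof(int));
    cudaMemset(f.b, 0, sizeof(int));
    cudaMemset(f.c, 0, sizeof(int));

    bar<<<1, 1>>>(f);   // struct passed by value, like any other argument
    cudaDeviceSynchronize();

    cudaFree(f.a);
    cudaFree(f.b);
    cudaFree(f.c);
    return 0;
}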
