The compiler will only emit vectorized load or store instructions if the memory is guaranteed to be aligned to the size of the type, and all the elements of the type are used (otherwise the vector instruction will be optimized away to a scalar instruction to save bandwidth).
If you do this:
struct dtype{
int val;
int temp2;
int temp3;
int temp4;
};
struct __align__ (16) adtype{
int val;
int temp2;
int temp3;
int temp4;
};
__global__
void kernel(adtype* x, dtype* y)
{
adtype lx = x[threadIdx.x];
dtype ly;
ly.val = lx.temp4;
ly.temp2 = lx.temp3;
ly.temp3 = lx.val;
ly.temp4 = lx.temp2;
y[threadIdx.x] = ly;
}
you should get something like this:
visible .entry _Z6kernelP6adtypeP5dtype(
.param .u64 _Z6kernelP6adtypeP5dtype_param_0,
.param .u64 _Z6kernelP6adtypeP5dtype_param_1
)
{
ld.param.u64 %rd1, [_Z6kernelP6adtypeP5dtype_param_0];
ld.param.u64 %rd2, [_Z6kernelP6adtypeP5dtype_param_1];
cvta.to.global.u64 %rd3, %rd2;
cvta.to.global.u64 %rd4, %rd1;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 16;
add.s64 %rd6, %rd4, %rd5;
ld.global.v4.u32 {%r2, %r3, %r4, %r5}, [%rd6];
add.s64 %rd7, %rd3, %rd5;
st.global.u32 [%rd7], %r5;
st.global.u32 [%rd7+4], %r4;
st.global.u32 [%rd7+8], %r2;
st.global.u32 [%rd7+12], %r3;
ret;
}
Here you can clearly see the vectorized load for the aligned type, and the non-vectorized store for the non-aligned type. If the kernel is changed so that the store is to the aligned version:
__global__
void kernel(adtype* x, dtype* y)
{
dtype ly = y[threadIdx.x];
adtype lx;
lx.val = ly.temp4;
lx.temp2 = ly.temp3;
lx.temp3 = ly.val;
lx.temp4 = ly.temp2;
x[threadIdx.x] = lx;
}
you will get this:
.visible .entry _Z6kernelP6adtypeP5dtype(
.param .u64 _Z6kernelP6adtypeP5dtype_param_0,
.param .u64 _Z6kernelP6adtypeP5dtype_param_1
)
{
ld.param.u64 %rd1, [_Z6kernelP6adtypeP5dtype_param_0];
ld.param.u64 %rd2, [_Z6kernelP6adtypeP5dtype_param_1];
cvta.to.global.u64 %rd3, %rd1;
cvta.to.global.u64 %rd4, %rd2;
mov.u32 %r1, %tid.x;
mul.wide.u32 %rd5, %r1, 16;
add.s64 %rd6, %rd4, %rd5;
add.s64 %rd7, %rd3, %rd5;
ld.global.u32 %r2, [%rd6+12];
ld.global.u32 %r3, [%rd6+8];
ld.global.u32 %r4, [%rd6+4];
ld.global.u32 %r5, [%rd6];
st.global.v4.u32 [%rd7], {%r2, %r3, %r5, %r4};
ret;
}
Now the aligned type is stored with a vectorized instruction.
[ All code compiled for sm_53 using the default Godbolt toolchain (10.2) ]