Based on the discussion here this appears to be based on whether or not the shared variable scope can be restricted to a single function (i.e. a single kernel). Even very complex kernel functions with shared usage that I have looked at demote the shared variables.
Here's a simple example where it is demoted and not demoted.
Not demoted:
$ vi t1.cu
$ cat t1.cu
__shared__ float s[32];
__global__ void k(float * my_ptr){
s[threadIdx.x] = threadIdx.x;
*my_ptr = s[threadIdx.x];
}
__global__ void k1(float * my_ptr){
s[threadIdx.x] = 0.0f;
*my_ptr = s[threadIdx.x];
}
$ nvcc -ptx t1.cu
$ cat t1.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-29373293
// Cuda compilation tools, release 11.2, V11.2.67
// Based on NVVM 7.0.1
//
.version 7.2
.target sm_52
.address_size 64
// .globl _Z1kPf
.shared .align 4 .b8 s[128];
.visible .entry _Z1kPf(
.param .u64 _Z1kPf_param_0
)
{
.reg .f32 %f<2>;
.reg .b32 %r<5>;
.reg .b64 %rd<3>;
ld.param.u64 %rd1, [_Z1kPf_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
cvt.rn.f32.u32 %f1, %r1;
shl.b32 %r2, %r1, 2;
mov.u32 %r3, s;
add.s32 %r4, %r3, %r2;
st.shared.f32 [%r4], %f1;
st.global.f32 [%rd2], %f1;
ret;
}
// .globl _Z2k1Pf
.visible .entry _Z2k1Pf(
.param .u64 _Z2k1Pf_param_0
)
{
.reg .b32 %r<6>;
.reg .b64 %rd<3>;
ld.param.u64 %rd1, [_Z2k1Pf_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
shl.b32 %r2, %r1, 2;
mov.u32 %r3, s;
add.s32 %r4, %r3, %r2;
mov.u32 %r5, 0;
st.shared.u32 [%r4], %r5;
st.global.u32 [%rd2], %r5;
ret;
}
Demoted:
$ cat t1.cu
__global__ void k(float * my_ptr){
__shared__ float s[32];
s[threadIdx.x] = threadIdx.x;
*my_ptr = s[threadIdx.x];
}
__global__ void k1(float * my_ptr){
__shared__ float s[32];
s[threadIdx.x] = 0.0f;
*my_ptr = s[threadIdx.x];
}
$ nvcc -ptx t1.cu
$ cat t1.ptx
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-29373293
// Cuda compilation tools, release 11.2, V11.2.67
// Based on NVVM 7.0.1
//
.version 7.2
.target sm_52
.address_size 64
// .globl _Z1kPf
// _ZZ1kPfE1s has been demoted
// _ZZ2k1PfE1s has been demoted
.visible .entry _Z1kPf(
.param .u64 _Z1kPf_param_0
)
{
.reg .f32 %f<2>;
.reg .b32 %r<5>;
.reg .b64 %rd<3>;
// demoted variable
.shared .align 4 .b8 _ZZ1kPfE1s[128];
ld.param.u64 %rd1, [_Z1kPf_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
cvt.rn.f32.u32 %f1, %r1;
shl.b32 %r2, %r1, 2;
mov.u32 %r3, _ZZ1kPfE1s;
add.s32 %r4, %r3, %r2;
st.shared.f32 [%r4], %f1;
st.global.f32 [%rd2], %f1;
ret;
}
// .globl _Z2k1Pf
.visible .entry _Z2k1Pf(
.param .u64 _Z2k1Pf_param_0
)
{
.reg .b32 %r<6>;
.reg .b64 %rd<3>;
// demoted variable
.shared .align 4 .b8 _ZZ2k1PfE1s[128];
ld.param.u64 %rd1, [_Z2k1Pf_param_0];
cvta.to.global.u64 %rd2, %rd1;
mov.u32 %r1, %tid.x;
shl.b32 %r2, %r1, 2;
mov.u32 %r3, _ZZ2k1PfE1s;
add.s32 %r4, %r3, %r2;
mov.u32 %r5, 0;
st.shared.u32 [%r4], %r5;
st.global.u32 [%rd2], %r5;
ret;
}
As an aside, there do appear to be cases where the PTX generator can delete the shared variable entirely, but this is not directly related to the question here.