LLVM reordering blocks breaks ptxas divergence analysis (original) (raw)

I’m having some issues with LLVM reordering basic blocks in a way that results in invalid code for Pascal (and earlier) CUDA GPUs. I’ll demonstrate with a simple example:

# Minimal reproducer: one thread stores `input` into a one-element shared
# array, all threads synchronize, and then each thread copies that element
# into `output[i]`. Each `1 <= n || throw_oob()` guard calls `throw_oob`
# (which does not return) when `n < 1`.
function kernel(input::T, output, n) where T
    i = threadIdx().x
    temp = StaticSharedArray(T, 1)

    # thread 1 initializes the shared memory; only that thread enters this
    # branch, so execution diverges here
    if i == 1
        1 <= n || throw_oob()
        temp[1] = input
    end

    # barrier: executing it divergently is undefined behavior
    sync_threads()

    # every thread (thread 1 included) reads from shared memory
    1 <= n || throw_oob()
    output[i] = temp[1]

    return
end

Note that the code is simplified to the point that it looks a little nonsensical.
I generate the following optimized LLVM IR for this snippet (again, simplified for clarity):

; ModuleID = 'start'
source_filename = "start"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.barrier0() #0
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()

declare fastcc void @throw_oob();

@shmem = internal unnamed_addr addrspace(3) global i8 0, align 32

; Kernel entry point. %0 is the byte stored to @shmem, %1 is an integer that is
; converted to the output pointer, and %2 is the value compared against 1 in
; both checks.
define ptx_kernel void @kernel(i8 signext %0, i64 zeroext %1, i64 signext %2) {
conversion:
  %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  ; only the thread whose tid.x is 0 branches to %maybe_initialize
  %.not = icmp eq i32 %3, 0
  br i1 %.not, label %maybe_initialize, label %cont1

; start of a divergent region

maybe_initialize:
  ; first check: %2 < 1 leads to the throwing block
  %4 = icmp slt i64 %2, 1
  br i1 %4, label %oob1, label %actually_initialize

actually_initialize:
  store i8 %0, i8 addrspace(3)* @shmem, align 32
  br label %cont1

oob1:
  ; ends in `unreachable`, so this block has no successor
  call fastcc void @throw_oob()
  unreachable

; end of a divergent region

cont1:
  ; both paths out of %conversion that continue execution meet here, at the
  ; barrier
  call void @llvm.nvvm.barrier0()
  ; second check: same comparison as %4
  %5 = icmp slt i64 %2, 1
  br i1 %5, label %oob2, label %read

read:
  ; copy the shared byte to the address %1 + zext(tid.x)
  %6 = load i8, i8 addrspace(3)* @shmem, align 32
  %7 = zext i32 %3 to i64
  %8 = inttoptr i64 %1 to i8*
  %9 = getelementptr inbounds i8, i8* %8, i64 %7
  store i8 %6, i8* %9, align 1
  ret void

oob2:
  ; ends in `unreachable`, so this block has no successor
  call fastcc void @throw_oob()
  unreachable
}

attributes #0 = { convergent }

Note how I annotated a region of code where execution diverges, i.e., where not all threads execute the same code.

NVPTX produces the following PTX code:

// PTX emitted for the kernel; the trailing `// %name` comments give the LLVM
// basic block each section was generated from.
.visible .entry kernel(
    .param .u8 kernel_param_0,
    .param .u64 kernel_param_1,
    .param .u64 kernel_param_2
)
{
    .reg .pred 	%p<5>;
    .reg .b16 	%rs<3>;
    .reg .b32 	%r<2>;
    .reg .b64 	%rd<5>;
    // demoted variable
    .shared .align 32 .u8 shmem;
// %bb.0:                               // %conversion
    ld.param.u64 	%rd2, [kernel_param_2];
    mov.u32 	%r1, %tid.x;
    // %p1: tid.x != 0; %p4: kernel_param_2 < 1 (computed once, used by both
    // checks below)
    setp.ne.s32 	%p1, %r1, 0;
    setp.lt.s64 	%p4, %rd2, 1;
    // threads with tid.x != 0 skip straight to the barrier
    @%p1 bra 	$L__BB0_3;
// %bb.1:                               // %maybe_initialize
    // only the thread with tid.x == 0 gets here; the target %oob1 is placed
    // at the very end of the function
    @%p4 bra 	$L__BB0_6;
// %bb.2:                               // %actually_initialize
    ld.param.s8 	%rs1, [kernel_param_0];
    st.shared.u8 	[shmem], %rs1;
$L__BB0_3:                              // %cont1
    // barrier corresponding to the llvm.nvvm.barrier0 call
    bar.sync 	0;
    @%p4 bra 	$L__BB0_5;
// %bb.4:                               // %read
    ld.param.u64 	%rd1, [kernel_param_1];
    ld.shared.u8 	%rs2, [shmem];
    cvt.u64.u32 	%rd3, %r1;
    add.s64 	%rd4, %rd1, %rd3;
    st.u8 	[%rd4], %rs2;
    ret;
$L__BB0_5:                              // %oob2
    { // callseq 0, 0
    .reg .b32 temp_param_reg;
    call.uni
    throw_oob,
    (
    );
    } // callseq 0
    // %oob1 is branched to from before the barrier but is laid out here,
    // after both the barrier and %oob2
$L__BB0_6:                              // %oob1
    { // callseq 1, 0
    .reg .b32 temp_param_reg;
    call.uni
    throw_oob,
    (
    );
    } // callseq 1
                                        // -- End function
}

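Here, the blocks that call throw_oob and never return (%oob1 and %oob2) have been moved to the end of the function, so %oob1, which is only reachable from inside the region I annotated above, is now laid out after the barrier. As a result, ptxas is not able to mark the annotated code as a single divergent region. Because of that, the barrier (barrier0 in the IR, bar.sync in the PTX) is executed divergently, which is undefined behavior (and causes miscomputations).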

What can I do to avoid this? I’ve seen machine IR passes such as branch-folder and block-placement reorder basic blocks; I guess I could try disabling those, but I’m hoping for a more principled solution, so that this issue doesn’t come up again in the future when another pass starts reordering blocks in IR or machine IR.