(original) (raw)
// // Generated by LLVM NVPTX Back-End // .version 7.2 .target sm_80 .address_size 64 // .weak cudaMalloc .weak .func (.param .b32 func_retval0) _Z15warpReduceReduxj ( .param .b32 _Z15warpReduceReduxj_param_0 ) ; .global .align 1 .b8 threadIdx[1]; .weak .func (.param .b32 func_retval0) cudaMalloc( .param .b64 cudaMalloc_param_0, .param .b64 cudaMalloc_param_1 ) { .local .align 8 .b8 __local_depot0[16]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<2>; .reg .b64 %rd<3>; mov.u64 %SPL, __local_depot0; cvta.local.u64 %SP, %SPL; ld.param.u64 %rd2, [cudaMalloc_param_1]; ld.param.u64 %rd1, [cudaMalloc_param_0]; st.u64 [%SP+0], %rd1; st.u64 [%SP+8], %rd2; mov.u32 %r1, 999; st.param.b32 [func_retval0+0], %r1; ret; } // .weak cudaFuncGetAttributes .weak .func (.param .b32 func_retval0) cudaFuncGetAttributes( .param .b64 cudaFuncGetAttributes_param_0, .param .b64 cudaFuncGetAttributes_param_1 ) { .local .align 8 .b8 __local_depot1[16]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<2>; .reg .b64 %rd<3>; mov.u64 %SPL, __local_depot1; cvta.local.u64 %SP, %SPL; ld.param.u64 %rd2, [cudaFuncGetAttributes_param_1]; ld.param.u64 %rd1, [cudaFuncGetAttributes_param_0]; st.u64 [%SP+0], %rd1; st.u64 [%SP+8], %rd2; mov.u32 %r1, 999; st.param.b32 [func_retval0+0], %r1; ret; } // .weak cudaDeviceGetAttribute .weak .func (.param .b32 func_retval0) cudaDeviceGetAttribute( .param .b64 cudaDeviceGetAttribute_param_0, .param .b32 cudaDeviceGetAttribute_param_1, .param .b32 cudaDeviceGetAttribute_param_2 ) { .local .align 8 .b8 __local_depot2[16]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<4>; .reg .b64 %rd<2>; mov.u64 %SPL, __local_depot2; cvta.local.u64 %SP, %SPL; ld.param.u32 %r2, [cudaDeviceGetAttribute_param_2]; ld.param.u32 %r1, [cudaDeviceGetAttribute_param_1]; ld.param.u64 %rd1, [cudaDeviceGetAttribute_param_0]; st.u64 [%SP+0], %rd1; st.u32 [%SP+8], %r1; st.u32 [%SP+12], %r2; mov.u32 %r3, 999; st.param.b32 [func_retval0+0], %r3; ret; } // .weak cudaGetDevice .weak .func (.param .b32 func_retval0) cudaGetDevice( .param .b64 cudaGetDevice_param_0 ) { .local .align 8 .b8 __local_depot3[8]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<2>; .reg .b64 %rd<2>; mov.u64 %SPL, __local_depot3; cvta.local.u64 %SP, %SPL; ld.param.u64 %rd1, [cudaGetDevice_param_0]; st.u64 [%SP+0], %rd1; mov.u32 %r1, 999; st.param.b32 [func_retval0+0], %r1; ret; } // .weak cudaOccupancyMaxActiveBlocksPerMultiprocessor .weak .func (.param .b32 func_retval0) cudaOccupancyMaxActiveBlocksPerMultiprocessor( .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_0, .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_1, .param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_2, .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_3 ) { .local .align 8 .b8 __local_depot4[32]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<3>; .reg .b64 %rd<4>; mov.u64 %SPL, __local_depot4; cvta.local.u64 %SP, %SPL; ld.param.u64 %rd3, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_3]; ld.param.u32 %r1, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_2]; ld.param.u64 %rd2, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_1]; ld.param.u64 %rd1, [cudaOccupancyMaxActiveBlocksPerMultiprocessor_param_0]; st.u64 [%SP+0], %rd1; st.u64 [%SP+8], %rd2; st.u32 [%SP+16], %r1; st.u64 [%SP+24], %rd3; mov.u32 %r2, 999; st.param.b32 [func_retval0+0], %r2; ret; } // .weak cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags .weak .func (.param .b32 func_retval0) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_0, .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_1, .param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_2, .param .b64 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_3, .param .b32 cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_4 ) { .local .align 8 .b8 __local_depot5[40]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<4>; .reg .b64 %rd<4>; mov.u64 %SPL, __local_depot5; cvta.local.u64 %SP, %SPL; ld.param.u32 %r2, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_4]; ld.param.u64 %rd3, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_3]; ld.param.u32 %r1, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_2]; ld.param.u64 %rd2, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_1]; ld.param.u64 %rd1, [cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_param_0]; st.u64 [%SP+0], %rd1; st.u64 [%SP+8], %rd2; st.u32 [%SP+16], %r1; st.u64 [%SP+24], %rd3; st.u32 [%SP+32], %r2; mov.u32 %r3, 999; st.param.b32 [func_retval0+0], %r3; ret; } // .globl _Z12reduceKernelPjS_i .visible .entry _Z12reduceKernelPjS_i( .param .u64 _Z12reduceKernelPjS_i_param_0, .param .u64 _Z12reduceKernelPjS_i_param_1, .param .u32 _Z12reduceKernelPjS_i_param_2 ) { .local .align 8 .b8 __local_depot6[24]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<2>; .reg .b32 %r<8>; .reg .b64 %rd<12>; mov.u64 %SPL, __local_depot6; cvta.local.u64 %SP, %SPL; ld.param.u32 %r1, [_Z12reduceKernelPjS_i_param_2]; ld.param.u64 %rd2, [_Z12reduceKernelPjS_i_param_1]; ld.param.u64 %rd1, [_Z12reduceKernelPjS_i_param_0]; cvta.to.global.u64 %rd3, %rd2; cvta.global.u64 %rd4, %rd3; cvta.to.global.u64 %rd5, %rd1; cvta.global.u64 %rd6, %rd5; st.u64 [%SP+0], %rd6; st.u64 [%SP+8], %rd4; st.u32 [%SP+16], %r1; ld.u64 %rd7, [%SP+0]; mov.u32 %r2, %tid.x; cvt.u64.u32 %rd8, %r2; shl.b64 %rd9, %rd8, 2; add.s64 %rd10, %rd7, %rd9; ld.u32 %r3, [%rd10]; st.u32 [%SP+20], %r3; ld.u32 %r4, [%SP+20]; { // callseq 0, 0 .reg .b32 temp_param_reg; .param .b32 param0; st.param.b32 [param0+0], %r4; .param .b32 retval0; call.uni (retval0), _Z15warpReduceReduxj, ( param0 ); ld.param.b32 %r5, [retval0+0]; } // callseq 0 st.u32 [%SP+20], %r5; setp.ne.s32 %p1, %r2, 0; @%p1 bra LBB6_2; bra.uni LBB6_1; LBB6_1: ld.u32 %r7, [%SP+20]; ld.u64 %rd11, [%SP+8]; st.u32 [%rd11], %r7; bra.uni LBB6_2; LBB6_2: ret; } // .weak _Z15warpReduceReduxj .weak .func (.param .b32 func_retval0) _Z15warpReduceReduxj( .param .b32 _Z15warpReduceReduxj_param_0 ) { .local .align 4 .b8 __local_depot7[4]; .reg .b64 %SP; .reg .b64 %SPL; .reg .b32 %r<5>; mov.u64 %SPL, __local_depot7; cvta.local.u64 %SP, %SPL; ld.param.u32 %r1, [_Z15warpReduceReduxj_param_0]; st.u32 [%SP+0], %r1; ld.u32 %r2, [%SP+0]; mov.u32 %r3, 255; redux.sync.add.s32 %r4, %r2, %r3; st.param.b32 [func_retval0+0], %r4; ret; }