diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index f974cfc78c8dd..5f0b6e9521395 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -470,18 +470,18 @@ __OMP_RTL(__kmpc_target_deinit, false, Void,)
 __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
           VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
-__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
-__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64)
+__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int32, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
+__OMP_RTL(__kmpc_distribute_for_static_loop_8u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int64, Int8)
 __OMP_RTL(__kmpc_kernel_parallel, false, Int1, VoidPtrPtr)
 __OMP_RTL(__kmpc_kernel_end_parallel, false, Void, )
 __OMP_RTL(__kmpc_serialized_parallel, false, Void, IdentPtr, Int32)
@@ -674,22 +674,22 @@ __OMP_RTL_ATTRS(__kmpc_cancel_barrier, BarrierAttrs, SExt,
                 ParamAttrs(ReadOnlyPtrAttrs, SExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt, SExt, SExt))
+                           SExt, SExt, SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt, ZExt, ZExt))
+                           ZExt, ZExt, ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt))
+                           SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_distribute_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt))
+                           ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_for_static_loop_4, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           SExt, SExt, SExt))
+                           SExt, SExt, SExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_for_static_loop_4u, AlwaysInlineAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
-                           ZExt, ZExt, ZExt))
+                           ZExt, ZExt, ZExt, ZExt))
 __OMP_RTL_ATTRS(__kmpc_error, AttributeSet(), AttributeSet(),
                 ParamAttrs(AttributeSet(), SExt))
 __OMP_RTL_ATTRS(__kmpc_flush, BarrierAttrs, AttributeSet(),
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c16b0dde1a3da..9ad6db8574cf4 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4969,6 +4969,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
   RealArgs.push_back(TripCount);
   if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
+    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
     Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
     Builder.CreateCall(RTLFn, RealArgs);
     return;
@@ -4984,6 +4985,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
   if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
     RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
   }
+  RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
 
   Builder.CreateCall(RTLFn, RealArgs);
 }
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
index 830610f12a5d2..5d2861a5d0f35 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK-SAME: #[[ATTRS1:[0-9]+]]
 // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
 // CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10,
-// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
+// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i8 0)
 
 // CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] {
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
index 0ebcec0e0ec31..b42e387acbb11 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -25,7 +25,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
 // CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
 // CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
-// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i8 0)
 
 // CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
 // CHECK: %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index a9f913b744489..7be635f46111b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0
 // CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8
 // CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0, i8 0)
 
 // CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
 // CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
@@ -46,6 +46,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4
 
 // CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0, i8 0)
 
 // CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp
index a8759307b42bd..59a2cc3f27aca 100644
--- a/offload/DeviceRTL/src/Workshare.cpp
+++ b/offload/DeviceRTL/src/Workshare.cpp
@@ -698,7 +698,7 @@ template <typename Ty> class StaticLoopChunker {
   static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg,
                                         Ty NumBlocks, Ty BId, Ty NumThreads,
                                         Ty TId, Ty NumIters,
-                                        bool OneIterationPerThread) {
+                                        uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * NumThreads;
 
     // Start index in the normalized space.
@@ -729,7 +729,7 @@ template <typename Ty> class StaticLoopChunker {
                                       Ty BlockChunk, Ty NumBlocks, Ty BId,
                                       Ty ThreadChunk, Ty NumThreads, Ty TId,
                                       Ty NumIters,
-                                      bool OneIterationPerThread) {
+                                      uint8_t OneIterationPerThread) {
     Ty KernelIteration = NumBlocks * BlockChunk;
 
     // Start index in the chunked space.
@@ -767,8 +767,18 @@ public:
   /// Worksharing `for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                  Ty NumIters, Ty NumThreads, Ty ThreadChunk) {
+                  Ty NumIters, Ty NumThreads, Ty ThreadChunk,
+                  uint8_t OneIterationPerThread) {
     ASSERT(NumIters >= 0, "Bad iteration count");
     ASSERT(ThreadChunk >= 0, "Bad thread count");
@@ -790,12 +800,13 @@ template <typename Ty> class StaticLoopChunker {
 
     // If we know we have more threads than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeThreadsOversubscription()) {
-      ASSERT(NumThreads >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumThreads >= NumIters, "Broken assumption");
+
     if (ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -806,8 +817,17 @@ template <typename Ty> class StaticLoopChunker {
   }
 
   /// Worksharing `distribute`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
-                         Ty NumIters, Ty BlockChunk) {
+                         Ty NumIters, Ty BlockChunk,
+                         uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 0, "Bad distribute");
     ASSERT(icv::ActiveLevel == 0, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -831,12 +851,13 @@ template <typename Ty> class StaticLoopChunker {
 
     // If we know we have more blocks than iterations we can indicate that to
     // avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription()) {
-      ASSERT(NumBlocks >= NumIters, "Broken assumption");
       OneIterationPerThread = true;
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -852,9 +873,20 @@ template <typename Ty> class StaticLoopChunker {
   }
 
   /// Worksharing `distribute parallel for`-loop.
+  /// \param[in] Loc Description of source location
+  /// \param[in] LoopBody Function which corresponds to loop body
+  /// \param[in] Arg Pointer to struct which contains loop body args
+  /// \param[in] NumIters Number of loop iterations
+  /// \param[in] NumThreads Number of GPU threads
+  /// \param[in] BlockChunk Size of block chunk
+  /// \param[in] ThreadChunk Size of thread chunk
+  /// \param[in] OneIterationPerThread If true/nonzero, each thread executes
+  /// only one loop iteration or one thread chunk. This avoids an outer loop
+  /// over all loop iterations/chunks.
   static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                             void *Arg, Ty NumIters, Ty NumThreads,
-                            Ty BlockChunk, Ty ThreadChunk) {
+                            Ty BlockChunk, Ty ThreadChunk,
+                            uint8_t OneIterationPerThread) {
     ASSERT(icv::Level == 1, "Bad distribute");
     ASSERT(icv::ActiveLevel == 1, "Bad distribute");
     ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
@@ -882,13 +914,14 @@ template <typename Ty> class StaticLoopChunker {
 
     // If we know we have more threads (across all blocks) than iterations we
    // can indicate that to avoid an outer loop.
-    bool OneIterationPerThread = false;
     if (config::getAssumeTeamsOversubscription() &
         config::getAssumeThreadsOversubscription()) {
       OneIterationPerThread = true;
-      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
     }
 
+    if (OneIterationPerThread)
+      ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption");
+
     if (BlockChunk != NumThreads || ThreadChunk != 1)
       NormalizedLoopNestChunked(LoopBody, Arg, BlockChunk, NumBlocks, BId,
                                 ThreadChunk, NumThreads, TId, NumIters,
@@ -907,24 +940,26 @@
 
 #define OMP_LOOP_ENTRY(BW, TY) \
   [[gnu::flatten, clang::always_inline]] void \
-      __kmpc_distribute_for_static_loop##BW( \
-          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
-          TY num_threads, TY block_chunk, TY thread_chunk) { \
+      __kmpc_distribute_for_static_loop##BW( \
+          IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
+          TY num_threads, TY block_chunk, TY thread_chunk, \
+          uint8_t one_iteration_per_thread) { \
     ompx::StaticLoopChunker<TY>::DistributeFor( \
-        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \
+        loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk, \
+        one_iteration_per_thread); \
   } \
   [[gnu::flatten, clang::always_inline]] void \
-      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
-                                        void *arg, TY num_iters, \
-                                        TY block_chunk) { \
-    ompx::StaticLoopChunker<TY>::Distribute(loc, fn, arg, num_iters, \
-                                            block_chunk); \
+      __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \
+                                        void *arg, TY num_iters, TY block_chunk, \
+                                        uint8_t one_iteration_per_thread) { \
+    ompx::StaticLoopChunker<TY>::Distribute( \
+        loc, fn, arg, num_iters, block_chunk, one_iteration_per_thread); \
  } \
   [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \
       IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \
-      TY num_threads, TY thread_chunk) { \
+      TY num_threads, TY thread_chunk, uint8_t one_iteration_per_thread) { \
     ompx::StaticLoopChunker<TY>::For(loc, fn, arg, num_iters, num_threads, \
-                                     thread_chunk); \
+                                     thread_chunk, one_iteration_per_thread); \
   }
 
 extern "C" {
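Note on the new trailing argument: the extra i8 / uint8_t parameter threaded through all of the entry points above is a hint. When it is nonzero, the DeviceRTL chunker may assume there are at least as many workers as iterations (or chunks) and skip its outer stride loop entirely. The following is a minimal, self-contained sketch of that fast path with hypothetical names; it is not the DeviceRTL implementation and omits the real ompx/config machinery:

    #include <cstdint>

    // Hypothetical illustration of the OneIterationPerThread fast path: walk a
    // normalized iteration space of NumIters items with NumBlocks * NumThreads
    // workers, where (BId, TId) identify the current worker.
    template <typename Ty>
    void normalizedLoopSketch(void (*LoopBody)(Ty, void *), void *Arg,
                              Ty NumBlocks, Ty BId, Ty NumThreads, Ty TId,
                              Ty NumIters, uint8_t OneIterationPerThread) {
      Ty KernelIteration = NumBlocks * NumThreads; // workers per round
      Ty Start = BId * NumThreads + TId;           // this worker's first index

      if (OneIterationPerThread) {
        // Caller promised NumBlocks * NumThreads >= NumIters: one bounds
        // check, at most one call to the loop body, and no outer loop.
        if (Start < NumIters)
          LoopBody(Start, Arg);
        return;
      }

      // General case: stride over the whole iteration space.
      for (Ty I = Start; I < NumIters; I += KernelIteration)
        LoopBody(I, Arg);
    }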