[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause by kevinsala · Pull Request #152831 · llvm/llvm-project
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-offload
Author: Kevin Sala Penades (kevinsala)
Changes
Part 2: adds the offload runtime support. See #152651.
Patch is 39.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152831.diff
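A minimal host-side sketch of how the pieces added here could be used together. It is illustrative only: the `dyn_groupprivate` clause spelling on the target construct comes from part 1 (#152651), and the host declarations are assumed from the signatures visible in this diff (`omp_get_groupprivate_limit`, `omp_get_dyn_groupprivate_ptr`, `omp_get_dyn_groupprivate_size`, `omp_access_cgroup`).

```cpp
#include <omp.h>
#include <stdio.h>

int main() {
  int Dev = omp_get_default_device();

  // Upper bound on the per-team (cgroup) memory the device can provide;
  // returns 0 for the host device (see omp_get_groupprivate_limit in API.cpp).
  size_t Limit = omp_get_groupprivate_limit(Dev, omp_access_cgroup);
  printf("groupprivate limit on device %d: %zu bytes\n", Dev, Limit);

  constexpr size_t Bytes = 1024;
  // Assumed clause placement from part 1: request Bytes of dynamic cgroup
  // memory for each team launched by this kernel.
  #pragma omp target teams dyn_groupprivate(Bytes) device(Dev)
  {
    int IsFallback = 0;
    // Per-team pointer into the dynamic cgroup memory (offset 0). IsFallback
    // reports whether the runtime fell back to a global-memory buffer.
    char *Buf = (char *)omp_get_dyn_groupprivate_ptr(/*Offset=*/0, &IsFallback,
                                                     omp_access_cgroup);
    size_t Size = omp_get_dyn_groupprivate_size(omp_access_cgroup);
    if (Buf && Size >= 1)
      Buf[0] = IsFallback ? 1 : 0;
  }
  return 0;
}
```

The fallback path in the sketch corresponds to the new `AllowDynCGroupMemFallback` kernel-argument flag and the `DynCGroupMemFallback` buffer added to `KernelLaunchEnvironmentTy` in this patch.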
23 Files Affected:
- (modified) offload/DeviceRTL/include/DeviceTypes.h (+4)
- (modified) offload/DeviceRTL/include/Interface.h (+1-1)
- (modified) offload/DeviceRTL/include/State.h (+1-1)
- (modified) offload/DeviceRTL/src/Kernel.cpp (+7-7)
- (modified) offload/DeviceRTL/src/State.cpp (+46-2)
- (modified) offload/include/Shared/APITypes.h (+4-2)
- (modified) offload/include/Shared/Environment.h (+3-1)
- (modified) offload/include/device.h (+3)
- (modified) offload/include/omptarget.h (+6-1)
- (modified) offload/libomptarget/OpenMP/API.cpp (+14)
- (modified) offload/libomptarget/device.cpp (+6)
- (modified) offload/libomptarget/exports (+1)
- (modified) offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h (+1)
- (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+22-12)
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+29-4)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+65-21)
- (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (+1)
- (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+26-11)
- (modified) offload/plugins-nextgen/host/src/rtl.cpp (+2-2)
- (added) offload/test/offloading/dyn_groupprivate_strict.cpp (+141)
- (modified) openmp/runtime/src/include/omp.h.var (+10)
- (modified) openmp/runtime/src/kmp_csupport.cpp (+9)
- (modified) openmp/runtime/src/kmp_stub.cpp (+16)
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+struct DynCGroupMemTy {
+ void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ Size = 0;
+ Ptr = nullptr;
+ IsFallback = false;
+ if (KLE) {
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
+ }
+ }
+ }
+
+ char *getPtr(size_t Offset) const { return Ptr + Offset; }
+ bool isFallback() const { return IsFallback; }
+ size_t getSize() const { return Size; }
+
+private:
+ char *Ptr;
+ size_t Size;
+ bool IsFallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+ omp_access_t) {
+ if (IsFallback != NULL)
+ *IsFallback = DynCGroupMem.isFallback();
+ return DynCGroupMem.getPtr(Offset);
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a nowait clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic
+ // cgroup memory.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory sized per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *> GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment.... [truncated]