LLVM: lib/Transforms/IPO/OpenMPOpt.cpp Source File
1//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations --===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// OpenMP specific optimizations:
10//
11// - Deduplication of runtime calls, e.g., omp_get_thread_num.
12// - Replacing globalized device memory with stack memory.
13// - Replacing globalized device memory with shared memory.
14// - Parallel region merging.
15// - Transforming generic-mode device kernels to SPMD mode.
16// - Specializing the state machine for generic-mode kernels.
17//
18//===----------------------------------------------------------------------===//
50#include "llvm/IR/IntrinsicsAMDGPU.h"
51#include "llvm/IR/IntrinsicsNVPTX.h"
59
60#include <algorithm>
61#include <optional>
62#include <string>
63
64using namespace llvm;
65using namespace omp;
66
67#define DEBUG_TYPE "openmp-opt"
68
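// The flags below toggle the individual OpenMP-Opt sub-transformations
// (region merging, deglobalization, SPMD-ization, folding, state-machine
// rewrite, barrier elimination) and a few printing/debugging aids.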
70 "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
72
74 "openmp-opt-enable-merging",
75 cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
77
80 cl::desc("Disable function internalization."),
82
89
91 "openmp-hide-memory-transfer-latency",
92 cl::desc("[WIP] Tries to hide the latency of host to device memory"
93 " transfers"),
95
97 "openmp-opt-disable-deglobalization",
98 cl::desc("Disable OpenMP optimizations involving deglobalization."),
100
102 "openmp-opt-disable-spmdization",
103 cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
105
107 "openmp-opt-disable-folding",
108 cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
110
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc("Disable OpenMP optimizations that replace the state machine."),
115
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc("Disable OpenMP optimizations that eliminate barriers."),
120
122 "openmp-opt-print-module-after",
123 cl::desc("Print the current module after OpenMP optimizations."),
125
127 "openmp-opt-print-module-before",
128 cl::desc("Print the current module before OpenMP optimizations."),
130
132 "openmp-opt-inline-device",
133 cl::desc("Inline all applicable functions on the device."), cl::Hidden,
135
140
143 cl::desc("Maximal number of attributor iterations."),
145
148 cl::desc("Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits::max()));
150
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
155STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
156 "Number of OpenMP runtime functions identified");
157STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
175STATISTIC(
176    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
178STATISTIC(NumOpenMPParallelRegionsMerged,
179          "Number of OpenMP parallel regions merged");
180STATISTIC(NumBytesMovedToSharedMemory,
181          "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");
183
184#if !defined(NDEBUG)
185static constexpr auto TAG = "[" DEBUG_TYPE "]";
186#endif
187
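// The macros below define symbolic member indices into the kernel environment
// struct emitted for each target region (and into its nested configuration
// struct), plus typed accessors for the individual members.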
210#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
211 constexpr unsigned MEMBER##Idx = IDX;
212
213KERNEL_ENVIRONMENT_IDX(Configuration, 0)
214KERNEL_ENVIRONMENT_IDX(Ident, 1)
215
216#undef KERNEL_ENVIRONMENT_IDX
217
218#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
219 constexpr unsigned MEMBER##Idx = IDX;
220
221KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
222KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
223KERNEL_ENVIRONMENT_CONFIGURATION_IDX(ExecMode, 2)
224KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinThreads, 3)
225KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxThreads, 4)
226KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinTeams, 5)
227KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxTeams, 6)
228
229#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
230
231#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
233    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx));     \
234 }
235
236KERNEL_ENVIRONMENT_GETTER(Ident, Constant)
237KERNEL_ENVIRONMENT_GETTER(Configuration, ConstantStruct)
238
239#undef KERNEL_ENVIRONMENT_GETTER
240
241#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242 ConstantInt *get##MEMBER##FromKernelEnvironment( \
243 ConstantStruct *KernelEnvC) { \
244 ConstantStruct *ConfigC = \
245 getConfigurationFromKernelEnvironment(KernelEnvC); \
246    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx));  \
247 }
248
249KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(UseGenericStateMachine)
250KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MayUseNestedParallelism)
251KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(ExecMode)
252KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinThreads)
253KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxThreads)
254KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinTeams)
255KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)
256
257#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
258
261 constexpr int InitKernelEnvironmentArgNo = 0;
263 KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
265}
266
272}
273
274namespace {
275
276struct AAHeapToShared;
277
278struct AAICVTracker;
279
280
281
282struct OMPInformationCache : public InformationCache {
283  OMPInformationCache(Module &M, AnalysisGetter &AG,
284                      BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
285                      bool OpenMPPostLink)
286      : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
287        OpenMPPostLink(OpenMPPostLink) {
288
289 OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M);
290 const Triple T(OMPBuilder.M.getTargetTriple());
291    switch (T.getArch()) {
292    case llvm::Triple::nvptx:
293    case llvm::Triple::nvptx64:
294    case llvm::Triple::amdgcn:
295      assert(OMPBuilder.Config.IsTargetDevice &&
296 "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
297 OMPBuilder.Config.IsGPU = true;
298 break;
299 default:
300 OMPBuilder.Config.IsGPU = false;
301 break;
302 }
303 OMPBuilder.initialize();
304 initializeRuntimeFunctions(M);
305 initializeInternalControlVars();
306 }
307
308
309 struct InternalControlVarInfo {
310
312
313
314 StringRef Name;
315
316
317 StringRef EnvVarName;
318
319
321
322
323 ConstantInt *InitValue;
324
325
327
328
330
331
333 };
334
335
336 struct RuntimeFunctionInfo {
337
338
340
341
342 StringRef Name;
343
344
345 bool IsVarArg;
346
347
348 Type *ReturnType;
349
350
351    SmallVector<Type *, 8> ArgumentTypes;
352
353
354 Function *Declaration = nullptr;
355
356
357 using UseVector = SmallVector<Use *, 16>;
358
359
360 void clearUsesMap() { UsesMap.clear(); }
361
362
363 operator bool() const { return Declaration; }
364
365
366 UseVector &getOrCreateUseVector(Function *F) {
367      std::shared_ptr<UseVector> &UV = UsesMap[F];
368      if (!UV)
369        UV = std::make_shared<UseVector>();
370 return *UV;
371 }
372
373
374
375 const UseVector *getUseVector(Function &F) const {
376 auto I = UsesMap.find(&F);
377 if (I != UsesMap.end())
378 return I->second.get();
379 return nullptr;
380 }
381
382
383 size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
384
385
386
387 size_t getNumArgs() const { return ArgumentTypes.size(); }
388
389
390
391
392 void foreachUse(SmallVectorImpl<Function *> &SCC,
393 function_ref<bool(Use &, Function &)> CB) {
394 for (Function *F : SCC)
395 foreachUse(CB, F);
396 }
397
398
399
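    /// Run \p CB on every recorded use of this runtime function in \p F.
    /// Uses for which \p CB returns true are swap-erased from the use vector
    /// afterwards.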
400 void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
401 SmallVector<unsigned, 8> ToBeDeleted;
402 ToBeDeleted.clear();
403
404 unsigned Idx = 0;
405 UseVector &UV = getOrCreateUseVector(F);
406
407 for (Use *U : UV) {
408        if (CB(*U, *F))
409          ToBeDeleted.push_back(Idx);
410        ++Idx;
411 }
412
413
414
415      while (!ToBeDeleted.empty()) {
416        unsigned Idx = ToBeDeleted.pop_back_val();
417        UV[Idx] = UV.back();
418 UV.pop_back();
419 }
420 }
421
422 private:
423
424
425    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
426
427 public:
428
429 decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
430 decltype(UsesMap)::iterator end() { return UsesMap.end(); }
431 };
432
433
434 OpenMPIRBuilder OMPBuilder;
435
436
437  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
438                  RuntimeFunction::OMPRTL___last>
439 RFIs;
440
441
442 DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;
443
444
445  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
446                  InternalControlVar::ICV___last>
447 ICVs;
448
449
450
451 void initializeInternalControlVars() {
452#define ICV_RT_SET(_Name, RTL) \
453 { \
454 auto &ICV = ICVs[_Name]; \
455 ICV.Setter = RTL; \
456 }
457#define ICV_RT_GET(Name, RTL) \
458 { \
459 auto &ICV = ICVs[Name]; \
460 ICV.Getter = RTL; \
461 }
462#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
463 { \
464 auto &ICV = ICVs[Enum]; \
465 ICV.Name = _Name; \
466 ICV.Kind = Enum; \
467 ICV.InitKind = Init; \
468 ICV.EnvVarName = _EnvVarName; \
469 switch (ICV.InitKind) { \
470 case ICV_IMPLEMENTATION_DEFINED: \
471 ICV.InitValue = nullptr; \
472 break; \
473 case ICV_ZERO: \
474 ICV.InitValue = ConstantInt::get( \
475 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
476 break; \
477 case ICV_FALSE: \
478 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
479 break; \
480 case ICV_LAST: \
481 break; \
482 } \
483 }
484#include "llvm/Frontend/OpenMP/OMPKinds.def"
485 }
486
487
488
489
490  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
491                                  SmallVector<Type *, 8> &RTFArgTypes) {
492    // TODO: We should output information to the user (under debug output
493    //       and via remarks).
494
495    if (!F)
496      return false;
497 if (F->getReturnType() != RTFRetType)
498 return false;
499 if (F->arg_size() != RTFArgTypes.size())
500 return false;
501
502 auto *RTFTyIt = RTFArgTypes.begin();
503 for (Argument &Arg : F->args()) {
504 if (Arg.getType() != *RTFTyIt)
505 return false;
506
507 ++RTFTyIt;
508 }
509
510 return true;
511 }
512
513
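  /// Collect all uses of \p RFI.Declaration: instruction uses inside functions
  /// of the current SCC are bucketed per function, non-instruction uses go
  /// into the nullptr bucket. Returns the number of uses recorded.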
514 unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
515 unsigned NumUses = 0;
516 if (!RFI.Declaration)
517 return NumUses;
518 OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
519
520 if (CollectStats) {
521 NumOpenMPRuntimeFunctionsIdentified += 1;
522 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
523 }
524
525
526    for (Use &U : RFI.Declaration->uses()) {
527      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
528        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
529 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
530 ++NumUses;
531 }
532 } else {
533 RFI.getOrCreateUseVector(nullptr).push_back(&U);
534 ++NumUses;
535 }
536 }
537 return NumUses;
538 }
539
540
541  void recollectUsesForFunction(RuntimeFunction RTF) {
542    auto &RFI = RFIs[RTF];
543 RFI.clearUsesMap();
544 collectUses(RFI, false);
545 }
546
547
548 void recollectUses() {
549 for (int Idx = 0; Idx < RFIs.size(); ++Idx)
550 recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
551 }
552
553
554 void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
557 }
558
559
560
562
563 if (!OpenMPPostLink)
564 return true;
565
566
567
568    for (RuntimeFunction Fn : Fns) {
569      RuntimeFunctionInfo &RFI = RFIs[Fn];
570
571 if (!RFI.Declaration || RFI.Declaration->isDeclaration())
572 return false;
573 }
574 return true;
575 }
576
577
578
579 void initializeRuntimeFunctions(Module &M) {
580
581
582#define OMP_TYPE(VarName, ...) \
583 Type *VarName = OMPBuilder.VarName; \
584 (void)VarName;
585
586#define OMP_ARRAY_TYPE(VarName, ...) \
587 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
588 (void)VarName##Ty; \
589 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
590 (void)VarName##PtrTy;
591
592#define OMP_FUNCTION_TYPE(VarName, ...) \
593 FunctionType *VarName = OMPBuilder.VarName; \
594 (void)VarName; \
595 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
596 (void)VarName##Ptr;
597
598#define OMP_STRUCT_TYPE(VarName, ...) \
599 StructType *VarName = OMPBuilder.VarName; \
600 (void)VarName; \
601 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
602 (void)VarName##Ptr;
603
604#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
605 { \
606 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
607 Function *F = M.getFunction(_Name); \
608 RTLFunctions.insert(F); \
609 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
610 RuntimeFunctionIDMap[F] = _Enum; \
611 auto &RFI = RFIs[_Enum]; \
612 RFI.Kind = _Enum; \
613 RFI.Name = _Name; \
614 RFI.IsVarArg = _IsVarArg; \
615 RFI.ReturnType = OMPBuilder._ReturnType; \
616 RFI.ArgumentTypes = std::move(ArgsTypes); \
617 RFI.Declaration = F; \
618 unsigned NumUses = collectUses(RFI); \
619 (void)NumUses; \
620 LLVM_DEBUG({ \
621 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
622 << " found\n"; \
623 if (RFI.Declaration) \
624 dbgs() << TAG << "-> got " << NumUses << " uses in " \
625 << RFI.getNumFunctionsWithUses() \
626 << " different functions.\n"; \
627 }); \
628 } \
629 }
630#include "llvm/Frontend/OpenMP/OMPKinds.def"
631
632
633
635 for (Function &F : M) {
636 for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
637 if (F.hasFnAttribute(Attribute::NoInline) &&
638 F.getName().starts_with(Prefix) &&
639            !F.hasFnAttribute(Attribute::OptimizeNone))
640 F.removeFnAttr(Attribute::NoInline);
641 }
642 }
643
644
645 }
646
647
648 DenseSet<const Function *> RTLFunctions;
649
650
651 bool OpenMPPostLink = false;
652};
653
654template <typename Ty, bool InsertInvalidates = true>
655struct BooleanStateWithSetVector : public BooleanState {
656 bool contains(const Ty &Elem) const { return Set.contains(Elem); }
657 bool insert(const Ty &Elem) {
658 if (InsertInvalidates)
659 BooleanState::indicatePessimisticFixpoint();
660 return Set.insert(Elem);
661 }
662
663 const Ty &operator[](int Idx) const { return Set[Idx]; }
664 bool operator==(const BooleanStateWithSetVector &RHS) const {
665 return BooleanState::operator==(RHS) && Set == RHS.Set;
666 }
667 bool operator!=(const BooleanStateWithSetVector &RHS) const {
668 return !(*this == RHS);
669 }
670
671 bool empty() const { return Set.empty(); }
672 size_t size() const { return Set.size(); }
673
674
675 BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
676 BooleanState::operator^=(RHS);
677 Set.insert_range(RHS.Set);
678 return *this;
679 }
680
681private:
682
683  SetVector<Ty> Set;
684
685public:
686 typename decltype(Set)::iterator begin() { return Set.begin(); }
687 typename decltype(Set)::iterator end() { return Set.end(); }
688 typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
689 typename decltype(Set)::const_iterator end() const { return Set.end(); }
690};
691
692template <typename Ty, bool InsertInvalidates = true>
693using BooleanStateWithPtrSetVector =
694 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
695
696struct KernelInfoState : AbstractState {
697
698 bool IsAtFixpoint = false;
699
700
701
702 BooleanStateWithPtrSetVector<CallBase, false>
703 ReachedKnownParallelRegions;
704
705
706  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
707
708
709
710
711 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
712
713
714
715 CallBase *KernelInitCB = nullptr;
716
717
718
719 ConstantStruct *KernelEnvC = nullptr;
720
721
722
723 CallBase *KernelDeinitCB = nullptr;
724
725
726 bool IsKernelEntry = false;
727
728
729 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
730
731
732
733
734 BooleanStateWithSetVector<uint8_t> ParallelLevels;
735
736
737 bool NestedParallelism = false;
738
739
740
741
742 KernelInfoState() = default;
743 KernelInfoState(bool BestState) {
744 if (!BestState)
745 indicatePessimisticFixpoint();
746 }
747
748
749 bool isValidState() const override { return true; }
750
751
752 bool isAtFixpoint() const override { return IsAtFixpoint; }
753
754
755 ChangeStatus indicatePessimisticFixpoint() override {
756 IsAtFixpoint = true;
757 ParallelLevels.indicatePessimisticFixpoint();
758 ReachingKernelEntries.indicatePessimisticFixpoint();
759 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
760 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
761 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
762 NestedParallelism = true;
763 return ChangeStatus::CHANGED;
764 }
765
766
767 ChangeStatus indicateOptimisticFixpoint() override {
768 IsAtFixpoint = true;
769 ParallelLevels.indicateOptimisticFixpoint();
770 ReachingKernelEntries.indicateOptimisticFixpoint();
771 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
772 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
773 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
774 return ChangeStatus::UNCHANGED;
775 }
776
777
778 KernelInfoState &getAssumed() { return *this; }
779 const KernelInfoState &getAssumed() const { return *this; }
780
781 bool operator==(const KernelInfoState &RHS) const {
782 if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
783 return false;
784 if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
785 return false;
786 if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
787 return false;
788 if (ReachingKernelEntries != RHS.ReachingKernelEntries)
789 return false;
790 if (ParallelLevels != RHS.ParallelLevels)
791 return false;
792 if (NestedParallelism != RHS.NestedParallelism)
793 return false;
794 return true;
795 }
796
797
798 bool mayContainParallelRegion() {
799 return !ReachedKnownParallelRegions.empty() ||
800 !ReachedUnknownParallelRegions.empty();
801 }
802
803
804 static KernelInfoState getBestState() { return KernelInfoState(true); }
805
806 static KernelInfoState getBestState(KernelInfoState &KIS) {
807 return getBestState();
808 }
809
810
811 static KernelInfoState getWorstState() { return KernelInfoState(false); }
812
813
814 KernelInfoState operator^=(const KernelInfoState &KIS) {
815
816 if (KIS.KernelInitCB) {
817 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
818 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
819 "assumptions.");
820 KernelInitCB = KIS.KernelInitCB;
821 }
822 if (KIS.KernelDeinitCB) {
823 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
824 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
825 "assumptions.");
826 KernelDeinitCB = KIS.KernelDeinitCB;
827 }
828 if (KIS.KernelEnvC) {
829 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
830 llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
831 "assumptions.");
832 KernelEnvC = KIS.KernelEnvC;
833 }
834 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
835 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
836 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
837 NestedParallelism |= KIS.NestedParallelism;
838 return *this;
839 }
840
841 KernelInfoState operator&=(const KernelInfoState &KIS) {
842 return (*this ^= KIS);
843 }
844
845
846};
847
848
849
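/// Records, for one of the pointer arrays passed to a
/// __tgt_target_data_begin_mapper call, the last value stored into each slot
/// and the store instruction that produced it.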
850struct OffloadArray {
851
852 AllocaInst *Array = nullptr;
853
854 SmallVector<Value *, 8> StoredValues;
855
856 SmallVector<StoreInst *, 8> LastAccesses;
857
858 OffloadArray() = default;
859
860
861
862
863
864 bool initialize(AllocaInst &Array, Instruction &Before) {
865 if (!Array.getAllocatedType()->isArrayTy())
866 return false;
867
868 if (!getValues(Array, Before))
869 return false;
870
871 this->Array = &Array;
872 return true;
873 }
874
875 static const unsigned DeviceIDArgNum = 1;
876 static const unsigned BasePtrsArgNum = 3;
877 static const unsigned PtrsArgNum = 4;
878 static const unsigned SizesArgNum = 5;
879
880private:
881
882
883
884 bool getValues(AllocaInst &Array, Instruction &Before) {
885
886 const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
887 StoredValues.assign(NumValues, nullptr);
888 LastAccesses.assign(NumValues, nullptr);
889
890
891
892    BasicBlock *BB = Array.getParent();
893    if (BB != Before.getParent())
894      return false;
895
896 const DataLayout &DL = Array.getDataLayout();
897 const unsigned int PointerSize = DL.getPointerSize();
898
899 for (Instruction &I : *BB) {
900 if (&I == &Before)
901 break;
902
903      if (!isa<StoreInst>(&I))
904        continue;
905
906      auto *S = cast<StoreInst>(&I);
907      int64_t Offset = -1;
908      auto *Dst =
909          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
910      if (Dst == &Array) {
911        int64_t Idx = Offset / PointerSize;
912        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
913        LastAccesses[Idx] = S;
914      }
915 }
916
917 return isFilled();
918 }
919
920
921
922 bool isFilled() {
923 const unsigned NumValues = StoredValues.size();
924 for (unsigned I = 0; I < NumValues; ++I) {
925 if (!StoredValues[I] || !LastAccesses[I])
926 return false;
927 }
928
929 return true;
930 }
931};
932
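/// Driver for the OpenMP-specific optimizations; run() applies the module or
/// CGSCC transformations using the shared OMPInformationCache and Attributor.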
933struct OpenMPOpt {
934
935 using OptimizationRemarkGetter =
936 function_ref<OptimizationRemarkEmitter &(Function *)>;
937
938 OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
939 OptimizationRemarkGetter OREGetter,
940 OMPInformationCache &OMPInfoCache, Attributor &A)
941 : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
942 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
943
944
945 bool remarksEnabled() {
946 auto &Ctx = M.getContext();
947 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
948 }
949
950
951 bool run(bool IsModulePass) {
952 if (SCC.empty())
953 return false;
954
955    bool Changed = false;
956
957    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
958                      << " functions\n");
959
960 if (IsModulePass) {
961 Changed |= runAttributor(IsModulePass);
962
963
964 OMPInfoCache.recollectUses();
965
966
967 Changed |= rewriteDeviceCodeStateMachine();
968
969 if (remarksEnabled())
970 analysisGlobalization();
971    } else {
972      if (PrintICVValues)
973        printICVs();
974      if (PrintOpenMPKernels)
975        printKernels();
976
977 Changed |= runAttributor(IsModulePass);
978
979
980 OMPInfoCache.recollectUses();
981
982 Changed |= deleteParallelRegions();
983
984      if (HideMemoryTransferLatency)
985        Changed |= hideMemTransfersLatency();
986      Changed |= deduplicateRuntimeCalls();
987      if (EnableParallelRegionMerging) {
988        if (mergeParallelRegions()) {
989          deduplicateRuntimeCalls();
990          Changed = true;
991        }
992 }
993 }
994
995 if (OMPInfoCache.OpenMPPostLink)
996 Changed |= removeRuntimeSymbols();
997
998    return Changed;
999  }
1000
1001
1002
1003 void printICVs() const {
1004 InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
1005 ICV_proc_bind};
1006
1007 for (Function *F : SCC) {
1008 for (auto ICV : ICVs) {
1009 auto ICVInfo = OMPInfoCache.ICVs[ICV];
1010 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1011 return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
1012 << " Value: "
1013 << (ICVInfo.InitValue
1014 ? toString(ICVInfo.InitValue->getValue(), 10, true)
1015 : "IMPLEMENTATION_DEFINED");
1016 };
1017
1019 }
1020 }
1021 }
1022
1023
1024 void printKernels() const {
1025 for (Function *F : SCC) {
1027 continue;
1028
1029 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
1030 return ORA << "OpenMP GPU kernel "
1031 << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
1032 };
1033
1035 }
1036 }
1037
1038
1039
1040 static CallInst *getCallIfRegularCall(
1041      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
1042    CallInst *CI = dyn_cast<CallInst>(U.getUser());
1043    if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
1044        (!RFI ||
1045 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
1046 return CI;
1047 return nullptr;
1048 }
1049
1050
1051
1052 static CallInst *getCallIfRegularCall(
1053      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
1054    auto *CI = dyn_cast<CallInst>(&V);
1055    if (CI && !CI->hasOperandBundles() &&
1056        (!RFI ||
1057 (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
1058 return CI;
1059 return nullptr;
1060 }
1061
1062private:
1063
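  /// Try to merge adjacent __kmpc_fork_call invocations within a basic block
  /// into a single parallel region; code between two mergeable regions is
  /// wrapped in a master/barrier pair so its sequential semantics are kept.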
1064 bool mergeParallelRegions() {
1065 const unsigned CallbackCalleeOperand = 2;
1066 const unsigned CallbackFirstArgOperand = 3;
1067 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
1068
1069
1070 OMPInformationCache::RuntimeFunctionInfo &RFI =
1071 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1072
1073 if (!RFI.Declaration)
1074 return false;
1075
1076
1077 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1078 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1079 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1080 };
1081
1083 LoopInfo *LI = nullptr;
1084 DominatorTree *DT = nullptr;
1085
1086 SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
1087
1088 BasicBlock *StartBB = nullptr, *EndBB = nullptr;
1089 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1090 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1092 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1093 assert(StartBB != nullptr && "StartBB should not be null");
1095 assert(EndBB != nullptr && "EndBB should not be null");
1096 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1098 };
1099
1100 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
1101 Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
1102 ReplacementValue = &Inner;
1103 return CodeGenIP;
1104 };
1105
1106 auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };
1107
1108
1109
1110 auto CreateSequentialRegion = [&](Function *OuterFn,
1114
1115
1116 BasicBlock *ParentBB = SeqStartI->getParent();
1118 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1122 SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
1123
1125 "Expected a different CFG");
1128
1129 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1130 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1132 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1133 assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
1135 assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
1138 };
1139 auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };
1140
1141
1142
1143 for (Instruction &I : *SeqStartBB) {
1144 SmallPtrSet<Instruction *, 4> OutsideUsers;
1145 for (User *Usr : I.users()) {
1147
1148
1150 continue;
1151
1152 if (UsrI.getParent() != SeqStartBB)
1153 OutsideUsers.insert(&UsrI);
1154 }
1155
1156 if (OutsideUsers.empty())
1157 continue;
1158
1159
1160
1161 const DataLayout &DL = M.getDataLayout();
1162 AllocaInst *AllocaI = new AllocaInst(
1163 I.getType(), DL.getAllocaAddrSpace(), nullptr,
1165
1166
1167
1168 new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1169
1170
1171
1172 for (Instruction *UsrI : OutsideUsers) {
1173 LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
1174 I.getName() + ".seq.output.load",
1177 }
1178 }
1179
1180 OpenMPIRBuilder::LocationDescription Loc(
1181 InsertPointTy(ParentBB, ParentBB->end()), DL);
1182 OpenMPIRBuilder::InsertPointTy SeqAfterIP = cantFail(
1183 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
1185 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
1186
1188
1189 LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
1190 << "\n");
1191 };
1192
1193
1194
1195
1196
1197
1198
1199
1200 auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
1202
1203
1204 assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
1205
1206 auto Remark = [&](OptimizationRemark OR) {
1207 OR << "Parallel region merged with parallel region"
1208 << (MergableCIs.size() > 2 ? "s" : "") << " at ";
1211 if (CI != MergableCIs.back())
1212 OR << ", ";
1213 }
1214 return OR << ".";
1215 };
1216
1218
1219 Function *OriginalFn = BB->getParent();
1221 << " parallel regions in " << OriginalFn->getName()
1222 << "\n");
1223
1224
1225 EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
1227 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1228 StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
1229 "omp.par.merged");
1230
1231 assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
1232 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1234
1235
1236
1237 for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
1238 It != End; ++It) {
1241
1242
1243 if (ForkCI->getNextNode() == NextForkCI)
1244 continue;
1245
1246 CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
1248 }
1249
1250 OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
1251 DL);
1252 IRBuilder<>::InsertPoint AllocaIP(
1255
1256
1257 OpenMPIRBuilder::InsertPointTy AfterIP =
1258 cantFail(OMPInfoCache.OMPBuilder.createParallel(
1259 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
1260 OMP_PROC_BIND_default, false));
1262
1263
1264 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1265
1266 Function *OutlinedFn = MergableCIs.front()->getCaller();
1267
1268
1269
1270 SmallVector<Value *, 8> Args;
1271 for (auto *CI : MergableCIs) {
1273 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1274 Args.clear();
1275 Args.push_back(OutlinedFn->getArg(0));
1276 Args.push_back(OutlinedFn->getArg(1));
1277 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1278 ++U)
1280
1281 CallInst *NewCI =
1285
1286
1287 for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
1288 ++U)
1291 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1292
1293
1294 if (CI != MergableCIs.back()) {
1295
1296
1297 cantFail(OMPInfoCache.OMPBuilder.createBarrier(
1298 InsertPointTy(NewCI->getParent(),
1300 OMPD_parallel));
1301 }
1302
1304 }
1305
1306 assert(OutlinedFn != OriginalFn && "Outlining failed");
1307 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1308 CGUpdater.reanalyzeFunction(*OriginalFn);
1309
1310 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1311
1312 return true;
1313 };
1314
1315
1316
1318 CallInst *CI = getCallIfRegularCall(U, &RFI);
1320
1321 return false;
1322 };
1323
1324 BB2PRMap.clear();
1325 RFI.foreachUse(SCC, DetectPRsCB);
1327
1328
1329
1330
1331 for (auto &It : BB2PRMap) {
1332 auto &CIs = It.getSecond();
1333 if (CIs.size() < 2)
1334 continue;
1335
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347 auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
1348
1349
1350 if (I.isTerminator())
1351 return false;
1352
1354 return true;
1355
1357 if (IsBeforeMergableRegion) {
1359 if (!CalledFunction)
1360 return false;
1361
1362
1363
1364
1365
1366 for (const auto &RFI : UnmergableCallsInfo) {
1367 if (CalledFunction == RFI.Declaration)
1368 return false;
1369 }
1370 } else {
1371
1372
1373
1374
1376 return false;
1377 }
1378
1379 return true;
1380 };
1381
1382 for (auto It = BB->begin(), End = BB->end(); It != End;) {
1384 ++It;
1385
1386 if (CIs.count(&I)) {
1388 continue;
1389 }
1390
1391
1392 if (IsMergable(I, MergableCIs.empty()))
1393 continue;
1394
1395
1396
1397 for (; It != End; ++It) {
1399 if (CIs.count(&SkipI)) {
1401 << " due to " << I << "\n");
1402 ++It;
1403 break;
1404 }
1405 }
1406
1407
1408 if (MergableCIs.size() > 1) {
1409 MergableCIsVector.push_back(MergableCIs);
1411 << " parallel regions in block " << BB->getName()
1413 << "\n";);
1414 }
1415
1416 MergableCIs.clear();
1417 }
1418
1419 if (!MergableCIsVector.empty()) {
1421
1422 for (auto &MergableCIs : MergableCIsVector)
1423 Merge(MergableCIs, BB);
1424 MergableCIsVector.clear();
1425 }
1426 }
1427
1429
1430
1431 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1432 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1433 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1434 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1435 }
1436
1438 }
1439
1440
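  /// Delete __kmpc_fork_call calls whose outlined callee only reads memory
  /// and is guaranteed to return, i.e., parallel regions without observable
  /// side effects.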
1441 bool deleteParallelRegions() {
1442 const unsigned CallbackCalleeOperand = 2;
1443
1444 OMPInformationCache::RuntimeFunctionInfo &RFI =
1445 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1446
1447 if (!RFI.Declaration)
1448 return false;
1449
1451 auto DeleteCallCB = [&](Use &U, Function &) {
1452 CallInst *CI = getCallIfRegularCall(U);
1453 if (!CI)
1454 return false;
1457 if (!Fn)
1458 return false;
1459 if (!Fn->onlyReadsMemory())
1460 return false;
1461 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1462 return false;
1463
1464 LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
1466
1467 auto Remark = [&](OptimizationRemark OR) {
1468 return OR << "Removing parallel region with no side-effects.";
1469 };
1471
1474 ++NumOpenMPParallelRegionsDeleted;
1475 return true;
1476 };
1477
1478 RFI.foreachUse(SCC, DeleteCallCB);
1479
1481 }
1482
1483
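  /// Deduplicate calls to side-effect-free OpenMP runtime functions within a
  /// function. A sketch of the effect on the IR:
  ///
  ///   %a = call i32 @omp_get_thread_num()   ; kept (hoisted if movable)
  ///   ...
  ///   %b = call i32 @omp_get_thread_num()   ; deleted, uses rewired to %a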
1484 bool deduplicateRuntimeCalls() {
1486
1488 OMPRTL_omp_get_num_threads,
1489 OMPRTL_omp_in_parallel,
1490 OMPRTL_omp_get_cancellation,
1491 OMPRTL_omp_get_supported_active_levels,
1492 OMPRTL_omp_get_level,
1493 OMPRTL_omp_get_ancestor_thread_num,
1494 OMPRTL_omp_get_team_size,
1495 OMPRTL_omp_get_active_level,
1496 OMPRTL_omp_in_final,
1497 OMPRTL_omp_get_proc_bind,
1498 OMPRTL_omp_get_num_places,
1499 OMPRTL_omp_get_num_procs,
1500 OMPRTL_omp_get_place_num,
1501 OMPRTL_omp_get_partition_num_places,
1502 OMPRTL_omp_get_partition_place_nums};
1503
1504
1505 SmallSetVector<Value *, 16> GTIdArgs;
1506 collectGlobalThreadIdArguments(GTIdArgs);
1508 << " global thread ID arguments\n");
1509
1510 for (Function *F : SCC) {
1511 for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1512 Changed |= deduplicateRuntimeCalls(
1513 *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1514
1515
1516
1517 Value *GTIdArg = nullptr;
1518 for (Argument &Arg : F->args())
1519 if (GTIdArgs.count(&Arg)) {
1520 GTIdArg = &Arg;
1521 break;
1522 }
1523 Changed |= deduplicateRuntimeCalls(
1524 *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1525 }
1526
1528 }
1529
1530
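  /// Remove device runtime symbols that are known to be dead after linking,
  /// e.g., an unreferenced __llvm_rpc_client global.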
1531 bool removeRuntimeSymbols() {
1532
1533
1534
1535
1536 if (GlobalVariable *GV = M.getNamedGlobal("__llvm_rpc_client")) {
1537 if (GV->hasNUsesOrMore(1))
1538 return false;
1539
1541 GV->eraseFromParent();
1542 return true;
1543 }
1544 return false;
1545 }
1546
1547
1548
1549
1550
1551
1552
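  /// Split __tgt_target_data_begin_mapper calls into an "issue" and a "wait"
  /// part so host code that does not depend on the mapped data can overlap
  /// with the transfer. A sketch of the rewrite performed below:
  ///
  ///   call void @__tgt_target_data_begin_mapper(...)
  ///     ==>
  ///   %handle = alloca %struct.__tgt_async_info
  ///   call void @__tgt_target_data_begin_mapper_issue(..., %handle)
  ///   ; independent host code
  ///   call void @__tgt_target_data_begin_mapper_wait(%device_id, %handle)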
1553 bool hideMemTransfersLatency() {
1554 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1556 auto SplitMemTransfers = [&](Use &U, Function &Decl) {
1557 auto *RTCall = getCallIfRegularCall(U, &RFI);
1558 if (!RTCall)
1559 return false;
1560
1561 OffloadArray OffloadArrays[3];
1562 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1563 return false;
1564
1565 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1566
1567
1568 bool WasSplit = false;
1569 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1570 if (WaitMovementPoint)
1571 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1572
1574 return WasSplit;
1575 };
1576 if (OMPInfoCache.runtimeFnsAvailable(
1577 {OMPRTL___tgt_target_data_begin_mapper_issue,
1578 OMPRTL___tgt_target_data_begin_mapper_wait}))
1579 RFI.foreachUse(SCC, SplitMemTransfers);
1580
1582 }
1583
1584 void analysisGlobalization() {
1585 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1586
1587 auto CheckGlobalization = [&](Use &U, Function &Decl) {
1588 if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1589 auto Remark = [&](OptimizationRemarkMissed ORM) {
1590 return ORM
1591 << "Found thread data sharing on the GPU. "
1592 << "Expect degraded performance due to data globalization.";
1593 };
1595 }
1596
1597 return false;
1598 };
1599
1600 RFI.foreachUse(SCC, CheckGlobalization);
1601 }
1602
1603
1604
1605 bool getValuesInOffloadArrays(CallInst &RuntimeCall,
1607 assert(OAs.size() == 3 && "Need space for three offload arrays!");
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617 Value *BasePtrsArg =
1618 RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
1619
1620 Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
1621
1622 Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);
1623
1624
1627 return false;
1629 if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
1630 return false;
1631
1632
1635 return false;
1637 if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
1638 return false;
1639
1640
1642
1646 return false;
1647
1649 if (!OAs[2].initialize(*SizesArray, RuntimeCall))
1650 return false;
1651
1652 return true;
1653 }
1654
1655
1656
1657
1658
1660 assert(OAs.size() == 3 && "There are three offload arrays to debug!");
1661
1662 LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
1663 std::string ValuesStr;
1664 raw_string_ostream Printer(ValuesStr);
1665 std::string Separator = " --- ";
1666
1667 for (auto *BP : OAs[0].StoredValues) {
1670 }
1671 LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");
1672 ValuesStr.clear();
1673
1674 for (auto *P : OAs[1].StoredValues) {
1677 }
1678 LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << ValuesStr << "\n");
1679 ValuesStr.clear();
1680
1681 for (auto *S : OAs[2].StoredValues) {
1684 }
1685 LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
1686 }
1687
1688
1689
1690 Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
1691
1692
1693
1695 bool IsWorthIt = false;
1696 while ((CurrentI = CurrentI->getNextNode())) {
1697
1698
1699
1700
1702 if (IsWorthIt)
1703 return CurrentI;
1704
1705 return nullptr;
1706 }
1707
1708
1709
1710 IsWorthIt = true;
1711 }
1712
1713
1714 return RuntimeCall.getParent()->getTerminator();
1715 }
1716
1717
1718 bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
1719 Instruction &WaitMovementPoint) {
1720
1721
1722
1723 auto &IRBuilder = OMPInfoCache.OMPBuilder;
1726 IRBuilder.Builder.SetInsertPoint(&Entry,
1727 Entry.getFirstNonPHIOrDbgOrAlloca());
1729 IRBuilder.AsyncInfo, nullptr, "handle");
1730 Handle =
1731 IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr);
1732
1733
1734
1735
1736 FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
1737 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1738
1739
1740 SmallVector<Value *, 16> Args;
1741 for (auto &Arg : RuntimeCall.args())
1742 Args.push_back(Arg.get());
1743 Args.push_back(Handle);
1744
1745 CallInst *IssueCallsite = CallInst::Create(IssueDecl, Args, "",
1747 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1749
1750
1751
1752 FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
1753 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1754
1755 Value *WaitParams[2] = {
1757 OffloadArray::DeviceIDArgNum),
1758 Handle
1759 };
1761 WaitDecl, WaitParams, "", WaitMovementPoint.getIterator());
1762 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1763
1764 return true;
1765 }
1766
1767 static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
1768 bool GlobalOnly, bool &SingleChoice) {
1769 if (CurrentIdent == NextIdent)
1770 return CurrentIdent;
1771
1772
1773
1775 SingleChoice = !CurrentIdent;
1776 return NextIdent;
1777 }
1778 return nullptr;
1779 }
1780
1781
1782
1783
1784
1785
1787 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1788 Function &F, bool GlobalOnly) {
1789 bool SingleChoice = true;
1790 Value *Ident = nullptr;
1792 CallInst *CI = getCallIfRegularCall(U, &RFI);
1793 if (!CI || &F != &Caller)
1794 return false;
1795 Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
1796 true, SingleChoice);
1797 return false;
1798 };
1799 RFI.foreachUse(SCC, CombineIdentStruct);
1800
1801 if (!Ident || !SingleChoice) {
1802
1803
1804 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1805 OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
1806 &F.getEntryBlock(), F.getEntryBlock().begin()));
1807
1808
1809 uint32_t SrcLocStrSize;
1811 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1812 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1813 }
1814 return Ident;
1815 }
1816
1817
1818
1819 bool deduplicateRuntimeCalls(Function &F,
1820 OMPInformationCache::RuntimeFunctionInfo &RFI,
1821 Value *ReplVal = nullptr) {
1822 auto *UV = RFI.getUseVector(F);
1823 if (!UV || UV->size() + (ReplVal != nullptr) < 2)
1824 return false;
1825
1827 dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
1828 << (ReplVal ? " with an existing value\n" : "\n") << "\n");
1829
1832 "Unexpected replacement value!");
1833
1834
1835 auto CanBeMoved = [this](CallBase &CB) {
1836 unsigned NumArgs = CB.arg_size();
1837 if (NumArgs == 0)
1838 return true;
1839 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1840 return false;
1841 for (unsigned U = 1; U < NumArgs; ++U)
1843 return false;
1844 return true;
1845 };
1846
1847 if (!ReplVal) {
1848 auto *DT =
1849          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
1850 if (!DT)
1851 return false;
1853 for (Use *U : *UV) {
1854 if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1855 if (IP)
1857 else
1858 IP = CI;
1859 if (!CanBeMoved(*CI))
1860 continue;
1861 if (!ReplVal)
1862 ReplVal = CI;
1863 }
1864 }
1865 if (!ReplVal)
1866 return false;
1867 assert(IP && "Expected insertion point!");
1869 }
1870
1871
1872
1873
1877 Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
1878 true);
1880 }
1881 }
1882
1885 CallInst *CI = getCallIfRegularCall(U, &RFI);
1886 if (!CI || CI == ReplVal || &F != &Caller)
1887 return false;
1889
1890 auto Remark = [&](OptimizationRemark OR) {
1891 return OR << "OpenMP runtime call "
1892 << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
1893 };
1896 else
1898
1901 ++NumOpenMPRuntimeCallsDeduplicated;
1903 return true;
1904 };
1905 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1906
1908 }
1909
1910
1911  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
1912
1913
1914
1915
1916
1917
1918 auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
1919      if (!F.hasLocalLinkage())
1920 return false;
1921 for (Use &U : F.uses()) {
1922        if (CallInst *CI = getCallIfRegularCall(U)) {
1923          Value *ArgOp = CI->getArgOperand(ArgNo);
1924          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
1925 getCallIfRegularCall(
1926 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1927 continue;
1928 }
1929 return false;
1930 }
1931 return true;
1932 };
1933
1934
1935    auto AddUserArgs = [&](Value &GTId) {
1936      for (Use &U : GTId.uses())
1937        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
1938          if (CI->isArgOperand(&U))
1939            if (Function *Callee = CI->getCalledFunction())
1940              if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
1941 GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
1942 };
1943
1944
1945 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1946 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1947
1948 GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
1949 if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1950 AddUserArgs(*CI);
1951 return false;
1952 });
1953
1954
1955
1956
1957 for (unsigned U = 0; U < GTIdArgs.size(); ++U)
1958 AddUserArgs(*GTIdArgs[U]);
1959 }
1960
1961
1962
1963
1964
1965
1966  DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;
1967
1968
1969 Kernel getUniqueKernelFor(Function &F);
1970
1971
1972 Kernel getUniqueKernelFor(Instruction &I) {
1973 return getUniqueKernelFor(*I.getFunction());
1974 }
1975
1976
1977
1978 bool rewriteDeviceCodeStateMachine();
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994 template <typename RemarkKind, typename RemarkCallBack>
1995 void emitRemark(Instruction *I, StringRef RemarkName,
1996 RemarkCallBack &&RemarkCB) const {
1997 Function *F = I->getParent()->getParent();
1998 auto &ORE = OREGetter(F);
1999
2001 ORE.emit([&]() {
2002 return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
2003 << " [" << RemarkName << "]";
2004 });
2005 else
2006 ORE.emit(
2007 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
2008 }
2009
2010
2011 template <typename RemarkKind, typename RemarkCallBack>
2012 void emitRemark(Function *F, StringRef RemarkName,
2013 RemarkCallBack &&RemarkCB) const {
2014 auto &ORE = OREGetter(F);
2015
2017 ORE.emit([&]() {
2018 return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
2019 << " [" << RemarkName << "]";
2020 });
2021 else
2022 ORE.emit(
2023 [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
2024 }
2025
2026
2028
2029
2030 SmallVectorImpl<Function *> &SCC;
2031
2032
2033
2034 CallGraphUpdater &CGUpdater;
2035
2036
2037 OptimizationRemarkGetter OREGetter;
2038
2039
2040 OMPInformationCache &OMPInfoCache;
2041
2042
2043 Attributor &A;
2044
2045
2046 bool runAttributor(bool IsModulePass) {
2047 if (SCC.empty())
2048 return false;
2049
2050 registerAAs(IsModulePass);
2051
2053
2054 LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
2055 << " functions, result: " << Changed << ".\n");
2056
2057 if (Changed == ChangeStatus::CHANGED)
2058 OMPInfoCache.invalidateAnalyses();
2059
2060 return Changed == ChangeStatus::CHANGED;
2061 }
2062
2064
2065
2066
2067 void registerAAs(bool IsModulePass);
2068
2069public:
2070
2071
2072 static void registerAAsForFunction(Attributor &A, const Function &F);
2073};
2074
2075Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
2076  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2077 !OMPInfoCache.CGSCC->contains(&F))
2078 return nullptr;
2079
2080
2081 {
2082    std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
2083 if (CachedKernel)
2084 return *CachedKernel;
2085
2086
2087
2088
2090 CachedKernel = Kernel(&F);
2091 return *CachedKernel;
2092 }
2093
2094 CachedKernel = nullptr;
2095  if (!F.hasLocalLinkage()) {
2096
2097
2098 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2099 return ORA << "Potentially unknown OpenMP target region caller.";
2100 };
2102
2103 return nullptr;
2104 }
2105 }
2106
2107 auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
2109
2110 if (Cmp->isEquality())
2111 return getUniqueKernelFor(*Cmp);
2112 return nullptr;
2113 }
2115
2116 if (CB->isCallee(&U))
2117 return getUniqueKernelFor(*CB);
2118
2119 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2120 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2121
2122 if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
2123 return getUniqueKernelFor(*CB);
2124 return nullptr;
2125 }
2126
2127 return nullptr;
2128 };
2129
2130
2131 SmallPtrSet<Kernel, 2> PotentialKernels;
2132 OMPInformationCache::foreachUse(F, [&](const Use &U) {
2133 PotentialKernels.insert(GetUniqueKernelForUse(U));
2134 });
2135
2137 if (PotentialKernels.size() == 1)
2138 K = *PotentialKernels.begin();
2139
2140
2142
2143 return K;
2144}
2145
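// If a parallel region (the outlined wrapper passed to __kmpc_parallel_51) is
// reachable from exactly one kernel, replace its address in the state-machine
// related uses by a unique global "ID" so the generic-mode state machine can
// compare against that ID instead of an unknown function pointer.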
2146bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2147 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2148 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2149
2151 if (!KernelParallelRFI)
2153
2154
2157
2158 for (Function *F : SCC) {
2159
2160
2161
2162 bool UnknownUse = false;
2163 bool KernelParallelUse = false;
2164 unsigned NumDirectCalls = 0;
2165
2167 OMPInformationCache::foreachUse(*F, [&](Use &U) {
2169 if (CB->isCallee(&U)) {
2170 ++NumDirectCalls;
2171 return;
2172 }
2173
2175 ToBeReplacedStateMachineUses.push_back(&U);
2176 return;
2177 }
2178
2179
2180 CallInst *CI =
2181 OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
2182 const unsigned int WrapperFunctionArgNo = 6;
2183 if (!KernelParallelUse && CI &&
2185 KernelParallelUse = true;
2186 ToBeReplacedStateMachineUses.push_back(&U);
2187 return;
2188 }
2189 UnknownUse = true;
2190 });
2191
2192
2193
2194 if (!KernelParallelUse)
2195 continue;
2196
2197
2198
2199
2200 if (UnknownUse || NumDirectCalls != 1 ||
2201 ToBeReplacedStateMachineUses.size() > 2) {
2202 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2203 return ORA << "Parallel region is used in "
2204 << (UnknownUse ? "unknown" : "unexpected")
2205 << " ways. Will not attempt to rewrite the state machine.";
2206 };
2208 continue;
2209 }
2210
2211
2212
2213 Kernel K = getUniqueKernelFor(*F);
2214 if (!K) {
2215 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
2216 return ORA << "Parallel region is not called from a unique kernel. "
2217 "Will not attempt to rewrite the state machine.";
2218 };
2220 continue;
2221 }
2222
2223
2224
2225
2226
2227
2228 Module &M = *F->getParent();
2229 Type *Int8Ty = Type::getInt8Ty(M.getContext());
2230
2231 auto *ID = new GlobalVariable(
2234
2235 for (Use *U : ToBeReplacedStateMachineUses)
2237 ID, U->get()->getType()));
2238
2239 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2240
2242 }
2243
2245}
2246
2247
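/// Abstract Attribute interface used to track the (assumed) value of an OpenMP
/// internal control variable (ICV) at program points, so that getter calls can
/// later be folded to a known value.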
2248struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
2249 using Base = StateWrapper<BooleanState, AbstractAttribute>;
2250 AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
2251
2252
2253 bool isAssumedTracked() const { return getAssumed(); }
2254
2255
2256 bool isKnownTracked() const { return getAssumed(); }
2257
2258
2259 static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
2260
2261
2262 virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,
2263 const Instruction *I,
2264 Attributor &A) const {
2265 return std::nullopt;
2266 }
2267
2268
2269
2270
2271 virtual std::optional<Value *>
2273
2274
2275
2277
2278
2279 StringRef getName() const override { return "AAICVTracker"; }
2280
2281
2282 const char *getIdAddr() const override { return &ID; }
2283
2284
2285 static bool classof(const AbstractAttribute *AA) {
2287 }
2288
2289 static const char ID;
2290};
2291
2292struct AAICVTrackerFunction : public AAICVTracker {
2293 AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
2294 : AAICVTracker(IRP, A) {}
2295
2296
2297 const std::string getAsStr(Attributor *) const override {
2298 return "ICVTrackerFunction";
2299 }
2300
2301
2302 void trackStatistics() const override {}
2303
2304
2306 return ChangeStatus::UNCHANGED;
2307 }
2308
2309
2310 EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
2311 InternalControlVar::ICV___last>
2312 ICVReplacementValuesMap;
2313
2314 ChangeStatus updateImpl(Attributor &A) override {
2315 ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
2316
2318
2319 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2320
2322 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2323
2324 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2325 auto TrackValues = [&](Use &U, Function &) {
2326 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2327 if (!CI)
2328 return false;
2329
2330
2331
2332 if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
2333 HasChanged = ChangeStatus::CHANGED;
2334
2335 return false;
2336 };
2337
2339 std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
2340 if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
2341 HasChanged = ChangeStatus::CHANGED;
2342
2343 return true;
2344 };
2345
2346
2347 SetterRFI.foreachUse(TrackValues, F);
2348
2349 bool UsedAssumedInformation = false;
2350 A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
2351 UsedAssumedInformation,
2352 true);
2353
2354
2355
2357 if (HasChanged == ChangeStatus::CHANGED)
2358 ValuesMap.try_emplace(Entry);
2359 }
2360
2361 return HasChanged;
2362 }
2363
2364
2365
2366  std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
2367                                         InternalControlVar &ICV) const {
2368
2369    const auto *CB = dyn_cast<CallBase>(&I);
2370    if (!CB || CB->hasFnAttr("no_openmp") ||
2371 CB->hasFnAttr("no_openmp_routines") ||
2372 CB->hasFnAttr("no_openmp_constructs"))
2373 return std::nullopt;
2374
2375 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2376 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2377 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2378 Function *CalledFunction = CB->getCalledFunction();
2379
2380
2381 if (CalledFunction == nullptr)
2382 return nullptr;
2383 if (CalledFunction == GetterRFI.Declaration)
2384 return std::nullopt;
2385 if (CalledFunction == SetterRFI.Declaration) {
2386 if (ICVReplacementValuesMap[ICV].count(&I))
2387 return ICVReplacementValuesMap[ICV].lookup(&I);
2388
2389 return nullptr;
2390 }
2391
2392
2394 return nullptr;
2395
2396    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2398
2399 if (ICVTrackingAA->isAssumedTracked()) {
2400 std::optional<Value *> URV =
2401 ICVTrackingAA->getUniqueReplacementValue(ICV);
2403 OMPInfoCache)))
2404 return URV;
2405 }
2406
2407
2408 return nullptr;
2409 }
2410
2411
2412 std::optional<Value *>
2414 return std::nullopt;
2415 }
2416
2417
2419 const Instruction *I,
2420 Attributor &A) const override {
2421 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2422 if (ValuesMap.count(I))
2423 return ValuesMap.lookup(I);
2424
2426 SmallPtrSet<const Instruction *, 16> Visited;
2428
2429 std::optional<Value *> ReplVal;
2430
2431 while (!Worklist.empty()) {
2433 if (!Visited.insert(CurrInst).second)
2434 continue;
2435
2437
2438
2439
2440 while ((CurrInst = CurrInst->getPrevNode())) {
2441 if (ValuesMap.count(CurrInst)) {
2442 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2443
2444 if (!ReplVal) {
2445 ReplVal = NewReplVal;
2446 break;
2447 }
2448
2449
2450 if (NewReplVal)
2451 if (ReplVal != NewReplVal)
2452 return nullptr;
2453
2454 break;
2455 }
2456
2457 std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
2458 if (!NewReplVal)
2459 continue;
2460
2461
2462 if (!ReplVal) {
2463 ReplVal = NewReplVal;
2464 break;
2465 }
2466
2467
2468
2469 if (ReplVal != NewReplVal)
2470 return nullptr;
2471 }
2472
2473
2474 if (CurrBB == I->getParent() && ReplVal)
2475 return ReplVal;
2476
2477
2478 for (const BasicBlock *Pred : predecessors(CurrBB))
2479 if (const Instruction *Terminator = Pred->getTerminator())
2481 }
2482
2483 return ReplVal;
2484 }
2485};
2486
2487struct AAICVTrackerFunctionReturned : AAICVTracker {
2488 AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
2489 : AAICVTracker(IRP, A) {}
2490
2491
2492 const std::string getAsStr(Attributor *) const override {
2493 return "ICVTrackerFunctionReturned";
2494 }
2495
2496
2497 void trackStatistics() const override {}
2498
2499
2501 return ChangeStatus::UNCHANGED;
2502 }
2503
2504
2505  EnumeratedArray<std::optional<Value *>, InternalControlVar,
2506                  InternalControlVar::ICV___last>
2507 ICVReplacementValuesMap;
2508
2509
2510 std::optional<Value *>
2512 return ICVReplacementValuesMap[ICV];
2513 }
2514
2515 ChangeStatus updateImpl(Attributor &A) override {
2517    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2519
2520 if (!ICVTrackingAA->isAssumedTracked())
2521 return indicatePessimisticFixpoint();
2522
2524 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2525 std::optional<Value *> UniqueICVValue;
2526
2527 auto CheckReturnInst = [&](Instruction &I) {
2528 std::optional<Value *> NewReplVal =
2529 ICVTrackingAA->getReplacementValue(ICV, &I, A);
2530
2531
2532 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2533 return false;
2534
2535 UniqueICVValue = NewReplVal;
2536
2537 return true;
2538 };
2539
2540 bool UsedAssumedInformation = false;
2541      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
2542 UsedAssumedInformation,
2543 true))
2544 UniqueICVValue = nullptr;
2545
2546 if (UniqueICVValue == ReplVal)
2547 continue;
2548
2549 ReplVal = UniqueICVValue;
2550 Changed = ChangeStatus::CHANGED;
2551 }
2552
2554 }
2555};
2556
2557struct AAICVTrackerCallSite : AAICVTracker {
2558 AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
2559 : AAICVTracker(IRP, A) {}
2560
2561 void initialize(Attributor &A) override {
2562 assert(getAnchorScope() && "Expected anchor function");
2563
2564
2565
2566 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2568 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2569 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2570 if (Getter.Declaration == getAssociatedFunction()) {
2571 AssociatedICV = ICVInfo.Kind;
2572 return;
2573 }
2574 }
2575
2576
2577 indicatePessimisticFixpoint();
2578 }
2579
2581 if (!ReplVal || !*ReplVal)
2582 return ChangeStatus::UNCHANGED;
2583
2585 A.deleteAfterManifest(*getCtxI());
2586
2587 return ChangeStatus::CHANGED;
2588 }
2589
2590
2591 const std::string getAsStr(Attributor *) const override {
2592 return "ICVTrackerCallSite";
2593 }
2594
2595
2596 void trackStatistics() const override {}
2597
2599 std::optional<Value *> ReplVal;
2600
2601 ChangeStatus updateImpl(Attributor &A) override {
2602    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2604
2605
2606 if (!ICVTrackingAA->isAssumedTracked())
2607 return indicatePessimisticFixpoint();
2608
2609 std::optional<Value *> NewReplVal =
2610 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);
2611
2612 if (ReplVal == NewReplVal)
2613 return ChangeStatus::UNCHANGED;
2614
2615 ReplVal = NewReplVal;
2616 return ChangeStatus::CHANGED;
2617 }
2618
2619
2620
2621 std::optional<Value *>
2623 return ReplVal;
2624 }
2625};
2626
2627struct AAICVTrackerCallSiteReturned : AAICVTracker {
2628 AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
2629 : AAICVTracker(IRP, A) {}
2630
2631
2632 const std::string getAsStr(Attributor *) const override {
2633 return "ICVTrackerCallSiteReturned";
2634 }
2635
2636
2637 void trackStatistics() const override {}
2638
2639
2641 return ChangeStatus::UNCHANGED;
2642 }
2643
2644
2645  EnumeratedArray<std::optional<Value *>, InternalControlVar,
2646                  InternalControlVar::ICV___last>
2647 ICVReplacementValuesMap;
2648
2649
2650
2651 std::optional<Value *>
2653 return ICVReplacementValuesMap[ICV];
2654 }
2655
2656 ChangeStatus updateImpl(Attributor &A) override {
2658    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
2660 DepClassTy::REQUIRED);
2661
2662
2663 if (!ICVTrackingAA->isAssumedTracked())
2664 return indicatePessimisticFixpoint();
2665
2667 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2668 std::optional<Value *> NewReplVal =
2669 ICVTrackingAA->getUniqueReplacementValue(ICV);
2670
2671 if (ReplVal == NewReplVal)
2672 continue;
2673
2674 ReplVal = NewReplVal;
2675 Changed = ChangeStatus::CHANGED;
2676 }
2678 }
2679};
2680
2681
2682
2683static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
2685 return true;
2688 return false;
2689 return hasFunctionEndAsUniqueSuccessor(Successor);
2690}
2691
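/// Per-function execution-domain analysis: tracks which blocks are executed by
/// the initial thread only and which regions are bracketed by aligned
/// barriers, enabling removal of redundant barriers and no-op fences in
/// manifest().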
2692struct AAExecutionDomainFunction : public AAExecutionDomain {
2693 AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
2694 : AAExecutionDomain(IRP, A) {}
2695
2696 ~AAExecutionDomainFunction() override { delete RPOT; }
2697
2698 void initialize(Attributor &A) override {
2700 assert(F && "Expected anchor function");
2701 RPOT = new ReversePostOrderTraversal<Function *>(F);
2702 }
2703
2704 const std::string getAsStr(Attributor *) const override {
2705 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2706 for (auto &It : BEDMap) {
2707 if (!It.getFirst())
2708 continue;
2709 TotalBlocks++;
2710 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2711 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2712 It.getSecond().IsReachingAlignedBarrierOnly;
2713 }
2714 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
2715 std::to_string(AlignedBlocks) + " of " +
2716 std::to_string(TotalBlocks) +
2717 " executed by initial thread / aligned";
2718 }
2719
2720
2721 void trackStatistics() const override {}
2722
2725 for (const BasicBlock &BB : *getAnchorScope()) {
2726 if (!isExecutedByInitialThreadOnly(BB))
2727 continue;
2728 dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
2729 << BB.getName() << " is executed by a single thread.\n";
2730 }
2731 });
2732
2734
2737
2738 SmallPtrSet<CallBase *, 16> DeletedBarriers;
2739 auto HandleAlignedBarrier = [&](CallBase *CB) {
2740 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];
2741 if (!ED.IsReachedFromAlignedBarrierOnly ||
2742 ED.EncounteredNonLocalSideEffect)
2743 return;
2744 if (!ED.EncounteredAssumes.empty() && .isModulePass())
2745 return;
2746
2747
2748
2749
2750
2751
2752
2753
2754 if (CB) {
2755 DeletedBarriers.insert(CB);
2756 A.deleteAfterManifest(*CB);
2757 ++NumBarriersEliminated;
2758 Changed = ChangeStatus::CHANGED;
2759 } else if (!ED.AlignedBarriers.empty()) {
2760 Changed = ChangeStatus::CHANGED;
2762 ED.AlignedBarriers.end());
2763 SmallSetVector<CallBase *, 16> Visited;
2764 while (!Worklist.empty()) {
2766 if (!Visited.insert(LastCB))
2767 continue;
2768 if (LastCB->getFunction() != getAnchorScope())
2769 continue;
2770 if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))
2771 continue;
2772 if (!DeletedBarriers.count(LastCB)) {
2773 ++NumBarriersEliminated;
2774 A.deleteAfterManifest(*LastCB);
2775 continue;
2776 }
2777
2778
2779
2780 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2781 Worklist.append(LastED.AlignedBarriers.begin(),
2782 LastED.AlignedBarriers.end());
2783 }
2784 }
2785
2786
2787
2788 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2789 for (auto *AssumeCB : ED.EncounteredAssumes)
2790 A.deleteAfterManifest(*AssumeCB);
2791 };
2792
2793 for (auto *CB : AlignedBarriers)
2794 HandleAlignedBarrier(CB);
2795
2796
2798 HandleAlignedBarrier(nullptr);
2799
2801 }
2802
2803 bool isNoOpFence(const FenceInst &FI) const override {
2804 return getState().isValidState() && !NonNoOpFences.count(&FI);
2805 }
2806
2807
2808
2809 void
2810 mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
2811 const ExecutionDomainTy &PredED);
2812
2813
2814
2815
2816 bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
2817 const ExecutionDomainTy &PredED,
2818 bool InitialEdgeOnly = false);
2819
2820
2821 bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);
2822
2823
2824 ChangeStatus updateImpl(Attributor &A) override;
2825
2826
2827
2828 bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
2829 if (!isValidState())
2830 return false;
2831 assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
2832 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2833 }
2834
2835 bool isExecutedInAlignedRegion(Attributor &A,
2836 const Instruction &I) const override {
2837 assert(I.getFunction() == getAnchorScope() &&
2838 "Instruction is out of scope!");
2839 if (!isValidState())
2840 return false;
2841
2842 bool ForwardIsOk = true;
2844
2845
2846 CurI = &I;
2847 do {
2849 if (!CB)
2850 continue;
2851 if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
2852 return true;
2853 const auto &It = CEDMap.find({CB, PRE});
2854 if (It == CEDMap.end())
2855 continue;
2856 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2857 ForwardIsOk = false;
2858 break;
2860
2861 if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
2862 ForwardIsOk = false;
2863
2864
2865 CurI = &I;
2866 do {
2868 if (!CB)
2869 continue;
2870 if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
2871 return true;
2872 const auto &It = CEDMap.find({CB, POST});
2873 if (It == CEDMap.end())
2874 continue;
2875 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2876 break;
2877 return false;
2879
2880
2881
2882 if (!ForwardIsOk)
2883 return false;
2884
2885 if (!CurI) {
2888 return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;
2890 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2891 })) {
2892 return false;
2893 }
2894 }
2895
2896
2897 return true;
2898 }
2899
2900 ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
2901 assert(isValidState() &&
2902 "No request should be made against an invalid state!");
2903 return BEDMap.lookup(&BB);
2904 }
2905 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2906 getExecutionDomain(const CallBase &CB) const override {
2907 assert(isValidState() &&
2908 "No request should be made against an invalid state!");
2909 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2910 }
2911 ExecutionDomainTy getFunctionExecutionDomain() const override {
2912 assert(isValidState() &&
2913 "No request should be made against an invalid state!");
2914 return InterProceduralED;
2915 }
2916
2917
2918
2919
2920 static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
2921 BasicBlock &SuccessorBB) {
2922 if ( ||
->isConditional())
2923 return false;
2924 if (Edge->getSuccessor(0) != &SuccessorBB)
2925 return false;
2926
2928 if (!Cmp || ->isTrueWhenEqual() ||
->isEquality())
2929 return false;
2930
2932 if ()
2933 return false;
2934
2935
2936 if (C->isAllOnesValue()) {
2938 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
2939 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2940 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2941 if (!CB)
2942 return false;
2943 ConstantStruct *KernelEnvC =
2945 ConstantInt *ExecModeC =
2946 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2948 }
2949
2950 if (C->isZero()) {
2951
2953 if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2954 return true;
2955
2956
2958 if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2959 return true;
2960 }
2961
2962 return false;
2963 };
2964
2965
2966 ExecutionDomainTy InterProceduralED;
2967
2968 enum Direction { PRE = 0, POST = 1 };
2969
2970 DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
2971 DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>
2972 CEDMap;
2973 SmallSetVector<CallBase *, 16> AlignedBarriers;
2974
2975 ReversePostOrderTraversal<Function *> *RPOT = nullptr;
2976
2977
2978 static bool setAndRecord(bool &R, bool V) {
2981 return !Eq;
2982 }
2983
2984
2985
2986 SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
2987};
2988
2989void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2990 Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
2991 for (auto *EA : PredED.EncounteredAssumes)
2992 ED.addAssumeInst(A, *EA);
2993
2994 for (auto *AB : PredED.AlignedBarriers)
2995 ED.addAlignedBarrier(A, *AB);
2996}
2997
2998bool AAExecutionDomainFunction::mergeInPredecessor(
2999 Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
3000 bool InitialEdgeOnly) {
3001
3004 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3005 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3006 ED.IsExecutedByInitialThreadOnly));
3007
3008 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3009 ED.IsReachedFromAlignedBarrierOnly &&
3010 PredED.IsReachedFromAlignedBarrierOnly);
3011 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3012 ED.EncounteredNonLocalSideEffect |
3013 PredED.EncounteredNonLocalSideEffect);
3014
3015 if (ED.IsReachedFromAlignedBarrierOnly)
3016 mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
3017 else
3018 ED.clearAssumeInstAndAlignedBarriers();
3020}
3021
3022bool AAExecutionDomainFunction::handleCallees(Attributor &A,
3023 ExecutionDomainTy &EntryBBED) {
3025 auto PredForCallSite = [&](AbstractCallSite ACS) {
3026 const auto *EDAA = A.getAAFor(
3028 DepClassTy::OPTIONAL);
3029 if (!EDAA || !EDAA->getState().isValidState())
3030 return false;
3032 EDAA->getExecutionDomain(*cast(ACS.getInstruction())));
3033 return true;
3034 };
3035
3036 ExecutionDomainTy ExitED;
3037 bool AllCallSitesKnown;
3038 if (A.checkForAllCallSites(PredForCallSite, *this,
3039 true,
3040 AllCallSitesKnown)) {
3041 for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
3042 mergeInPredecessor(A, EntryBBED, CSInED);
3043 ExitED.IsReachingAlignedBarrierOnly &=
3044 CSOutED.IsReachingAlignedBarrierOnly;
3045 }
3046
3047 } else {
3048
3049
3051 EntryBBED.IsExecutedByInitialThreadOnly = false;
3052 EntryBBED.IsReachedFromAlignedBarrierOnly = true;
3053 EntryBBED.EncounteredNonLocalSideEffect = false;
3054 ExitED.IsReachingAlignedBarrierOnly = false;
3055 } else {
3056 EntryBBED.IsExecutedByInitialThreadOnly = false;
3057 EntryBBED.IsReachedFromAlignedBarrierOnly = false;
3058 EntryBBED.EncounteredNonLocalSideEffect = true;
3059 ExitED.IsReachingAlignedBarrierOnly = false;
3060 }
3061 }
3062
3064 auto &FnED = BEDMap[nullptr];
3065 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3066 FnED.IsReachedFromAlignedBarrierOnly &
3067 EntryBBED.IsReachedFromAlignedBarrierOnly);
3068 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3069 FnED.IsReachingAlignedBarrierOnly &
3070 ExitED.IsReachingAlignedBarrierOnly);
3071 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3072 EntryBBED.IsExecutedByInitialThreadOnly);
3074}
3075
3076ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
3077
3079
3080
3081
3082
3083 auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
3084 Changed |= AlignedBarriers.insert(&CB);
3085
3086 auto &CallInED = CEDMap[{&CB, PRE}];
3087 Changed |= mergeInPredecessor(A, CallInED, ED);
3088 CallInED.IsReachingAlignedBarrierOnly = true;
3089
3090 ED.EncounteredNonLocalSideEffect = false;
3091 ED.IsReachedFromAlignedBarrierOnly = true;
3092
3093 ED.clearAssumeInstAndAlignedBarriers();
3094 ED.addAlignedBarrier(A, CB);
3095 auto &CallOutED = CEDMap[{&CB, POST}];
3096 Changed |= mergeInPredecessor(A, CallOutED, ED);
3097 };
3098
3099 auto *LivenessAA =
3100 A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL);
3101
3103 BasicBlock &EntryBB = F->getEntryBlock();
3105
3107 for (auto &RIt : *RPOT) {
3109
3110 bool IsEntryBB = &BB == &EntryBB;
3111
3112
3113 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3114 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3115 ExecutionDomainTy ED;
3116
3117 if (IsEntryBB) {
3118 Changed |= handleCallees(A, ED);
3119 } else {
3120
3121
3122 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3123 continue;
3124
3126 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3127 continue;
3128 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3130 mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
3131 }
3132 }
3133
3134
3135
3136 for (Instruction &I : BB) {
3137 bool UsedAssumedInformation;
3138 if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,
3139 false, DepClassTy::OPTIONAL,
3140 true))
3141 continue;
3142
3143
3144
3147 ED.addAssumeInst(A, *AI);
3148 continue;
3149 }
3150
3151 if (II->isAssumeLikeIntrinsic())
3152 continue;
3153 }
3154
3156 if (!ED.EncounteredNonLocalSideEffect) {
3157
3158 if (ED.IsReachedFromAlignedBarrierOnly)
3159 continue;
3160
3161
3163 case AtomicOrdering::NotAtomic:
3164 continue;
3165 case AtomicOrdering::Unordered:
3166 continue;
3167 case AtomicOrdering::Monotonic:
3168 continue;
3169 case AtomicOrdering::Acquire:
3170 break;
3171 case AtomicOrdering::Release:
3172 continue;
3173 case AtomicOrdering::AcquireRelease:
3174 break;
3175 case AtomicOrdering::SequentiallyConsistent:
3176 break;
3177 };
3178 }
3179 NonNoOpFences.insert(FI);
3180 }
3181
3184 bool IsAlignedBarrier =
3185 !IsNoSync && CB &&
3187
3188 AlignedBarrierLastInBlock &= IsNoSync;
3189 IsExplicitlyAligned &= IsNoSync;
3190
3191
3192
3193
3194 if (CB) {
3195 if (IsAlignedBarrier) {
3196 HandleAlignedBarrier(*CB, ED);
3197 AlignedBarrierLastInBlock = true;
3198 IsExplicitlyAligned = true;
3199 continue;
3200 }
3201
3202
3204 if (!ED.EncounteredNonLocalSideEffect &&
3206 ED.EncounteredNonLocalSideEffect = true;
3207 if (!IsNoSync) {
3208 ED.IsReachedFromAlignedBarrierOnly = false;
3210 }
3211 continue;
3212 }
3213
3214
3215
3216 auto &CallInED = CEDMap[{CB, PRE}];
3217 Changed |= mergeInPredecessor(A, CallInED, ED);
3218
3219
3220
3221
3223 if (!IsNoSync && Callee && ->isDeclaration()) {
3224 const auto *EDAA = A.getAAFor(
3226 if (EDAA && EDAA->getState().isValidState()) {
3227 const auto &CalleeED = EDAA->getFunctionExecutionDomain();
3228 ED.IsReachedFromAlignedBarrierOnly =
3229 CalleeED.IsReachedFromAlignedBarrierOnly;
3230 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3231 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3232 ED.EncounteredNonLocalSideEffect |=
3233 CalleeED.EncounteredNonLocalSideEffect;
3234 else
3235 ED.EncounteredNonLocalSideEffect =
3236 CalleeED.EncounteredNonLocalSideEffect;
3237 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3239 setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
3241 }
3242 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3243 mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
3244 auto &CallOutED = CEDMap[{CB, POST}];
3245 Changed |= mergeInPredecessor(A, CallOutED, ED);
3246 continue;
3247 }
3248 }
3249 if (!IsNoSync) {
3250 ED.IsReachedFromAlignedBarrierOnly = false;
3251 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
3253 }
3254 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3256 auto &CallOutED = CEDMap[{CB, POST}];
3257 Changed |= mergeInPredecessor(A, CallOutED, ED);
3258 }
3259
3260 if (.mayHaveSideEffects() &&
.mayReadFromMemory())
3261 continue;
3262
3263
3264
3265 if (CB) {
3266 const auto *MemAA = A.getAAFor(
3268
3273 };
3274 if (MemAA && MemAA->getState().isValidState() &&
3275 MemAA->checkForAllAccessesToMemoryKind(
3277 continue;
3278 }
3279
3280 auto &InfoCache = A.getInfoCache();
3281 if (.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))
3282 continue;
3283
3285 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3286 continue;
3287
3288 if (!ED.EncounteredNonLocalSideEffect &&
3290 ED.EncounteredNonLocalSideEffect = true;
3291 }
3292
3293 bool IsEndAndNotReachingAlignedBarriersOnly = false;
3295 !BB.getTerminator()->getNumSuccessors()) {
3296
3297 Changed |= mergeInPredecessor(A, InterProceduralED, ED);
3298
3299 auto &FnED = BEDMap[nullptr];
3300 if (IsKernel && !IsExplicitlyAligned)
3301 FnED.IsReachingAlignedBarrierOnly = false;
3302 Changed |= mergeInPredecessor(A, FnED, ED);
3303
3304 if (!FnED.IsReachingAlignedBarrierOnly) {
3305 IsEndAndNotReachingAlignedBarriersOnly = true;
3306 SyncInstWorklist.push_back(BB.getTerminator());
3307 auto &BBED = BEDMap[&BB];
3308 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);
3309 }
3310 }
3311
3312 ExecutionDomainTy &StoredED = BEDMap[&BB];
3313 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3314 !IsEndAndNotReachingAlignedBarriersOnly;
3315
3316
3317
3318
3319
3320 if (ED.IsExecutedByInitialThreadOnly !=
3321 StoredED.IsExecutedByInitialThreadOnly ||
3322 ED.IsReachedFromAlignedBarrierOnly !=
3323 StoredED.IsReachedFromAlignedBarrierOnly ||
3324 ED.EncounteredNonLocalSideEffect !=
3325 StoredED.EncounteredNonLocalSideEffect)
3327
3328
3329 StoredED = std::move(ED);
3330 }
3331
3332
3333
3334 SmallSetVector<BasicBlock *, 16> Visited;
3335 while (!SyncInstWorklist.empty()) {
3338 bool HitAlignedBarrierOrKnownEnd = false;
3339 while ((CurInst = CurInst->getPrevNode())) {
3341 if (!CB)
3342 continue;
3343 auto &CallOutED = CEDMap[{CB, POST}];
3344 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);
3345 auto &CallInED = CEDMap[{CB, PRE}];
3346 HitAlignedBarrierOrKnownEnd =
3347 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3348 if (HitAlignedBarrierOrKnownEnd)
3349 break;
3350 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
3351 }
3352 if (HitAlignedBarrierOrKnownEnd)
3353 continue;
3356 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3357 continue;
3358 if (!Visited.insert(PredBB))
3359 continue;
3360 auto &PredED = BEDMap[PredBB];
3361 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {
3363 SyncInstWorklist.push_back(PredBB->getTerminator());
3364 }
3365 }
3366 if (SyncBB != &EntryBB)
3367 continue;
3369 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);
3370 }
3371
3372 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3373}
3374
3375
3376
3377struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
3378 using Base = StateWrapper<BooleanState, AbstractAttribute>;
3379 AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3380
3381
3382 static AAHeapToShared &createForPosition(const IRPosition &IRP,
3383 Attributor &A);
3384
3385
3386 virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;
3387
3388
3389
3390 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;
3391
3392
3393 StringRef getName() const override { return "AAHeapToShared"; }
3394
3395
3396 const char *getIdAddr() const override { return &ID; }
3397
3398
3399
3400 static bool classof(const AbstractAttribute *AA) {
3402 }
3403
3404
3405 static const char ID;
3406};
3407
3408struct AAHeapToSharedFunction : public AAHeapToShared {
3409 AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
3410 : AAHeapToShared(IRP, A) {}
3411
3412 const std::string getAsStr(Attributor *) const override {
3413 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3414 " malloc calls eligible.";
3415 }
3416
3417
3418 void trackStatistics() const override {}
3419
3420
3421
3422 void findPotentialRemovedFreeCalls(Attributor &A) {
3423 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3424 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3425
3426 PotentialRemovedFreeCalls.clear();
3427
3428 for (CallBase *CB : MallocCalls) {
3430 for (auto *U : CB->users()) {
3432 if (C && C->getCalledFunction() == FreeRFI.Declaration)
3434 }
3435
3436 if (FreeCalls.size() != 1)
3437 continue;
3438
3439 PotentialRemovedFreeCalls.insert(FreeCalls.front());
3440 }
3441 }
3442
3443 void initialize(Attributor &A) override {
3445 indicatePessimisticFixpoint();
3446 return;
3447 }
3448
3449 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3450 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3451 if (!RFI.Declaration)
3452 return;
3453
3455 [](const IRPosition &, const AbstractAttribute *,
3456 bool &) -> std::optional<Value *> { return nullptr; };
3457
3459 for (User *U : RFI.Declaration->users())
3462 continue;
3463 MallocCalls.insert(CB);
3465 SCB);
3466 }
3467
3468 findPotentialRemovedFreeCalls(A);
3469 }
3470
3471 bool isAssumedHeapToShared(CallBase &CB) const override {
3472 return isValidState() && MallocCalls.count(&CB);
3473 }
3474
3475 bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
3476 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3477 }
3478
3480 if (MallocCalls.empty())
3481 return ChangeStatus::UNCHANGED;
3482
3483 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3484 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3485
3488 DepClassTy::OPTIONAL);
3489
3491 for (CallBase *CB : MallocCalls) {
3492
3493 if (HS && HS->isAssumedHeapToStack(*CB))
3494 continue;
3495
3496
3498 for (auto *U : CB->users()) {
3500 if (C && C->getCalledFunction() == FreeCall.Declaration)
3502 }
3503 if (FreeCalls.size() != 1)
3504 continue;
3505
3507
3508 if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
3510 << " with shared memory."
3511 << " Shared memory usage is limited to "
3513 continue;
3514 }
3515
3516 LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
3517 << " with " << AllocSize->getZExtValue()
3518 << " bytes of shared memory\n");
3519
3520
3521
3523 Type *Int8Ty = Type::getInt8Ty(M->getContext());
3524 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3525 auto *SharedMem = new GlobalVariable(
3529 static_cast<unsigned>(AddressSpace::Shared));
3531 SharedMem, PointerType::getUnqual(M->getContext()));
3532
3533 auto Remark = [&](OptimizationRemark OR) {
3534 return OR << "Replaced globalized variable with "
3535 << ore::NV("SharedMemory", AllocSize->getZExtValue())
3536 << (AllocSize->isOne() ? " byte " : " bytes ")
3537 << "of shared memory.";
3538 };
3539 A.emitRemark(CB, "OMP111", Remark);
3540
3541 MaybeAlign Alignment = CB->getRetAlign();
3543 "HeapToShared on allocation without alignment attribute");
3544 SharedMem->setAlignment(*Alignment);
3545
3547 A.deleteAfterManifest(*CB);
3548 A.deleteAfterManifest(*FreeCalls.front());
3549
3550 SharedMemoryUsed += AllocSize->getZExtValue();
3551 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3552 Changed = ChangeStatus::CHANGED;
3553 }
3554
3556 }
3557
3558 ChangeStatus updateImpl(Attributor &A) override {
3559 if (MallocCalls.empty())
3560 return indicatePessimisticFixpoint();
3561 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3562 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3563 if (!RFI.Declaration)
3564 return ChangeStatus::UNCHANGED;
3565
3567
3568 auto NumMallocCalls = MallocCalls.size();
3569
3570
3571 for (User *U : RFI.Declaration->users()) {
3573 if (CB->getCaller() != F)
3574 continue;
3575 if (!MallocCalls.count(CB))
3576 continue;
3578 MallocCalls.remove(CB);
3579 continue;
3580 }
3581 const auto *ED = A.getAAFor(
3583 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3584 MallocCalls.remove(CB);
3585 }
3586 }
3587
3588 findPotentialRemovedFreeCalls(A);
3589
3590 if (NumMallocCalls != MallocCalls.size())
3591 return ChangeStatus::CHANGED;
3592
3593 return ChangeStatus::UNCHANGED;
3594 }
3595
3596
3597 SmallSetVector<CallBase *, 4> MallocCalls;
3598
3599 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3600
3601 unsigned SharedMemoryUsed = 0;
3602};
3603
3604struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
3605 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3606 AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3607
3608
3609
3610 static bool requiresCalleeForCallBase() { return false; }
3611
3612
3613 void trackStatistics() const override {}
3614
3615
3616 const std::string getAsStr(Attributor *) const override {
3617 if (!isValidState())
3618 return "";
3619 return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
3620 : "generic") +
3621 std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
3622 : "") +
3623 std::string(" #PRs: ") +
3624 (ReachedKnownParallelRegions.isValidState()
3625 ? std::to_string(ReachedKnownParallelRegions.size())
3626 : "") +
3627 ", #Unknown PRs: " +
3628 (ReachedUnknownParallelRegions.isValidState()
3629 ? std::to_string(ReachedUnknownParallelRegions.size())
3630 : "") +
3631 ", #Reaching Kernels: " +
3632 (ReachingKernelEntries.isValidState()
3633 ? std::to_string(ReachingKernelEntries.size())
3634 : "") +
3635 ", #ParLevels: " +
3636 (ParallelLevels.isValidState()
3637 ? std::to_string(ParallelLevels.size())
3638 : "") +
3639 ", NestedPar: " + (NestedParallelism ? "yes" : "no");
3640 }
3641
3642
3643 static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
3644
3645
3646 StringRef getName() const override { return "AAKernelInfo"; }
3647
3648
3649 const char *getIdAddr() const override { return &ID; }
3650
3651
3652 static bool classof(const AbstractAttribute *AA) {
3654 }
3655
3656 static const char ID;
3657};
3658
3659
3660
3661struct AAKernelInfoFunction : AAKernelInfo {
3662 AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
3663 : AAKernelInfo(IRP, A) {}
3664
3665 SmallPtrSet<Instruction *, 4> GuardedInstructions;
3666
3667 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3668 return GuardedInstructions;
3669 }
3670
3671 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3673 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3674 assert(NewKernelEnvC && "Failed to create new kernel environment");
3676 }
3677
3678#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3679 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3680 ConstantStruct *ConfigC = \
3681 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3682 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3683 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3684 assert(NewConfigC && "Failed to create new configuration environment"); \
3685 setConfigurationOfKernelEnvironment(cast(NewConfigC)); \
3686 }
3687
3695
3696#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3697
3698
3700
3701
3702
3703 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3704
3705 Function *Fn = getAnchorScope();
3706
3707 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3708 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3709 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3710 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3711
3712
3713
3714 auto StoreCallBase = [](Use &U,
3715 OMPInformationCache::RuntimeFunctionInfo &RFI,
3717 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3719 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3721 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3722 Storage = CB;
3723 return false;
3724 };
3725 InitRFI.foreachUse(
3727 StoreCallBase(U, InitRFI, KernelInitCB);
3728 return false;
3729 },
3730 Fn);
3731 DeinitRFI.foreachUse(
3733 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3734 return false;
3735 },
3736 Fn);
3737
3738
3739 if (!KernelInitCB || !KernelDeinitCB)
3740 return;
3741
3742
3743 ReachingKernelEntries.insert(Fn);
3744 IsKernelEntry = true;
3745
3746 KernelEnvC =
3750
3752 KernelConfigurationSimplifyCB =
3754 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3755 if (!isAtFixpoint()) {
3756 if ()
3757 return nullptr;
3758 UsedAssumedInformation = true;
3760 }
3761 return KernelEnvC;
3762 };
3763
3764 A.registerGlobalVariableSimplificationCallback(
3765 *KernelEnvGV, KernelConfigurationSimplifyCB);
3766
3767
3768 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3769 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3770 OMPRTL___kmpc_barrier_simple_spmd});
3771
3772
3774 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3775 ConstantInt *AssumedExecModeC = ConstantInt::get(
3779 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3781
3782
3783 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3784 else
3785 setExecModeOfKernelEnvironment(AssumedExecModeC);
3786
3790 OpenMPIRBuilder::readThreadBoundsForKernel(T, *Fn);
3791 if (MinThreads)
3792 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3794 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxThreads));
3795 auto [MinTeams, MaxTeams] =
3796 OpenMPIRBuilder::readTeamBoundsForKernel(T, *Fn);
3797 if (MinTeams)
3798 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3799 if (MaxTeams)
3800 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3801
3803 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3804 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3805 MayUseNestedParallelismC->getIntegerType(), NestedParallelism);
3806 setMayUseNestedParallelismOfKernelEnvironment(
3807 AssumedMayUseNestedParallelismC);
3808
3811 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3812 KernelEnvC);
3813 ConstantInt *AssumedUseGenericStateMachineC =
3814 ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);
3815 setUseGenericStateMachineOfKernelEnvironment(
3816 AssumedUseGenericStateMachineC);
3817 }
3818
3819
3822 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3823 return;
3824 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3825 };
3826
3827
3828 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3830 if (QueryingAA) {
3832 }
3833 return true;
3834 };
3835
3838
3839
3840
3841
3842
3843
3844
3845 if (SPMDCompatibilityTracker.isValidState())
3846 return AddDependence(A, this, QueryingAA);
3847
3848 if (!ReachedKnownParallelRegions.isValidState())
3849 return AddDependence(A, this, QueryingAA);
3850 return false;
3851 };
3852
3853
3855 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3856 CustomStateMachineUseCB);
3857 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3858 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3859 CustomStateMachineUseCB);
3860 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3861 CustomStateMachineUseCB);
3862 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3863 CustomStateMachineUseCB);
3864 }
3865
3866
3867 if (SPMDCompatibilityTracker.isAtFixpoint())
3868 return;
3869
3872
3873
3874 if (!SPMDCompatibilityTracker.isValidState())
3875 return AddDependence(A, this, QueryingAA);
3876 return false;
3877 };
3878 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3879 HWThreadIdUseCB);
3880
3883
3884
3885
3886
3887 if (!SPMDCompatibilityTracker.isValidState())
3888 return AddDependence(A, this, QueryingAA);
3889 if (SPMDCompatibilityTracker.empty())
3890 return AddDependence(A, this, QueryingAA);
3891 if (!mayContainParallelRegion())
3892 return AddDependence(A, this, QueryingAA);
3893 return false;
3894 };
3895 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3896 }
3897
3898
3899 static std::string sanitizeForGlobalName(std::string S) {
3900 std::replace_if(
3901 S.begin(), S.end(),
3902 [](const char C) {
3903 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3904 (C >= '0' && C <= '9') || C == '_');
3905 },
3906 '.');
3907 return S;
3908 }
3909
3910
3911
3913
3914
3915 if (!KernelInitCB || !KernelDeinitCB)
3916 return ChangeStatus::UNCHANGED;
3917
3919
3920 bool HasBuiltStateMachine = true;
3921 if (!changeToSPMDMode(A, Changed)) {
3923 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3924 else
3925 HasBuiltStateMachine = false;
3926 }
3927
3928
3929 ConstantStruct *ExistingKernelEnvC =
3931 ConstantInt *OldUseGenericStateMachineVal =
3932 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3933 ExistingKernelEnvC);
3934 if (!HasBuiltStateMachine)
3935 setUseGenericStateMachineOfKernelEnvironment(
3936 OldUseGenericStateMachineVal);
3937
3938
3939 GlobalVariable *KernelEnvGV =
3943 Changed = ChangeStatus::CHANGED;
3944 }
3945
3947 }
3948
3949 void insertInstructionGuardsHelper(Attributor &A) {
3950 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3951
3952 auto CreateGuardedRegion = [&](Instruction *RegionStartI,
3954 LoopInfo *LI = nullptr;
3955 DominatorTree *DT = nullptr;
3956 MemorySSAUpdater *MSU = nullptr;
3957 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
3958
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3986 DT, LI, MSU, "region.guarded.end");
3989 MSU, "region.barrier");
3992 DT, LI, MSU, "region.exit");
3994 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");
3995
3997 "Expected a different CFG");
3998
4000 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");
4001
4002
4003 A.registerManifestAddedBasicBlock(*RegionEndBB);
4004 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4005 A.registerManifestAddedBasicBlock(*RegionExitBB);
4006 A.registerManifestAddedBasicBlock(*RegionStartBB);
4007 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4008
4009 bool HasBroadcastValues = false;
4010
4011
4012 for (Instruction &I : *RegionStartBB) {
4014 for (Use &U : I.uses()) {
4016 if (UsrI.getParent() != RegionStartBB)
4018 }
4019
4020 if (OutsideUses.empty())
4021 continue;
4022
4023 HasBroadcastValues = true;
4024
4025
4026
4027 auto *SharedMem = new GlobalVariable(
4028 M, I.getType(), false,
4030 sanitizeForGlobalName(
4031 (I.getName() + ".guarded.output.alloc").str()),
4033 static_cast<unsigned>(AddressSpace::Shared));
4034
4035
4036 new StoreInst(&I, SharedMem,
4038
4039 LoadInst *LoadI = new LoadInst(
4040 I.getType(), SharedMem, I.getName() + ".guarded.output.load",
4042
4043
4044 for (Use *U : OutsideUses)
4045 A.changeUseAfterManifest(*U, *LoadI);
4046 }
4047
4048 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4049
4050
4053 OpenMPIRBuilder::LocationDescription Loc(
4054 InsertPointTy(ParentBB, ParentBB->end()), DL);
4055 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4056 uint32_t SrcLocStrSize;
4057 auto *SrcLocStr =
4058 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4060 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4062
4063
4065 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4066 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);
4067 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4068 FunctionCallee HardwareTidFn =
4069 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4070 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4071 CallInst *Tid =
4072 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4074 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4075 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4076 OMPInfoCache.OMPBuilder.Builder
4077 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4078 ->setDebugLoc(DL);
4079
4080
4081
4082 FunctionCallee BarrierFn =
4083 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4084 M, OMPRTL___kmpc_barrier_simple_spmd);
4085 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4088 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4090 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4091
4092
4093 if (HasBroadcastValues) {
4096 RegionBarrierBB->getTerminator()->getIterator());
4098 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4099 }
4100 };
4101
4102 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4103 SmallPtrSet<BasicBlock *, 8> Visited;
4104 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4105 BasicBlock *BB = GuardedI->getParent();
4106 if (!Visited.insert(BB).second)
4107 continue;
4108
4112 while (++IP != IPEnd) {
4113 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4114 continue;
4116 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))
4117 continue;
4118 if (->user_empty() || !SPMDCompatibilityTracker.contains(I)) {
4119 LastEffect = nullptr;
4120 continue;
4121 }
4122 if (LastEffect)
4123 Reorders.push_back({I, LastEffect});
4124 LastEffect = &*IP;
4125 }
4126 for (auto &Reorder : Reorders)
4127 Reorder.first->moveBefore(Reorder.second->getIterator());
4128 }
4129
4131
4132 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4134 auto *CalleeAA = A.lookupAAFor(
4136 DepClassTy::NONE);
4137 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");
4139
4140 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4141 continue;
4142
4143 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;
4144 for (Instruction &I : *BB) {
4145
4146
4147 if (SPMDCompatibilityTracker.contains(&I)) {
4148 CalleeAAFunction.getGuardedInstructions().insert(&I);
4149 if (GuardedRegionStart)
4150 GuardedRegionEnd = &I;
4151 else
4152 GuardedRegionStart = GuardedRegionEnd = &I;
4153
4154 continue;
4155 }
4156
4157
4158
4159 if (GuardedRegionStart) {
4161 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4162 GuardedRegionStart = nullptr;
4163 GuardedRegionEnd = nullptr;
4164 }
4165 }
4166 }
4167
4168 for (auto &GR : GuardedRegions)
4169 CreateGuardedRegion(GR.first, GR.second);
4170 }
4171
4172 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {
4173
4174
4175
4176
4177
4178
4179
4180
4181 auto &Ctx = getAnchorValue().getContext();
4183 assert(Kernel && "Expected an associated function!");
4184
4185
4188 KernelInitCB->getNextNode(), "main.thread.user_code");
4191
4192
4193 A.registerManifestAddedBasicBlock(*InitBB);
4194 A.registerManifestAddedBasicBlock(*UserCodeBB);
4195 A.registerManifestAddedBasicBlock(*ReturnBB);
4196
4197
4201
4202
4204 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4205 FunctionCallee ThreadIdInBlockFn =
4206 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4207 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4208
4209
4210 CallInst *ThreadIdInBlock =
4211 CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);
4212 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4214
4215
4217 ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,
4218 ConstantInt::get(ThreadIdInBlock->getType(), 0),
4219 "thread.is_main", InitBB);
4222 }
4223
4225 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4226
4227 if (!SPMDCompatibilityTracker.isAssumed()) {
4228 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4229 if (!NonCompatibleI)
4230 continue;
4231
4232
4234 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4235 continue;
4236
4237 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4238 ORA << "Value has potential side effects preventing SPMD-mode "
4239 "execution";
4241 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4242 "the called function to override";
4243 }
4244 return ORA << ".";
4245 };
4246 A.emitRemark(NonCompatibleI, "OMP121",
4248
4250 << *NonCompatibleI << "\n");
4251 }
4252
4253 return false;
4254 }
4255
4256
4257
4262 Kernel = CB->getCaller();
4263 }
4265
4266
4267 ConstantStruct *ExistingKernelEnvC =
4269 auto *ExecModeC =
4270 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4271 const int8_t ExecModeVal = ExecModeC->getSExtValue();
4273 return true;
4274
4275
4276 Changed = ChangeStatus::CHANGED;
4277
4278
4279
4280 if (mayContainParallelRegion())
4281 insertInstructionGuardsHelper(A);
4282 else
4283 forceSingleThreadPerWorkgroupHelper(A);
4284
4285
4286
4288 "Initially non-SPMD kernel has SPMD exec mode!");
4289 setExecModeOfKernelEnvironment(
4292
4293 ++NumOpenMPTargetRegionKernelsSPMD;
4294
4295 auto Remark = [&](OptimizationRemark OR) {
4296 return OR << "Transformed generic-mode kernel to SPMD-mode.";
4297 };
4298 A.emitRemark(KernelInitCB, "OMP120", Remark);
4299 return true;
4300 };
4301
4303
4305 return false;
4306
4307
4308 if (!ReachedKnownParallelRegions.isValidState())
4309 return false;
4310
4311 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4312 if (!OMPInfoCache.runtimeFnsAvailable(
4313 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4314 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4315 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4316 return false;
4317
4318 ConstantStruct *ExistingKernelEnvC =
4320
4321
4322
4323
4324
4325 ConstantInt *UseStateMachineC =
4326 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4327 ExistingKernelEnvC);
4328 ConstantInt *ModeC =
4329 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4330
4331
4332
4333
4334 if (UseStateMachineC->isZero() ||
4336 return false;
4337
4338 Changed = ChangeStatus::CHANGED;
4339
4340
4341 setUseGenericStateMachineOfKernelEnvironment(
4342 ConstantInt::get(UseStateMachineC->getIntegerType(), false));
4343
4344
4345
4346
4347
4348 if (!mayContainParallelRegion()) {
4349 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4350
4351 auto Remark = [&](OptimizationRemark OR) {
4352 return OR << "Removing unused state machine from generic-mode kernel.";
4353 };
4354 A.emitRemark(KernelInitCB, "OMP130", Remark);
4355
4356 return true;
4357 }
4358
4359
4360 if (ReachedUnknownParallelRegions.empty()) {
4361 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4362
4363 auto Remark = [&](OptimizationRemark OR) {
4364 return OR << "Rewriting generic-mode kernel with a customized state "
4365 "machine.";
4366 };
4367 A.emitRemark(KernelInitCB, "OMP131", Remark);
4368 } else {
4369 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4370
4371 auto Remark = [&](OptimizationRemarkAnalysis OR) {
4372 return OR << "Generic-mode kernel is executed with a customized state "
4373 "machine that requires a fallback.";
4374 };
4375 A.emitRemark(KernelInitCB, "OMP132", Remark);
4376
4377
4378 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4379 if (!UnknownParallelRegionCB)
4380 continue;
4381 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4382 return ORA << "Call may contain unknown parallel regions. Use "
4383 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
4384 "override.";
4385 };
4386 A.emitRemark(UnknownParallelRegionCB,
4388 }
4389 }
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421 auto &Ctx = getAnchorValue().getContext();
4423 assert(Kernel && "Expected an associated function!");
4424
4425 BasicBlock *InitBB = KernelInitCB->getParent();
4427 KernelInitCB->getNextNode(), "thread.user_code.check");
4431 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
4433 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
4435 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
4436 BasicBlock *StateMachineIfCascadeCurrentBB =
4437 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
4438 Kernel, UserCodeEntryBB);
4439 BasicBlock *StateMachineEndParallelBB =
4441 Kernel, UserCodeEntryBB);
4443 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*InitBB);
4445 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4446 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4453
4454 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4457
4460 ConstantInt::get(KernelInitCB->getType(), -1),
4461 "thread.is_worker", InitBB);
4463 BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);
4464
4466 FunctionCallee BlockHwSizeFn =
4467 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4468 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4469 FunctionCallee WarpSizeFn =
4470 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4471 M, OMPRTL___kmpc_get_warp_size);
4472 CallInst *BlockHwSize =
4473 CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB);
4474 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4476 CallInst *WarpSize =
4478 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4481 BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);
4483 Instruction *IsMainOrWorker = ICmpInst::Create(
4485 "thread.is_main_or_worker", IsWorkerCheckBB);
4488 IsMainOrWorker, IsWorkerCheckBB);
4489
4490
4491 const DataLayout &DL = M.getDataLayout();
4492 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4494 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,
4497
4498 OMPInfoCache.OMPBuilder.updateToLocation(
4499 OpenMPIRBuilder::LocationDescription(
4500 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4501 StateMachineBeginBB->end()),
4502 DLoc));
4503
4504 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4505 Value *GTid = KernelInitCB;
4506
4507 FunctionCallee BarrierFn =
4508 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4509 M, OMPRTL___kmpc_barrier_simple_generic);
4511 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);
4512 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4513 Barrier->setDebugLoc(DLoc);
4514
4516 (unsigned int)AddressSpace::Generic) {
4517 WorkFnAI = new AddrSpaceCastInst(
4518 WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),
4519 WorkFnAI->getName() + ".generic", StateMachineBeginBB);
4521 }
4522
4523 FunctionCallee KernelParallelFn =
4524 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4525 M, OMPRTL___kmpc_kernel_parallel);
4527 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
4528 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4530 Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
4531 StateMachineBeginBB);
4533
4534 FunctionType *ParallelRegionFnTy = FunctionType::get(
4535 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4536 false);
4537
4541 StateMachineBeginBB);
4542 IsDone->setDebugLoc(DLoc);
4543 BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
4544 IsDone, StateMachineBeginBB)
4546
4548 StateMachineDoneBarrierBB, IsActiveWorker,
4549 StateMachineIsActiveCheckBB)
4551
4552 Value *ZeroArg =
4554
4555 const unsigned int WrapperFunctionArgNo = 6;
4556
4557
4558
4559
4560 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {
4561 auto *CB = ReachedKnownParallelRegions[I];
4563 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4565 Ctx, "worker_state_machine.parallel_region.execute", Kernel,
4566 StateMachineEndParallelBB);
4567 CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
4568 ->setDebugLoc(DLoc);
4571
4573 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
4574 Kernel, StateMachineEndParallelBB);
4575 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4576 A.registerManifestAddedBasicBlock(*PRNextBB);
4577
4578
4579
4581 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4584 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4586 IsPR = CmpI;
4587 } else {
4589 }
4590
4592 StateMachineIfCascadeCurrentBB)
4594 StateMachineIfCascadeCurrentBB = PRNextBB;
4595 }
4596
4597
4598
4599
4600 if (!ReachedUnknownParallelRegions.empty()) {
4601 StateMachineIfCascadeCurrentBB->setName(
4602 "worker_state_machine.parallel_region.fallback.execute");
4603 CallInst::Create(ParallelRegionFnTy, WorkFn, {ZeroArg, GTid}, "",
4604 StateMachineIfCascadeCurrentBB)
4605 ->setDebugLoc(DLoc);
4606 }
4608 StateMachineIfCascadeCurrentBB)
4610
4611 FunctionCallee EndParallelFn =
4612 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4613 M, OMPRTL___kmpc_kernel_end_parallel);
4614 CallInst *EndParallel =
4615 CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);
4616 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4618 BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
4620
4621 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
4622 ->setDebugLoc(DLoc);
4625
4626 return true;
4627 }
4628
4629
4630
4631 ChangeStatus updateImpl(Attributor &A) override {
4632 KernelInfoState StateBefore = getState();
4633
4634
4635
4636
4637
4638 struct UpdateKernelEnvCRAII {
4639 AAKernelInfoFunction &AA;
4640
4641 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4642
4643 ~UpdateKernelEnvCRAII() {
4644 if (!AA.KernelEnvC)
4645 return;
4646
4647 ConstantStruct *ExistingKernelEnvC =
4649
4650 if (!AA.isValidState()) {
4651 AA.KernelEnvC = ExistingKernelEnvC;
4652 return;
4653 }
4654
4655 if (!AA.ReachedKnownParallelRegions.isValidState())
4656 AA.setUseGenericStateMachineOfKernelEnvironment(
4657 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4658 ExistingKernelEnvC));
4659
4660 if (!AA.SPMDCompatibilityTracker.isValidState())
4661 AA.setExecModeOfKernelEnvironment(
4662 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4663
4664 ConstantInt *MayUseNestedParallelismC =
4665 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4666 AA.KernelEnvC);
4667 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4668 MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);
4669 AA.setMayUseNestedParallelismOfKernelEnvironment(
4670 NewMayUseNestedParallelismC);
4671 }
4672 } RAII(*this);
4673
4674
4676
4678 return true;
4679
4680 if (.mayWriteToMemory())
4681 return true;
4683 const auto *UnderlyingObjsAA = A.getAAFor(
4685 DepClassTy::OPTIONAL);
4688 DepClassTy::OPTIONAL);
4689 if (UnderlyingObjsAA &&
4690 UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {
4691 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4692 return true;
4693
4694
4695 auto *CB = dyn_cast(&Obj);
4696 return CB && HS && HS->isAssumedHeapToStack(*CB);
4697 }))
4698 return true;
4699 }
4700
4701
4702 SPMDCompatibilityTracker.insert(&I);
4703 return true;
4704 };
4705
4706 bool UsedAssumedInformationInCheckRWInst = false;
4707 if (!SPMDCompatibilityTracker.isAtFixpoint())
4708 if (.checkForAllReadWriteInstructions(
4709 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
4710 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4711
4712 bool UsedAssumedInformationFromReachingKernels = false;
4713 if (!IsKernelEntry) {
4714 updateParallelLevels(A);
4715
4716 bool AllReachingKernelsKnown = true;
4717 updateReachingKernelEntries(A, AllReachingKernelsKnown);
4718 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4719
4720 if (!SPMDCompatibilityTracker.empty()) {
4721 if (!ParallelLevels.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4723 else if (!ReachingKernelEntries.isValidState())
4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4725 else {
4726
4727
4728
4729 int SPMD = 0, Generic = 0;
4730 for (auto *Kernel : ReachingKernelEntries) {
4731 auto *CBAA = A.getAAFor(
4733 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4734 CBAA->SPMDCompatibilityTracker.isAssumed())
4735 ++SPMD;
4736 else
4738 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4739 UsedAssumedInformationFromReachingKernels = true;
4740 }
4741 if (SPMD != 0 && Generic != 0)
4742 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4743 }
4744 }
4745 }
4746
4747
4748 bool AllParallelRegionStatesWereFixed = true;
4749 bool AllSPMDStatesWereFixed = true;
4752 auto *CBAA = A.getAAFor(
4754 if (!CBAA)
4755 return false;
4756 getState() ^= CBAA->getState();
4757 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4760 AllParallelRegionStatesWereFixed &=
4761 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4762 return true;
4763 };
4764
4765 bool UsedAssumedInformationInCheckCallInst = false;
4766 if (.checkForAllCallLikeInstructions(
4767 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {
4769 << "Failed to visit all call-like instructions!\n";);
4770 return indicatePessimisticFixpoint();
4771 }
4772
4773
4774
4775 if (!UsedAssumedInformationInCheckCallInst &&
4776 AllParallelRegionStatesWereFixed) {
4777 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4778 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4779 }
4780
4781
4782
4783 if (!UsedAssumedInformationInCheckRWInst &&
4784 !UsedAssumedInformationInCheckCallInst &&
4785 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4786 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4787
4788 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4789 : ChangeStatus::CHANGED;
4790 }
4791
4792private:
4793
4794 void updateReachingKernelEntries(Attributor &A,
4795 bool &AllReachingKernelsKnown) {
4796 auto PredCallSite = [&](AbstractCallSite ACS) {
4797 Function *Caller = ACS.getInstruction()->getFunction();
4798
4799 assert(Caller && "Caller is nullptr");
4800
4801 auto *CAA = A.getOrCreateAAFor(
4803 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4804 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4805 return true;
4806 }
4807
4808
4809
4810 ReachingKernelEntries.indicatePessimisticFixpoint();
4811
4812 return true;
4813 };
4814
4815 if (.checkForAllCallSites(PredCallSite, *this,
4816 true ,
4817 AllReachingKernelsKnown))
4818 ReachingKernelEntries.indicatePessimisticFixpoint();
4819 }
4820
4821
4822 void updateParallelLevels(Attributor &A) {
4823 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4824 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4825 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4826
4827 auto PredCallSite = [&](AbstractCallSite ACS) {
4828 Function *Caller = ACS.getInstruction()->getFunction();
4829
4830 assert(Caller && "Caller is nullptr");
4831
4832 auto *CAA =
4834 if (CAA && CAA->ParallelLevels.isValidState()) {
4835
4836
4837
4838
4839
4840 if (Caller == Parallel51RFI.Declaration) {
4841 ParallelLevels.indicatePessimisticFixpoint();
4842 return true;
4843 }
4844
4845 ParallelLevels ^= CAA->ParallelLevels;
4846
4847 return true;
4848 }
4849
4850
4851
4852 ParallelLevels.indicatePessimisticFixpoint();
4853
4854 return true;
4855 };
4856
4857 bool AllCallSitesKnown = true;
4858 if (.checkForAllCallSites(PredCallSite, *this,
4859 true ,
4860 AllCallSitesKnown))
4861 ParallelLevels.indicatePessimisticFixpoint();
4862 }
4863};
4864
4865
4866
4867
4868struct AAKernelInfoCallSite : AAKernelInfo {
4869 AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
4870 : AAKernelInfo(IRP, A) {}
4871
4872
4873 void initialize(Attributor &A) override {
4874 AAKernelInfo::initialize(A);
4875
4876 CallBase &CB = cast(getAssociatedValue());
4877 auto *AssumptionAA = A.getAAFor(
4879
4880
4881 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {
4882 indicateOptimisticFixpoint();
4883 return;
4884 }
4885
4886
4887
4888
4890 indicateOptimisticFixpoint();
4891 return;
4892 }
4893
4894
4895
4896
4897
4898 auto CheckCallee = [&](Function *Callee, unsigned NumCallees) {
4899 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
4900 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4901 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4902
4903 if (!Callee || .isFunctionIPOAmendable(*Callee)) {
4904
4905
4906
4907 if (!AssumptionAA ||
4908 !(AssumptionAA->hasAssumption("omp_no_openmp") ||
4909 AssumptionAA->hasAssumption("omp_no_parallelism")))
4910 ReachedUnknownParallelRegions.insert(&CB);
4911
4912
4913
4914 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4915 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4916 SPMDCompatibilityTracker.insert(&CB);
4917 }
4918
4919
4920
4921 indicateOptimisticFixpoint();
4922 }
4923
4924
4925 return;
4926 }
4927 if (NumCallees > 1) {
4928 indicatePessimisticFixpoint();
4929 return;
4930 }
4931
4933 switch (RF) {
4934
4935 case OMPRTL___kmpc_is_spmd_exec_mode:
4936 case OMPRTL___kmpc_distribute_static_fini:
4937 case OMPRTL___kmpc_for_static_fini:
4938 case OMPRTL___kmpc_global_thread_num:
4939 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4940 case OMPRTL___kmpc_get_hardware_num_blocks:
4941 case OMPRTL___kmpc_single:
4942 case OMPRTL___kmpc_end_single:
4943 case OMPRTL___kmpc_master:
4944 case OMPRTL___kmpc_end_master:
4945 case OMPRTL___kmpc_barrier:
4946 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4947 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4948 case OMPRTL___kmpc_error:
4949 case OMPRTL___kmpc_flush:
4950 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4951 case OMPRTL___kmpc_get_warp_size:
4952 case OMPRTL_omp_get_thread_num:
4953 case OMPRTL_omp_get_num_threads:
4954 case OMPRTL_omp_get_max_threads:
4955 case OMPRTL_omp_in_parallel:
4956 case OMPRTL_omp_get_dynamic:
4957 case OMPRTL_omp_get_cancellation:
4958 case OMPRTL_omp_get_nested:
4959 case OMPRTL_omp_get_schedule:
4960 case OMPRTL_omp_get_thread_limit:
4961 case OMPRTL_omp_get_supported_active_levels:
4962 case OMPRTL_omp_get_max_active_levels:
4963 case OMPRTL_omp_get_level:
4964 case OMPRTL_omp_get_ancestor_thread_num:
4965 case OMPRTL_omp_get_team_size:
4966 case OMPRTL_omp_get_active_level:
4967 case OMPRTL_omp_in_final:
4968 case OMPRTL_omp_get_proc_bind:
4969 case OMPRTL_omp_get_num_places:
4970 case OMPRTL_omp_get_num_procs:
4971 case OMPRTL_omp_get_place_proc_ids:
4972 case OMPRTL_omp_get_place_num:
4973 case OMPRTL_omp_get_partition_num_places:
4974 case OMPRTL_omp_get_partition_place_nums:
4975 case OMPRTL_omp_get_wtime:
4976 break;
4977 case OMPRTL___kmpc_distribute_static_init_4:
4978 case OMPRTL___kmpc_distribute_static_init_4u:
4979 case OMPRTL___kmpc_distribute_static_init_8:
4980 case OMPRTL___kmpc_distribute_static_init_8u:
4981 case OMPRTL___kmpc_for_static_init_4:
4982 case OMPRTL___kmpc_for_static_init_4u:
4983 case OMPRTL___kmpc_for_static_init_8:
4984 case OMPRTL___kmpc_for_static_init_8u: {
4985
4986 unsigned ScheduleArgOpNo = 2;
4987 auto *ScheduleTypeCI =
4988 dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
4989 unsigned ScheduleTypeVal =
4990 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4991 switch (OMPScheduleType(ScheduleTypeVal)) {
4992 case OMPScheduleType::UnorderedStatic:
4993 case OMPScheduleType::UnorderedStaticChunked:
4994 case OMPScheduleType::OrderedDistribute:
4995 case OMPScheduleType::OrderedDistributeChunked:
4996 break;
4997 default:
4998 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4999 SPMDCompatibilityTracker.insert(&CB);
5000 break;
5001 };
5002 } break;
5003 case OMPRTL___kmpc_target_init:
5004 KernelInitCB = &CB;
5005 break;
5006 case OMPRTL___kmpc_target_deinit:
5007 KernelDeinitCB = &CB;
5008 break;
5009 case OMPRTL___kmpc_parallel_51:
5010 if (!handleParallel51(A, CB))
5011 indicatePessimisticFixpoint();
5012 return;
5013 case OMPRTL___kmpc_omp_task:
5014
5015 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5016 SPMDCompatibilityTracker.insert(&CB);
5017 ReachedUnknownParallelRegions.insert(&CB);
5018 break;
5019 case OMPRTL___kmpc_alloc_shared:
5020 case OMPRTL___kmpc_free_shared:
5021
5022 return;
5023 default:
5024
5025
5026 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5027 SPMDCompatibilityTracker.insert(&CB);
5028 break;
5029 }
5030
5031
5032
5033 indicateOptimisticFixpoint();
5034 };
5035
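// Use the call-edges AA to enumerate the possible callees; if the edges are
// unknown or invalid, fall back to the single statically associated callee.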
5036 const auto *AACE =
5037 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5038 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5039 CheckCallee(getAssociatedFunction(), 1);
5040 return;
5041 }
5042 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5043 for (auto *Callee : OptimisticEdges) {
5044 CheckCallee(Callee, OptimisticEdges.size());
5045 if (isAtFixpoint())
5046 break;
5047 }
5048 }
5049
5050 ChangeStatus updateImpl(Attributor &A) override {
5051
5052
5053
5054
5055 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5056 KernelInfoState StateBefore = getState();
5057
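// During updates, non-runtime callees contribute their AAKernelInfo state;
// only __kmpc_parallel_51 and the shared-memory allocation/free runtime calls
// are revisited here because their effect depends on other AAs.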
5058 auto CheckCallee = [&](Function *F, int NumCallees) {
5059 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);
5060
5061
5062
5063 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5064 const IRPosition &FnPos = IRPosition::function(*F);
5065 auto *FnAA =
5066 A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
5067 if (!FnAA)
5068 return indicatePessimisticFixpoint();
5069 if (getState() == FnAA->getState())
5070 return ChangeStatus::UNCHANGED;
5071 getState() = FnAA->getState();
5072 return ChangeStatus::CHANGED;
5073 }
5074 if (NumCallees > 1)
5075 return indicatePessimisticFixpoint();
5076
5077 CallBase &CB = cast<CallBase>(getAssociatedValue());
5078 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5079 if (!handleParallel51(A, CB))
5080 return indicatePessimisticFixpoint();
5081 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5082 : ChangeStatus::CHANGED;
5083 }
5084
5085
5086
5087 assert(
5088 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5089 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5090 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5091
5092 auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(
5093 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
5094 auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(
5095 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);
5096
5097 RuntimeFunction RF = It->getSecond();
5098
5099 switch (RF) {
5100
5101
5102 case OMPRTL___kmpc_alloc_shared:
5103 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5104 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5105 SPMDCompatibilityTracker.insert(&CB);
5106 break;
5107 case OMPRTL___kmpc_free_shared:
5108 if ((!HeapToStackAA ||
5109 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5110 (!HeapToSharedAA ||
5111 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5112 SPMDCompatibilityTracker.insert(&CB);
5113 break;
5114 default:
5115 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5116 SPMDCompatibilityTracker.insert(&CB);
5117 }
5118 return ChangeStatus::CHANGED;
5119 };
5120
5121 const auto *AACE =
5122 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);
5123 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5124 if (Function *F = getAssociatedFunction())
5125 CheckCallee(F, 1);
5126 } else {
5127 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5128 for (auto *Callee : OptimisticEdges) {
5129 CheckCallee(Callee, OptimisticEdges.size());
5130 if (isAtFixpoint())
5131 break;
5132 }
5133 }
5134
5135 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5136 : ChangeStatus::CHANGED;
5137 }
5138
5139
5140
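// The outlined parallel region is argument 5 of __kmpc_parallel_51 when the
// kernel is assumed to run in SPMD mode and argument 6 (the wrapper) otherwise.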
5141 bool handleParallel51(Attributor &A, CallBase &CB) {
5142 const unsigned int NonWrapperFunctionArgNo = 5;
5143 const unsigned int WrapperFunctionArgNo = 6;
5144 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5145 ? NonWrapperFunctionArgNo
5146 : WrapperFunctionArgNo;
5147
5148 auto *ParallelRegion = dyn_cast<Function>(
5149 CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());
5150 if (!ParallelRegion)
5151 return false;
5152
5153 ReachedKnownParallelRegions.insert(&CB);
5154
5155 auto *FnAA = A.getAAFor<AAKernelInfo>(
5156 *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);
5157 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5158 !FnAA->ReachedKnownParallelRegions.empty() ||
5159 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5160 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5161 !FnAA->ReachedUnknownParallelRegions.empty();
5162 return true;
5163 }
5164};
5165
5166struct AAFoldRuntimeCall
5167 : public StateWrapper<BooleanState, AbstractAttribute> {
5168 using Base = StateWrapper<BooleanState, AbstractAttribute>;
5169
5170 AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
5171
5172
5173 void trackStatistics() const override {}
5174
5175
5176 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
5177 Attributor &A);
5178
5179
5180 StringRef getName() const override { return "AAFoldRuntimeCall"; }
5181
5182
5183 const char *getIdAddr() const override { return &ID; }
5184
5185
5186
5187 static bool classof(const AbstractAttribute *AA) {
5188 return (AA->getIdAddr() == &ID);
5189 }
5190
5191 static const char ID;
5192};
5193
5194struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5195 AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
5196 : AAFoldRuntimeCall(IRP, A) {}
5197
5198
5199 const std::string getAsStr(Attributor *) const override {
5200 if (!isValidState())
5201 return "";
5202
5203 std::string Str("simplified value: ");
5204
5205 if (!SimplifiedValue)
5206 return Str + std::string("none");
5207
5208 if (!*SimplifiedValue)
5209 return Str + std::string("nullptr");
5210
5211 if (auto *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
5212 return Str + std::to_string(CI->getSExtValue());
5213
5214 return Str + std::string("unknown");
5215 }
5216
5217 void initialize(Attributor &A) override {
5218 if (DisableOpenMPOptFolding)
5219 indicatePessimisticFixpoint();
5220
5221 Function *Callee = getAssociatedFunction();
5222
5223 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
5224 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5225 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5226 "Expected a known OpenMP runtime function");
5227
5228 RFKind = It->getSecond();
5229
5230 CallBase &CB = cast<CallBase>(getAssociatedValue());
5231 A.registerSimplificationCallback(
5232 IRPosition::callsite_returned(CB),
5233 [&](const IRPosition &IRP, const AbstractAttribute *AA,
5234 bool &UsedAssumedInformation) -> std::optional<Value *> {
5235 assert((isValidState() || SimplifiedValue == nullptr) &&
5236 "Unexpected invalid state!");
5237
5238 if (!isAtFixpoint()) {
5239 UsedAssumedInformation = true;
5240 if (AA)
5241 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
5242 }
5243 return SimplifiedValue;
5244 });
5245 }
5246
5247 ChangeStatus updateImpl(Attributor &A) override {
5248 ChangeStatus Changed = ChangeStatus::UNCHANGED;
5249 switch (RFKind) {
5250 case OMPRTL___kmpc_is_spmd_exec_mode:
5251 Changed |= foldIsSPMDExecMode(A);
5252 break;
5253 case OMPRTL___kmpc_parallel_level:
5254 Changed |= foldParallelLevel(A);
5255 break;
5256 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5257 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
5258 break;
5259 case OMPRTL___kmpc_get_hardware_num_blocks:
5260 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
5261 break;
5262 default:
5263 llvm_unreachable("Unhandled OpenMP runtime function!");
5264 }
5265
5266 return Changed;
5267 }
5268
5269 ChangeStatus manifest(Attributor &A) override {
5270 ChangeStatus Changed = ChangeStatus::UNCHANGED;
5271
5272 if (SimplifiedValue && *SimplifiedValue) {
5273 Instruction &I = *getCtxI();
5274 A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue);
5275 A.deleteAfterManifest(I);
5276
5277 CallBase *CB = dyn_cast<CallBase>(&I);
5278 auto Remark = [&](OptimizationRemark OR) {
5279 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))
5280 return OR << "Replacing OpenMP runtime call "
5281 << CB->getCalledFunction()->getName() << " with "
5282 << ore::NV("FoldedValue", C->getZExtValue()) << ".";
5283 return OR << "Replacing OpenMP runtime call "
5284 << CB->getCalledFunction()->getName() << ".";
5285 };
5286
5288 A.emitRemark(CB, "OMP180", Remark);
5289
5290 LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "
5291 << **SimplifiedValue << "\n");
5292
5293 Changed = ChangeStatus::CHANGED;
5294 }
5295
5296 return Changed;
5297 }
5298
5299 ChangeStatus indicatePessimisticFixpoint() override {
5300 SimplifiedValue = nullptr;
5301 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5302 }
5303
5304private:
5305
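// Fold __kmpc_is_spmd_exec_mode to a constant when every kernel reaching this
// call site agrees on its execution mode; mixed modes give up.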
5306 ChangeStatus foldIsSPMDExecMode(Attributor &A) {
5307 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5308
5309 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5310 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5311 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5312 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
5313
5314 if (!CallerKernelInfoAA ||
5315 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5316 return indicatePessimisticFixpoint();
5317
5318 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5319 auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
5320 DepClassTy::REQUIRED);
5321
5322 if (!AA || !AA->isValidState()) {
5323 SimplifiedValue = nullptr;
5324 return indicatePessimisticFixpoint();
5325 }
5326
5327 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5328 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5329 ++KnownSPMDCount;
5330 else
5331 ++AssumedSPMDCount;
5332 } else {
5333 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5334 ++KnownNonSPMDCount;
5335 else
5336 ++AssumedNonSPMDCount;
5337 }
5338 }
5339
5340 if ((AssumedSPMDCount + KnownSPMDCount) &&
5341 (AssumedNonSPMDCount + KnownNonSPMDCount))
5342 return indicatePessimisticFixpoint();
5343
5344 auto &Ctx = getAnchorValue().getContext();
5345 if (KnownSPMDCount || AssumedSPMDCount) {
5346 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5347 "Expected only SPMD kernels!");
5348
5349
5350 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
5351 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5352 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5353 "Expected only non-SPMD kernels!");
5354
5355
5356 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
5357 } else {
5358
5359
5360
5361 assert(!SimplifiedValue && "SimplifiedValue should be none");
5362 }
5363
5364 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5365 : ChangeStatus::CHANGED;
5366 }
5367
5368
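// Fold __kmpc_parallel_level analogously: all reaching kernels must agree on
// whether they execute in SPMD mode.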
5369 ChangeStatus foldParallelLevel(Attributor &A) {
5370 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5371
5372 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5373 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
5374
5375 if (!CallerKernelInfoAA ||
5376 !CallerKernelInfoAA->ParallelLevels.isValidState())
5377 return indicatePessimisticFixpoint();
5378
5379 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5380 return indicatePessimisticFixpoint();
5381
5382 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5383 assert(!SimplifiedValue &&
5384 "SimplifiedValue should keep none at this point");
5385 return ChangeStatus::UNCHANGED;
5386 }
5387
5388 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5389 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5390 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5391 auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
5392 DepClassTy::REQUIRED);
5393 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5394 return indicatePessimisticFixpoint();
5395
5396 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5397 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5398 ++KnownSPMDCount;
5399 else
5400 ++AssumedSPMDCount;
5401 } else {
5402 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5403 ++KnownNonSPMDCount;
5404 else
5405 ++AssumedNonSPMDCount;
5406 }
5407 }
5408
5409 if ((AssumedSPMDCount + KnownSPMDCount) &&
5410 (AssumedNonSPMDCount + KnownNonSPMDCount))
5411 return indicatePessimisticFixpoint();
5412
5413 auto &Ctx = getAnchorValue().getContext();
5414
5415
5416
5417 if (AssumedSPMDCount || KnownSPMDCount) {
5418 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5419 "Expected only SPMD kernels!");
5420 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5421 } else {
5422 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5423 "Expected only non-SPMD kernels!");
5424 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5425 }
5426 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5427 : ChangeStatus::CHANGED;
5428 }
5429
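// Fold calls that query launch bounds by reading the given kernel function
// attribute; all reaching kernels must carry the same value.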
5430 ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
5431
5432 int32_t CurrentAttrValue = -1;
5433 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5434
5435 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5436 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
5437
5438 if (!CallerKernelInfoAA ||
5439 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5440 return indicatePessimisticFixpoint();
5441
5442
5443 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5444 int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
5445
5446 if (NextAttrVal == -1 ||
5447 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5448 return indicatePessimisticFixpoint();
5449 CurrentAttrValue = NextAttrVal;
5450 }
5451
5452 if (CurrentAttrValue != -1) {
5453 auto &Ctx = getAnchorValue().getContext();
5454 SimplifiedValue =
5455 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5456 }
5457 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5458 : ChangeStatus::CHANGED;
5459 }
5460
5461
5462
5463
5464 std::optional<Value *> SimplifiedValue;
5465
5466
5467 RuntimeFunction RFKind;
5468};
5469
5470}
5471
5472
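// Register an AAFoldRuntimeCall at the call-site-returned position of every
// regular call to the given runtime function.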
5473void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
5474 auto &RFI = OMPInfoCache.RFIs[RF];
5475 RFI.foreachUse(SCC, [&](Use &U, Function &F) {
5476 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5477 if (!CI)
5478 return false;
5479 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5480 IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
5481 DepClassTy::NONE, /* ForceUpdate */ false,
5482 /* UpdateAfterInit */ false);
5483 return false;
5484 });
5485}
5486
5487void OpenMPOpt::registerAAs(bool IsModulePass) {
5488 if (SCC.empty())
5489 return;
5490
5491 if (IsModulePass) {
5492
5493
5494
5495
5496 auto CreateKernelInfoCB = [&](Use &, Function &Kernel) {
5497 A.getOrCreateAAFor<AAKernelInfo>(
5498 IRPosition::function(Kernel), /* QueryingAA */ nullptr,
5499 DepClassTy::NONE, /* ForceUpdate */ false,
5500 /* UpdateAfterInit */ false);
5501 return false;
5502 };
5503 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5504 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5505 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5506
5507 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5510 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5511 }
5512
5513
5514 if (DeduceICVValues) {
5515 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5516 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
5517
5518 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5519
5520 auto CreateAA = [&](Use &U, Function &Caller) {
5521 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5522 if (!CI)
5523 return false;
5524
5525 auto &CB = cast<CallBase>(*CI);
5526
5527 IRPosition CBPos = IRPosition::callsite_function(CB);
5528 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5529 return false;
5530 };
5531
5532 GetterRFI.foreachUse(SCC, CreateAA);
5533 }
5534 }
5535
5536
5537
5538 if (!isOpenMPDevice(M))
5539 return;
5540
5541 for (auto *F : SCC) {
5542 if (F->isDeclaration())
5543 continue;
5544
5545
5546
5547
5548 if (F->hasLocalLinkage()) {
5549 if (llvm::all_of(F->uses(), [this](const Use &U) {
5550 const auto *CB = dyn_cast<CallBase>(U.getUser());
5551 return CB && CB->isCallee(&U) &&
5552 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5553 }))
5554 continue;
5555 }
5556 registerAAsForFunction(A, *F);
5557 }
5558}
5559
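// Per-function AA registration: deglobalization and execution-domain AAs for
// the function itself, plus address-space and liveness AAs for selected
// instructions.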
5560void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
5561 if (!DisableOpenMPOptDeglobalization)
5562 A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
5563 A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
5564 if (!DisableOpenMPOptDeglobalization)
5565 A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));
5566 if (F.hasFnAttribute(Attribute::Convergent))
5567 A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F));
5568
5569 for (auto &I : instructions(F)) {
5570 if (auto *LI = dyn_cast<LoadInst>(&I)) {
5571 bool UsedAssumedInformation = false;
5572 A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
5573 UsedAssumedInformation, AA::Interprocedural);
5574 A.getOrCreateAAFor<AAAddressSpace>(
5575 IRPosition::value(*LI->getPointerOperand()));
5576 continue;
5577 }
5578 if (auto *CI = dyn_cast<CallBase>(&I)) {
5579 if (CI->isIndirectCall())
5580 A.getOrCreateAAFor<AAIndirectCallInfo>(
5581 IRPosition::callsite_function(*CI));
5582 }
5583 if (auto *SI = dyn_cast<StoreInst>(&I)) {
5584 A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
5585 A.getOrCreateAAFor<AAAddressSpace>(
5586 IRPosition::value(*SI->getPointerOperand()));
5587 continue;
5588 }
5589 if (auto *FI = dyn_cast<FenceInst>(&I)) {
5590 A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI));
5591 continue;
5592 }
5593 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
5594 if (II->getIntrinsicID() == Intrinsic::assume) {
5595 A.getOrCreateAAFor<AAPotentialValues>(
5596 IRPosition::value(*II->getArgOperand(0)));
5597 continue;
5598 }
5599 }
5600 }
5601}
5602
5603const char AAICVTracker::ID = 0;
5604const char AAKernelInfo::ID = 0;
5605 const char AAExecutionDomain::ID = 0;
5606 const char AAHeapToShared::ID = 0;
5607const char AAFoldRuntimeCall::ID = 0;
5608
5609AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5610 Attributor &A) {
5611 AAICVTracker *AA = nullptr;
5617 llvm_unreachable("ICVTracker can only be created for function position!");
5619 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
5620 break;
5622 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
5623 break;
5625 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
5626 break;
5628 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
5629 break;
5630 }
5631
5632 return *AA;
5633}
5634
5635 AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
5636 Attributor &A) {
5637 AAExecutionDomainFunction *AA = nullptr;
5638 switch (IRP.getPositionKind()) {
5639 case IRPosition::IRP_INVALID:
5640 case IRPosition::IRP_FLOAT:
5641 case IRPosition::IRP_ARGUMENT:
5642 case IRPosition::IRP_CALL_SITE_ARGUMENT:
5643 case IRPosition::IRP_RETURNED:
5644 case IRPosition::IRP_CALL_SITE_RETURNED:
5645 case IRPosition::IRP_CALL_SITE:
5646 llvm_unreachable(
5647 "AAExecutionDomain can only be created for function position!");
5648 case IRPosition::IRP_FUNCTION:
5649 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5650 break;
5651 }
5652
5653 return *AA;
5654}
5655
5656AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5657 Attributor &A) {
5658 AAHeapToSharedFunction *AA = nullptr;
5659 switch (IRP.getPositionKind()) {
5660 case IRPosition::IRP_INVALID:
5661 case IRPosition::IRP_FLOAT:
5662 case IRPosition::IRP_ARGUMENT:
5663 case IRPosition::IRP_CALL_SITE_ARGUMENT:
5664 case IRPosition::IRP_RETURNED:
5665 case IRPosition::IRP_CALL_SITE_RETURNED:
5666 case IRPosition::IRP_CALL_SITE:
5667 llvm_unreachable(
5668 "AAHeapToShared can only be created for function position!");
5669 case IRPosition::IRP_FUNCTION:
5670 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5671 break;
5672 }
5673
5674 return *AA;
5675}
5676
5677AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5678 Attributor &A) {
5679 AAKernelInfo *AA = nullptr;
5687 llvm_unreachable("KernelInfo can only be created for function position!");
5689 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
5690 break;
5692 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5693 break;
5694 }
5695
5696 return *AA;
5697}
5698
5699AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5700 Attributor &A) {
5701 AAFoldRuntimeCall *AA = nullptr;
5710 llvm_unreachable("KernelInfo can only be created for call site position!");
5712 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
5713 break;
5714 }
5715
5716 return *AA;
5717}
5718
5724
5728
5730 LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M);
5731
5732 auto IsCalled = [&](Function &F) {
5733 if (Kernels.contains(&F))
5734 return true;
5735 return !F.use_empty();
5736 };
5737
5738 auto EmitRemark = [&](Function &F) {
5739 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
5740 ORE.emit([&]() {
5741 OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
5742 return ORA << "Could not internalize function. "
5743 << "Some optimizations may not be possible. [OMP140]";
5744 });
5745 };
5746
5748
5749
5750
5755 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
5758 InternalizeFns.insert(&F);
5759 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
5760 EmitRemark(F);
5761 }
5762 }
5763
5766 }
5767
5768
5772 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
5773 SCC.push_back(&F);
5774 Functions.insert(&F);
5775 }
5776
5777 if (SCC.empty())
5779
5781
5784 };
5785
5788
5792 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
5793
5794 unsigned MaxFixpointIterations =
5796
5806 return F.hasFnAttribute("kernel");
5807 };
5808
5810
5811 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5812 Changed |= OMPOpt.run(true);
5813
5814
5817 if (!F.isDeclaration() && !Kernels.contains(&F) &&
5818 !F.hasFnAttribute(Attribute::NoInline))
5819 F.addFnAttr(Attribute::AlwaysInline);
5820
5822 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);
5823
5826
5828}
5829
5834 if (!containsOpenMP(*C.begin()->getFunction().getParent()))
5835 return PreservedAnalyses::all();
5838
5840
5842 Function *Fn = &N.getFunction();
5843 SCC.push_back(Fn);
5844 }
5845
5846 if (SCC.empty())
5847 return PreservedAnalyses::all();
5848
5849 Module &M = *C.begin()->getFunction().getParent();
5850
5852 LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M);
5853
5856
5858
5861 };
5862
5866
5871 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5872 &Functions, PostLink);
5873
5874 unsigned MaxFixpointIterations =
5876
5885
5887
5888 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5889 bool Changed = OMPOpt.run(false);
5890
5892 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);
5893
5896
5898}
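// Helpers used by the rest of the OpenMP handling: kernel detection via the
// kernel calling convention and "kernel" attribute, and module-flag checks for
// OpenMP (device) modules.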
5899
5900 bool llvm::omp::isOpenMPKernel(Function &Fn) {
5901 return Fn.hasFnAttribute("kernel");
5902 }
5903
5904 KernelSet llvm::omp::getDeviceKernels(Module &M) {
5905 KernelSet Kernels;
5906
5907 for (Function &F : M)
5908 if (F.hasKernelCallingConv()) {
5909
5910
5911
5912 if (isOpenMPKernel(F)) {
5913 ++NumOpenMPTargetRegionKernels;
5914 Kernels.insert(&F);
5915 } else
5916 ++NumNonOpenMPTargetRegionKernels;
5917 }
5918
5919 return Kernels;
5920}
5921
5923 Metadata *MD = M.getModuleFlag("openmp");
5924 if (!MD)
5925 return false;
5926
5927 return true;
5928}
5929
5931 Metadata *MD = M.getModuleFlag("openmp-device");
5932 if (!MD)
5933 return false;
5934
5935 return true;
5936}