LLVM: lib/Transforms/IPO/OpenMPOpt.cpp Source File

//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// OpenMP specific optimizations:
//
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
// - Replacing globalized device memory with stack memory.
// - Replacing globalized device memory with shared memory.
// - Parallel region merging.
// - Transforming generic-mode device kernels to SPMD mode.
// - Specializing the state machine for generic-mode device kernels.
//
//===----------------------------------------------------------------------===//

#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#include <algorithm>
#include <optional>
#include <string>

using namespace llvm;
using namespace omp;

#define DEBUG_TYPE "openmp-opt"
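// Command line options that gate the individual OpenMP optimizations below.
// Each transformation can be disabled (or, for merging, enabled) separately
// for debugging and triaging purposes.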

static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging",
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

static cl::opt<bool>
    DisableInternalization("openmp-opt-disable-internalization",
                           cl::desc("Disable function internalization."),
                           cl::Hidden, cl::init(false));

static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptDeglobalization(
    "openmp-opt-disable-deglobalization",
    cl::desc("Disable OpenMP optimizations involving deglobalization."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptSPMDization(
    "openmp-opt-disable-spmdization",
    cl::desc("Disable OpenMP optimizations involving SPMD-ization."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptFolding(
    "openmp-opt-disable-folding",
    cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden,
    cl::init(false));

static cl::opt<bool> DisableOpenMPOptStateMachineRewrite(
    "openmp-opt-disable-state-machine-rewrite",
    cl::desc("Disable OpenMP optimizations that replace the state machine."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> DisableOpenMPOptBarrierElimination(
    "openmp-opt-disable-barrier-elimination",
    cl::desc("Disable OpenMP optimizations that eliminate barriers."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> PrintModuleAfterOptimizations(
    "openmp-opt-print-module-after",
    cl::desc("Print the current module after OpenMP optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> PrintModuleBeforeOptimizations(
    "openmp-opt-print-module-before",
    cl::desc("Print the current module before OpenMP optimizations."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> AlwaysInlineDeviceFunctions(
    "openmp-opt-inline-device",
    cl::desc("Inline all applicable functions on the device."), cl::Hidden,
    cl::init(false));

static cl::opt<bool>
    EnableVerboseRemarks("openmp-opt-verbose-remarks",
                         cl::desc("Enables more verbose remarks."), cl::Hidden,
                         cl::init(false));

static cl::opt<unsigned>
    SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden,
                          cl::desc("Maximal number of attributor iterations."),
                          cl::init(256));

static cl::opt<unsigned>
    SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
                      cl::desc("Maximum amount of shared memory to use."),
                      cl::init(std::numeric_limits<unsigned>::max()));

STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumNonOpenMPTargetRegionKernels,
          "Number of non-OpenMP target region kernels identified");
STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode without a state machine");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines without fallback");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");
STATISTIC(NumBytesMovedToSharedMemory,
          "Amount of memory pushed to shared memory");
STATISTIC(NumBarriersEliminated, "Number of redundant barriers eliminated");

#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif

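// The kernel environment is a global constant struct that the device runtime
// reads at kernel launch. It contains an Ident pointer and a configuration
// sub-struct (use of the generic state machine, nested parallelism, execution
// mode, and thread/team bounds). The macros below generate index constants
// and typed accessors for its members.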

#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr unsigned MEMBER##Idx = IDX;

KERNEL_ENVIRONMENT_IDX(Configuration, 0)
KERNEL_ENVIRONMENT_IDX(Ident, 1)

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr unsigned MEMBER##Idx = IDX;

KERNEL_ENVIRONMENT_CONFIGURATION_IDX(UseGenericStateMachine, 0)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MayUseNestedParallelism, 1)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(ExecMode, 2)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinThreads, 3)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxThreads, 4)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MinTeams, 5)
KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MaxTeams, 6)

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)                          \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx));     \
  }

KERNEL_ENVIRONMENT_GETTER(Ident, Constant)
KERNEL_ENVIRONMENT_GETTER(Configuration, ConstantStruct)

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)                        \
  ConstantInt *get##MEMBER##FromKernelEnvironment(                             \
      ConstantStruct *KernelEnvC) {                                            \
    ConstantStruct *ConfigC =                                                  \
        getConfigurationFromKernelEnvironment(KernelEnvC);                     \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx));   \
  }

KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(UseGenericStateMachine)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MayUseNestedParallelism)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(ExecMode)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinThreads)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxThreads)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MinTeams)
KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MaxTeams)

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER

GlobalVariable *
getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB) {
  constexpr int InitKernelEnvironmentArgNo = 0;
  return cast<GlobalVariable>(
      KernelInitCB->getArgOperand(InitKernelEnvironmentArgNo)
          ->stripPointerCasts());
}

ConstantStruct *getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB) {
  GlobalVariable *KernelEnvGV =
      getKernelEnvironementGVFromKernelInitCB(KernelInitCB);
  return cast<ConstantStruct>(KernelEnvGV->getInitializer());
}

namespace {

struct AAHeapToShared;

struct AAICVTracker;
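/// OpenMP specific information cache. It stores the runtime function
/// descriptions (RFIs) and internal control variable (ICV) descriptions that
/// the optimizations and Attributor runs below rely on.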

struct OMPInformationCache : public InformationCache {
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC,
                      bool OpenMPPostLink)
      : InformationCache(M, AG, Allocator, CGSCC), OMPBuilder(M),
        OpenMPPostLink(OpenMPPostLink) {

    OMPBuilder.Config.IsTargetDevice = isOpenMPDevice(OMPBuilder.M);
    const Triple T(OMPBuilder.M.getTargetTriple());
    switch (T.getArch()) {
    case llvm::Triple::nvptx:
    case llvm::Triple::nvptx64:
    case llvm::Triple::amdgcn:
      assert(OMPBuilder.Config.IsTargetDevice &&
             "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
      OMPBuilder.Config.IsGPU = true;
      break;
    default:
      OMPBuilder.Config.IsGPU = false;
      break;
    }
    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();
  }

  /// The internal control variable (ICV) description.
  struct InternalControlVarInfo {
    /// The kind, as described in the InternalControlVar enum.
    InternalControlVar Kind;

    /// The name of the ICV.
    StringRef Name;

    /// Environment variable associated with this ICV.
    StringRef EnvVarName;

    /// Initial value kind.
    ICVInitValue InitKind;

    /// Initial value.
    ConstantInt *InitValue;

    /// Setter RTL function associated with this ICV.
    RuntimeFunction Setter;

    /// Getter RTL function associated with this ICV.
    RuntimeFunction Getter;

    /// RTL function corresponding to the override clause of this ICV.
    RuntimeFunction Clause;
  };

  /// Struct to store the value the analysis found for a runtime function.
  struct RuntimeFunctionInfo {

    /// The kind, as described in the RuntimeFunction enum.
    RuntimeFunction Kind;

    /// The name of the function.
    StringRef Name;

    /// Flag to indicate a variadic function.
    bool IsVarArg;

    /// The return type of the function.
    Type *ReturnType;

    /// The argument types of the function.
    SmallVector<Type *, 8> ArgumentTypes;

    /// The declaration if available.
    Function *Declaration = nullptr;

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear the UsesMap for this runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F.
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use and forget the use if the result is
    /// true. The callback will be fed the function in which the use was
    /// encountered as second argument.
    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    /// Run the callback \p CB on each use within the function \p F and forget
    /// the use if the result is true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      ToBeDeleted.clear();

      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);

      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }

      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }

  private:
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

  public:
    /// Iterators over the functions with uses of this runtime function.
    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
  };

  /// An OpenMP-IR-Builder instance.
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from function declarations/definitions to their runtime enum type.
  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;
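  /// Initialize the ICV descriptions from OMPKinds.def: each ICV_DATA_ENV
  /// entry records an ICV's name, environment variable, and initial value,
  /// while ICV_RT_SET/ICV_RT_GET attach the runtime setter/getter functions.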

  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.Name = _Name;                                                          \
    ICV.Kind = Enum;                                                           \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    case ICV_LAST:                                                             \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType, and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    // TODO: We should output information to the user (under debug output
    //       and via remarks).

    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    auto *RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;

      ++RTFTyIt;
    }

    return true;
  }

  // Helper to collect all uses of the declaration in the UsesMap.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;
    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    // TODO: We directly convert uses into proper calls and unknown uses.
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of a runtime function.
  void recollectUsesForFunction(RuntimeFunction RTF) {
    auto &RFI = RFIs[RTF];
    RFI.clearUsesMap();
    collectUses(RFI, /*CollectStats*/ false);
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
  }

  // Helper function to set the calling convention of a callsite.
  void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
    if (auto *Fn = dyn_cast<Function>(Callee.getCallee()))
      CI->setCallingConv(Fn->getCallingConv());
  }

  /// Helper to determine if all calls to the runtime functions in \p Fns can
  /// be replaced, i.e., the functions are available to be called.
  bool runtimeFnsAvailable(ArrayRef<RuntimeFunction> Fns) {
    // We can always emit calls if we have not yet linked in the runtime.
    if (!OpenMPPostLink)
      return true;

    // Once the runtime has been linked in we cannot emit calls to any
    // undefined functions.
    for (RuntimeFunction Fn : Fns) {
      RuntimeFunctionInfo &RFI = RFIs[Fn];

      if (!RFI.Declaration || RFI.Declaration->isDeclaration())
        return false;
    }
    return true;
  }

  /// Helper to initialize all runtime function information for those defined
  /// in OMPKinds.def.
  void initializeRuntimeFunctions(Module &M) {

    // Helper macros for handling __VA_ARGS__ in OMP_RTL.
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    RTLFunctions.insert(F);                                                    \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // Remove the `noinline` attribute from `__kmpc`, `ompx`, and `omp_`
    // functions, except if `optnone` is present.
    if (isOpenMPDevice(M)) {
      for (Function &F : M) {
        for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
          if (F.hasFnAttribute(Attribute::NoInline) &&
              F.getName().starts_with(Prefix) &&
              !F.hasFnAttribute(Attribute::OptimizeNone))
            F.removeFnAttr(Attribute::NoInline);
      }
    }

    // TODO: We should attach the attributes defined in OMPKinds.def.
  }

  /// Collection of known OpenMP runtime functions.
  DenseSet<const Function *> RTLFunctions;

  /// Indicates if we have already linked in the OpenMP device library.
  bool OpenMPPostLink = false;
};

template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithSetVector : public BooleanState {
  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);
  }

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  }
  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);
  }

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  /// "Clamp" this state with \p RHS.
  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert_range(RHS.Set);
    return *this;
  }

private:
  /// A set to keep track of elements.
  SetVector<Ty> Set;

public:
  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
  typename decltype(Set)::const_iterator end() const { return Set.end(); }
};

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
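/// State to track information about a kernel: which kernel entries can reach
/// the associated function, which parallel regions it may reach, and whether
/// its instructions are compatible with SPMD-mode execution.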

struct KernelInfoState : AbstractState {
  /// Flag to track if we are at a fixpoint.
  bool IsAtFixpoint = false;

  /// The parallel regions (identified by the outlined parallel functions) that
  /// can be reached from the associated function.
  BooleanStateWithPtrSetVector<CallBase, false>
      ReachedKnownParallelRegions;

  /// State to track what parallel region we might reach.
  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  /// State to track if we are in SPMD-mode, assumed or known, and why we
  /// decided we cannot be. The instructions in the set are the ones that
  /// prevent SPMD-mode execution.
  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  /// The __kmpc_target_init call in this kernel, if any. If we find more than
  /// one we abort as the kernel is malformed.
  CallBase *KernelInitCB = nullptr;

  /// The constant kernel environement as taken from and passed to
  /// __kmpc_target_init.
  ConstantStruct *KernelEnvC = nullptr;

  /// The __kmpc_target_deinit call in this kernel, if any. If we find more
  /// than one we abort as the kernel is malformed.
  CallBase *KernelDeinitCB = nullptr;

  /// Flag to indicate if the associated function is a kernel entry.
  bool IsKernelEntry = false;

  /// State to track what kernel entries can reach the associated function.
  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  /// State to track the parallel levels we might reach.
  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  /// Flag that indicates if the kernel has nested parallelism.
  bool NestedParallelism = false;

  /// Abstract State interface
  ///{

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {
    if (!BestState)
      indicatePessimisticFixpoint();
  }

  /// See AbstractState::isValidState(...)
  bool isValidState() const override { return true; }

  /// See AbstractState::isAtFixpoint(...)
  bool isAtFixpoint() const override { return IsAtFixpoint; }

  /// See AbstractState::indicatePessimisticFixpoint(...)
  ChangeStatus indicatePessimisticFixpoint() override {
    IsAtFixpoint = true;
    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;
    return ChangeStatus::CHANGED;
  }

  /// See AbstractState::indicateOptimisticFixpoint(...)
  ChangeStatus indicateOptimisticFixpoint() override {
    IsAtFixpoint = true;
    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
    return ChangeStatus::UNCHANGED;
  }

  /// Return the assumed state.
  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  bool operator==(const KernelInfoState &RHS) const {
    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
      return false;
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
      return false;
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
      return false;
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
      return false;
    if (ParallelLevels != RHS.ParallelLevels)
      return false;
    if (NestedParallelism != RHS.NestedParallelism)
      return false;
    return true;
  }

  /// Returns true if this kernel contains any OpenMP parallel regions.
  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();
  }

  /// Return empty set as the best state of potential values.
  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();
  }

  /// Return full set as the worst state of potential values.
  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  /// "Clamp" this state with \p KIS.
  KernelInfoState operator^=(const KernelInfoState &KIS) {
    // Do not merge two different kernels.
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelInitCB = KIS.KernelInitCB;
    }
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelDeinitCB = KIS.KernelDeinitCB;
    }
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
        llvm_unreachable("Kernel that calls another kernel violates OpenMP-Opt "
                         "assumptions.");
      KernelEnvC = KIS.KernelEnvC;
    }
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;
    return *this;
  }

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  }

  ///}
};

/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
  /// Physical array (in the IR).
  AllocaInst *Array = nullptr;
  /// Mapped values.
  SmallVector<Value *, 8> StoredValues;
  /// Last stores made in the offload array.
  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  /// Initializes the OffloadArray with the values stored in \p Array before
  /// instruction \p Before is reached. Returns false if the initialization
  /// fails.
  /// This MUST be used immediately before the call to the OpenMP runtime
  /// function and not somewhere around the call site.
  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
      return false;

    if (!getValues(Array, Before))
      return false;

    this->Array = &Array;
    return true;
  }

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

private:
  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  /// \p Array, leaving StoredValues with the values stored before the
  /// instruction \p Before is reached.
  bool getValues(AllocaInst &Array, Instruction &Before) {
    // Initialize the containers.
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    // TODO: This assumes the instruction \p Before is in the same
    //  BasicBlock as \p Array. Make it general, for any control flow graph.
    BasicBlock *BB = Array.getParent();
    if (BB != Before.getParent())
      return false;

    const DataLayout &DL = Array.getDataLayout();
    const unsigned int PointerSize = DL.getPointerSize();

    for (Instruction &I : *BB) {
      if (&I == &Before)
        break;

      if (!isa<StoreInst>(&I))
        continue;

      auto *S = cast<StoreInst>(&I);
      int64_t Offset = -1;
      auto *Dst =
          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
      if (Dst == &Array) {
        int64_t Idx = Offset / PointerSize;
        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
        LastAccesses[Idx] = S;
      }
    }

    return isFilled();
  }

  /// Returns true if all values in StoredValues and
  /// LastAccesses are not nullptrs.
  bool isFilled() {
    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
        return false;
    }

    return true;
  }
};
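/// The driver for the OpenMP optimizations: it scans the SCC for runtime
/// calls and applies the individual transformations implemented below.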

struct OpenMPOpt {

  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  /// Check if any remarks are enabled for openmp-opt.
  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
  }

  /// Run all OpenMP optimizations on the underlying SCC.
  bool run(bool IsModulePass) {
    if (SCC.empty())
      return false;

    bool Changed = false;

    LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                      << " functions\n");

    if (IsModulePass) {
      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

      // TODO: This should be folded into buildCustomStateMachine.
      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();
    } else {
      if (PrintICVValues)
        printICVs();
      if (PrintOpenMPKernels)
        printKernels();

      Changed |= runAttributor(IsModulePass);

      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      if (HideMemoryTransferLatency)
        Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();
      if (EnableParallelRegionMerging) {
        if (mergeParallelRegions()) {
          deduplicateRuntimeCalls();
          Changed = true;
        }
      }
    }

    if (OMPInfoCache.OpenMPPostLink)
      Changed |= removeRuntimeSymbols();

    return Changed;
  }

  /// Print initial ICV values for testing.
  /// FIXME: This should be done from the Attributor once it is added.
  void printICVs() const {
    InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
                                 ICV_proc_bind};

    for (Function *F : SCC) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                     << " Value: "
                     << (ICVInfo.InitValue
                             ? toString(ICVInfo.InitValue->getValue(), 10, true)
                             : "IMPLEMENTATION_DEFINED");
        };

        emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
      }
    }
  }

  /// Print OpenMP GPU kernels for testing.
  void printKernels() const {
    for (Function *F : SCC) {
      if (!omp::isOpenMPKernel(*F))
        continue;

      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "OpenMP GPU kernel "
                   << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
      };

      emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
    }
  }

  /// Return the call if \p U is a callee use in a regular call. If \p RFI is
  /// given it has to be the callee or a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(U.getUser());
    if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
        (!RFI ||
         (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
      return CI;
    return nullptr;
  }

  /// Return the call if \p V is a regular call. If \p RFI is given it has to
  /// be the callee or a nullptr is returned.
  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && !CI->hasOperandBundles() &&
        (!RFI ||
         (RFI->Declaration && CI->getCalledFunction() == RFI->Declaration)))
      return CI;
    return nullptr;
  }
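// Parallel region transformations follow. Merging outlines the span of
// adjacent __kmpc_fork_call sites into a single parallel region; code between
// the original calls is wrapped in a master construct with a trailing barrier
// so it still executes exactly once.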

private:
  /// Merge parallel regions when it is safe.
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;
    using InsertPointTy = OpenMPIRBuilder::InsertPointTy;

    // Check if there are any __kmpc_fork_call calls to merge.
    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    // Unmergable calls that prevent merging a parallel region.
    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
    };

    bool Changed = false;
    LoopInfo *LI = nullptr;
    DominatorTree *DT = nullptr;

    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
      BasicBlock *CGEndBB =
          SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      CGStartBB->getTerminator()->setSuccessor(0, StartBB);
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);
      return Error::success();
    };

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
                      Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &Inner;
      return CodeGenIP;
    };

    auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

    /// Create a sequential execution region within a merged parallel region,
    /// encapsulated in a master construct with a barrier for synchronization.
    auto CreateSequentialRegion = [&](Function *OuterFn,
                                      BasicBlock *OuterPredBB,
                                      Instruction *SeqStartI,
                                      Instruction *SeqEndI) {
      // Isolate the instructions of the sequential region to a separate
      // block.
      BasicBlock *ParentBB = SeqStartI->getParent();
      BasicBlock *SeqEndBB =
          SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
      BasicBlock *SeqAfterBB =
          SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
      BasicBlock *SeqStartBB =
          SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");

      assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
             "Expected a different CFG");
      const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
      ParentBB->getTerminator()->eraseFromParent();

      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
        BasicBlock *CGEndBB =
            SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
        CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
        SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
        return Error::success();
      };
      auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

      // Find outputs from the sequential region to outside users and
      // broadcast their values to them.
      for (Instruction &I : *SeqStartBB) {
        SmallPtrSet<Instruction *, 4> OutsideUsers;
        for (User *Usr : I.users()) {
          Instruction &UsrI = *cast<Instruction>(Usr);
          // Ignore outputs to LT intrinsics, code extraction for the merged
          // parallel region will fix them.
          if (UsrI.isLifetimeStartOrEnd())
            continue;

          if (UsrI.getParent() != SeqStartBB)
            OutsideUsers.insert(&UsrI);
        }

        if (OutsideUsers.empty())
          continue;

        // Emit an alloca in the outer region to store the broadcasted value.
        const DataLayout &DL = M.getDataLayout();
        AllocaInst *AllocaI = new AllocaInst(
            I.getType(), DL.getAllocaAddrSpace(), nullptr,
            I.getName() + ".seq.output.alloc", OuterFn->front().begin());

        // Emit a store instruction in the sequential BB to update the value.
        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());

        // Emit a load instruction and replace the use of the output value
        // with it.
        for (Instruction *UsrI : OutsideUsers) {
          LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
                                         I.getName() + ".seq.output.load",
                                         UsrI->getIterator());
          UsrI->replaceUsesOfWith(&I, LoadI);
        }
      }

      OpenMPIRBuilder::LocationDescription Loc(
          InsertPointTy(ParentBB, ParentBB->end()), DL);
      OpenMPIRBuilder::InsertPointTy SeqAfterIP = cantFail(
          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
      cantFail(
          OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));

      BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());

      LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
                        << "\n");
    };

    // Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
    // contained in BB and only separated by instructions that can be
    // redundantly executed in parallel. The block BB is split before the first
    // call (in MergableCIs) and after the last so the entire region we merge
    // into a single parallel region is contained in a single basic block
    // without any other instructions. We use the OpenMPIRBuilder to outline
    // that block and call the resulting function via __kmpc_fork_call.
    auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
                     BasicBlock *BB) {
      // TODO: Change the interface to allow single CIs expanded, e.g, to
      // include an outer loop.
      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

      auto Remark = [&](OptimizationRemark OR) {
        OR << "Parallel region merged with parallel region"
           << (MergableCIs.size() > 2 ? "s" : "") << " at ";
        for (auto *CI : llvm::drop_begin(MergableCIs)) {
          OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
          if (CI != MergableCIs.back())
            OR << ", ";
        }
        return OR << ".";
      };

      emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);

      Function *OriginalFn = BB->getParent();
      LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
                        << " parallel regions in " << OriginalFn->getName()
                        << "\n");

      // Isolate the calls to merge in a separate block.
      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
      BasicBlock *AfterBB =
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
      StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
                           "omp.par.merged");

      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();
      BB->getTerminator()->eraseFromParent();

      // Create sequential regions for sequential instructions that are
      // in-between mergable parallel regions.
      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
           It != End; ++It) {
        Instruction *ForkCI = *It;
        Instruction *NextForkCI = *(It + 1);

        // Continue if there are no in-between instructions.
        if (ForkCI->getNextNode() == NextForkCI)
          continue;

        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
                               NextForkCI->getPrevNode());
      }

      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
                                               DL);
      IRBuilder<>::InsertPoint AllocaIP(
          &OriginalFn->getEntryBlock(),
          OriginalFn->getEntryBlock().getFirstInsertionPt());
      // Create the merged parallel region with default proc binding, to
      // avoid overriding binding settings, and without explicit cancellation.
      OpenMPIRBuilder::InsertPointTy AfterIP =
          cantFail(OMPInfoCache.OMPBuilder.createParallel(
              Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
              OMP_PROC_BIND_default, /* IsCancellable */ false));
      BranchInst::Create(AfterBB, AfterIP.getBlock());

      // Perform the actual outlining.
      OMPInfoCache.OMPBuilder.finalize(OriginalFn);

      Function *OutlinedFn = MergableCIs.front()->getCaller();

      // Replace the __kmpc_fork_call calls with direct calls to the outlined
      // callbacks.
      SmallVector<Value *, 8> Args;
      for (auto *CI : MergableCIs) {
        Value *Callee = CI->getArgOperand(CallbackCalleeOperand);
        FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
        Args.clear();
        Args.push_back(OutlinedFn->getArg(0));
        Args.push_back(OutlinedFn->getArg(1));
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
             ++U)
          Args.push_back(CI->getArgOperand(U));

        CallInst *NewCI =
            CallInst::Create(FT, Callee, Args, "", CI->getIterator());
        if (CI->getDebugLoc())
          NewCI->setDebugLoc(CI->getDebugLoc());

        // Forward parameter attributes from the callback to the callee.
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
             ++U)
          for (const Attribute &A : CI->getAttributes().getParamAttrs(U))
            NewCI->addParamAttr(
                U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        // Emit an explicit barrier to replace the implicit fork-join barrier.
        if (CI != MergableCIs.back()) {
          // TODO: Remove barrier if the merged parallel region includes the
          // 'nowait' clause.
          cantFail(OMPInfoCache.OMPBuilder.createBarrier(
              InsertPointTy(NewCI->getParent(),
                            NewCI->getNextNode()->getIterator()),
              OMPD_parallel));
        }

        CI->eraseFromParent();
      }

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);

      NumOpenMPParallelRegionsMerged += MergableCIs.size();

      return true;
    };

    // Identify all parallel regions and group them by basic block.
    auto DetectPRsCB = [&](Use &U, Function &F) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      BB2PRMap[CI->getParent()].insert(CI);

      return false;
    };

    BB2PRMap.clear();
    RFI.foreachUse(SCC, DetectPRsCB);
    SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;

    // Find mergable parallel regions within a basic block that are
    // separated only by instructions that can be executed redundantly.
    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();
      if (CIs.size() < 2)
        continue;

      BasicBlock *BB = It.getFirst();
      SmallVector<CallInst *, 4> MergableCIs;

      /// Returns true if the instruction is mergable, false otherwise.
      /// A terminator instruction is unmergable by definition since merging
      /// works within a BB. Instructions before the mergable region are
      /// mergable if they are not calls to OpenMP runtime functions that may
      /// set different execution parameters for subsequent parallel regions.
      /// Instructions in-between parallel regions are mergable if they are not
      /// calls to any non-intrinsic function since that may call a function
      /// that requires constructs currently not supported for merging, i.e.,
      /// critical sections.
      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        // We do not merge across BBs, hence return false (unmergable) if the
        // instruction is a terminator.
        if (I.isTerminator())
          return false;

        if (!isa<CallInst>(&I))
          return true;

        CallInst *CI = cast<CallInst>(&I);
        if (IsBeforeMergableRegion) {
          Function *CalledFunction = CI->getCalledFunction();
          if (!CalledFunction)
            return false;
          // The called function is not mergable if it may set different
          // execution parameters (e.g., proc bind or number of threads) for
          // subsequent parallel regions.
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)
              return false;
          }
        } else {
          // Return false (unmergable) if the callee is not an intrinsic,
          // since it may call a function containing constructs we cannot
          // merge across, e.g., critical sections.
          if (!isa<IntrinsicInst>(CI))
            return false;
        }

        return true;
      };

      // Find the maximal number of parallel region CIs that are safe to merge.
      for (auto It = BB->begin(), End = BB->end(); It != End;) {
        Instruction &I = *It;
        ++It;

        if (CIs.count(&I)) {
          MergableCIs.push_back(cast<CallInst>(&I));
          continue;
        }

        // Continue expanding if the instruction is mergable.
        if (IsMergable(I, MergableCIs.empty()))
          continue;

        // Forward the instruction iterator to skip the next parallel region
        // since there is an unmergable instruction which can affect it.
        for (; It != End; ++It) {
          Instruction &SkipI = *It;
          if (CIs.count(&SkipI)) {
            LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
                              << " due to " << I << "\n");
            ++It;
            break;
          }
        }

        // Store the mergable regions found.
        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
          LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
                            << " parallel regions in block " << BB->getName()
                            << " of function " << BB->getParent()->getName()
                            << "\n";);
        }

        MergableCIs.clear();
      }

      if (!MergableCIsVector.empty()) {
        Changed = true;

        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
        MergableCIsVector.clear();
      }
    }

    if (Changed) {
      /// Re-collect uses for the fork calls, the emitted barrier calls, and
      /// the emitted master/end_master calls.
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
    }

    return Changed;
  }

  /// Try to delete parallel regions if possible.
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)
      return false;

    bool Changed = false;
    auto DeleteCallCB = [&](Use &U, Function &) {
      CallInst *CI = getCallIfRegularCall(U);
      if (!CI)
        return false;
      auto *Fn = dyn_cast<Function>(
          CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
      if (!Fn)
        return false;
      if (!Fn->onlyReadsMemory())
        return false;
      if (!Fn->hasFnAttribute(Attribute::WillReturn))
        return false;

      LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
                        << CI->getCaller()->getName() << "\n");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Removing parallel region with no side-effects.";
      };
      emitRemark<OptimizationRemark>(CI, "OMP160", Remark);

      CI->eraseFromParent();
      Changed = true;
      ++NumOpenMPParallelRegionsDeleted;
      return true;
    };

    RFI.foreachUse(SCC, DeleteCallCB);

    return Changed;
  }
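  // Runtime query calls such as omp_get_thread_num() return the same value
  // within a region, so repeated calls in one function can be collapsed into
  // a single call whose result is reused; this is done below.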

  /// Try to deduplicate and combine runtime calls.
  bool deduplicateRuntimeCalls() {
    bool Changed = false;

    RuntimeFunction DeduplicableRuntimeCallIDs[] = {
        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    // Global-tid is handled separately.
    SmallSetVector<Value *, 16> GTIdArgs;
    collectGlobalThreadIdArguments(GTIdArgs);
    LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
                      << " global thread ID arguments\n");

    for (Function *F : SCC) {
      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      // __kmpc_global_thread_num is special as we can replace it with an
      // argument in enough cases to make it worth trying.
      Value *GTIdArg = nullptr;
      for (Argument &Arg : F->args())
        if (GTIdArgs.count(&Arg)) {
          GTIdArg = &Arg;
          break;
        }
      Changed |= deduplicateRuntimeCalls(
          *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
    }

    return Changed;
  }

  /// Tries to remove known runtime symbols that are no longer needed.
  bool removeRuntimeSymbols() {
    // The RPC client symbol is defined in the device runtime libraries. If it
    // is unused in the final module, the kernel does not need the RPC server
    // and we can delete the symbol.
    if (GlobalVariable *GV = M.getNamedGlobal("__llvm_rpc_client")) {
      if (GV->hasNUsesOrMore(1))
        return false;

      GV->replaceAllUsesWith(PoisonValue::get(GV->getType()));
      GV->eraseFromParent();
      return true;
    }
    return false;
  }
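  // The "issue"/"wait" split below turns a synchronous mapper call into an
  // asynchronous transfer plus an explicit wait, letting the transfer overlap
  // with host code between the two points.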

  /// Tries to hide the latency of runtime calls that involve host to device
  /// memory transfers by splitting them into their "issue" and "wait"
  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
  /// moved downwards as much as possible. The "issue" issues the memory
  /// transfer asynchronously, returning a handle. The "wait" waits on the
  /// returned handle for the memory transfer to finish.
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
    bool Changed = false;
    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
      auto *RTCall = getCallIfRegularCall(U, &RFI);
      if (!RTCall)
        return false;

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
        return false;

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      // TODO: Check if the call can also be moved upwards.
      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

      Changed |= WasSplit;
      return WasSplit;
    };
    if (OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___tgt_target_data_begin_mapper_issue,
             OMPRTL___tgt_target_data_begin_mapper_wait}))
      RFI.foreachUse(SCC, SplitMemTransfers);

    return Changed;
  }

  void analysisGlobalization() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

    auto CheckGlobalization = [&](Use &U, Function &Decl) {
      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
        auto Remark = [&](OptimizationRemarkMissed ORM) {
          return ORM
                 << "Found thread data sharing on the GPU. "
                 << "Expect degraded performance due to data globalization.";
        };
        emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
      }

      return false;
    };

    RFI.foreachUse(SCC, CheckGlobalization);
  }

  /// Maps the values stored in the offload arrays passed as arguments to
  /// \p RuntimeCall into the offload arrays in \p OAs.
  bool getValuesInOffloadArrays(CallInst &RuntimeCall,
                                MutableArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    // A runtime call that involves memory offloading looks something like:
    // call void @__tgt_target_data_begin_mapper(arg0, arg1,
    //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
    // ...)
    // So, the idea is to access the allocas that allocate space for these
    // offload arrays: offload_baseptrs, offload_ptrs, offload_sizes.
    // Therefore:
    // i8** %offload_baseptrs.
    Value *BasePtrsArg =
        RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
    // i8** %offload_ptrs.
    Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
    // i64* %offload_sizes.
    Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

    // Get values stored in **offload_baseptrs.
    Value *V = getUnderlyingObject(BasePtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *BasePtrsArray = cast<AllocaInst>(V);
    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
      return false;

    // Get values stored in **offload_ptrs.
    V = getUnderlyingObject(PtrsArg);
    if (!isa<AllocaInst>(V))
      return false;
    auto *PtrsArray = cast<AllocaInst>(V);
    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
      return false;

    // Get values stored in *offload_sizes.
    V = getUnderlyingObject(SizesArg);
    // If it's a [constant] global array don't analyze it.
    if (isa<GlobalValue>(V))
      return isa<Constant>(V);
    if (!isa<AllocaInst>(V))
      return false;

    auto *SizesArray = cast<AllocaInst>(V);
    if (!OAs[2].initialize(*SizesArray, RuntimeCall))
      return false;

    return true;
  }

  /// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
  /// For now this is a way to test that the function getValuesInOffloadArrays
  /// is working properly.
  /// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
  void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
      BP->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");
    ValuesStr.clear();

    for (auto *P : OAs[1].StoredValues) {
      P->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << ValuesStr << "\n");
    ValuesStr.clear();

    for (auto *S : OAs[2].StoredValues) {
      S->print(Printer);
      Printer << Separator;
    }
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
  }

  /// Returns the instruction where the "wait" counterpart of \p RuntimeCall
  /// can be moved. Returns nullptr if the movement is not possible, or not
  /// worth it.
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
    // FIXME: This traverses only the BasicBlock where RuntimeCall is.
    //  Make it traverse the CFG.

    Instruction *CurrentI = &RuntimeCall;
    bool IsWorthIt = false;
    while ((CurrentI = CurrentI->getNextNode())) {

      // TODO: Once we detect the regions to be offloaded we should use the
      //  alias analysis manager to check if CurrentI may modify one of
      //  the offloaded regions.
      if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
        if (IsWorthIt)
          return CurrentI;

        return nullptr;
      }

      // FIXME: For now, moving over anything without side effects is
      //  considered worth it.
      IsWorthIt = true;
    }

    // Return end of BasicBlock.
    return RuntimeCall.getParent()->getTerminator();
  }

  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {
    // Create a stack allocated handle (__tgt_async_info) at the beginning of
    // the function. It is used for storing information of the async transfer,
    // allowing to wait on it later.
    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    Function *F = RuntimeCall.getCaller();
    BasicBlock &Entry = F->getEntryBlock();
    IRBuilder.Builder.SetInsertPoint(&Entry,
                                     Entry.getFirstNonPHIOrDbgOrAlloca());
    Value *Handle = IRBuilder.Builder.CreateAlloca(
        IRBuilder.AsyncInfo, nullptr, "handle");
    Handle =
        IRBuilder.Builder.CreateAddrSpaceCast(Handle, IRBuilder.AsyncInfoPtr);

    // Add "issue" runtime call declaration:
    // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
    //   i8**, i8**, i64*, i64*)
    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    // Change RuntimeCall call site for its asynchronous version.
    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    CallInst *IssueCallsite = CallInst::Create(IssueDecl, Args, /*NameStr=*/"",
                                               RuntimeCall.getIterator());
    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
    RuntimeCall.eraseFromParent();

    // Add "wait" runtime call declaration:
    // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
        IssueCallsite->getArgOperand(
            OffloadArray::DeviceIDArgNum), // device_id.
        Handle                             // handle to wait on.
    };
    CallInst *WaitCallsite = CallInst::Create(
        WaitDecl, WaitParams, /*NameStr=*/"", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);

    return true;
  }

  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

    // TODO: Figure out how to actually combine multiple debug locations. For
    //       now we just keep an existing one if there is a single choice.
    if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
      SingleChoice = !CurrentIdent;
      return NextIdent;
    }
    return nullptr;
  }

  /// Return an `struct ident_t*` value that represents the ones used in the
  /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
  /// return a local `struct ident_t*`. For now, if we cannot find a suitable
  /// return value we create one from scratch. We also do not yet combine
  /// information, e.g., the source locations, see combinedIdentStruct.
  Value *
  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;
    auto CombineIdentStruct = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
        return false;
      Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
                                  /* GlobalOnly */ true, SingleChoice);
      return false;
    };
    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      // The IRBuilder uses the insertion block to get to the module, this is
      // unfortunate but we work around it for now.
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
        OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      // Create a fallback location if none was found.
      // TODO: Use the debug locations of the calls instead.
      uint32_t SrcLocStrSize;
      Constant *Loc =
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
    }
    return Ident;
  }

  /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
  /// \p ReplVal if given.
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)
      return false;

    LLVM_DEBUG(
        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

    assert((!ReplVal || (isa<Argument>(ReplVal) &&
                         cast<Argument>(ReplVal)->getParent() == &F)) &&
           "Unexpected replacement value!");

    // TODO: Use dominance to find a good position instead.
    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (NumArgs == 0)
        return true;
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
        return false;
      for (unsigned U = 1; U < NumArgs; ++U)
        if (isa<Instruction>(CB.getArgOperand(U)))
          return false;
      return true;
    };

    if (!ReplVal) {
      auto *DT =
          OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);
      if (!DT)
        return false;
      Instruction *IP = nullptr;
      for (Use *U : *UV) {
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (IP)
            IP = DT->findNearestCommonDominator(IP, CI);
          else
            IP = CI;
          if (!CanBeMoved(*CI))
            continue;
          if (!ReplVal)
            ReplVal = CI;
        }
      }
      if (!ReplVal)
        return false;
      assert(IP && "Expected insertion point!");
      cast<Instruction>(ReplVal)->moveBefore(IP->getIterator());
    }

    // If we use a call as a replacement value we need to make sure the ident
    // is valid at the new location. For now we just pick a global one, either
    // existing and used by one of the calls, or created from scratch.
    if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
      if (!CI->arg_empty() &&
          CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
        Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                      /* GlobalOnly */ true);
        CI->setArgOperand(0, Ident);
      }
    }

    bool Changed = false;
    auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)
        return false;
      assert(CI->getCaller() == &F && "Unexpected call!");

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
      };
      if (CI->getDebugLoc())
        emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
      else
        emitRemark<OptimizationRemark>(&F, "OMP170", Remark);

      CI->replaceAllUsesWith(ReplVal);
      CI->eraseFromParent();
      ++NumOpenMPRuntimeCallsDeduplicated;
      Changed = true;
      return true;
    };
    RFI.foreachUse(SCC, ReplaceAndDeleteCB);

    return Changed;
  }

  /// Collect arguments that represent the global thread id in \p GTIdArgs.
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
    // TODO: Below we basically perform a fixpoint iteration with a pessimistic
    //       initialization. We could define an AbstractAttribute instead and
    //       run the Attributor here once it can be run as an SCC pass.

    // Helper to check the argument \p ArgNo at all call sites of \p F for
    // a GTId.
    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())
        return false;
      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          Value *ArgOp = CI->getArgOperand(ArgNo);
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
            continue;
        }
        return false;
      }
      return true;
    };

    // Helper to identify uses of a GTId as GTId arguments.
    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
        if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
          if (CI->isArgOperand(&U))
            if (Function *Callee = CI->getCalledFunction())
              if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
                GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
    };

    // The argument users of __kmpc_global_thread_num calls are GTIds.
    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
        AddUserArgs(*CI);
      return false;
    });

    // Transitively search for more arguments by looking at the users of the
    // ones we know already. During the search the GTIdArgs vector is extended
    // so we cannot cache the size nor can we use a range based for.
    for (unsigned U = 0; U < GTIdArgs.size(); ++U)
      AddUserArgs(*GTIdArgs[U]);
  }

  /// Kernel (=GPU) optimizations and utility functions
  ///
  ///{

  /// Cache to remember the unique kernel for a function.
  DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;

  /// Find the unique kernel that will execute \p F, if any.
  Kernel getUniqueKernelFor(Function &F);

  /// Find the unique kernel that will execute \p I, if any.
  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());
  }

  /// Rewrite the device (=GPU) code state machine, replacing indirect uses of
  /// parallel region functions with a comparable global identifier.
  bool rewriteDeviceCodeStateMachine();

  ///}

  /// Emit a remark generically.
  ///
  /// This template function can be used to generically emit a remark. The
  /// RemarkKind should be one of the following:
  ///   - OptimizationRemark to indicate a successful optimization attempt
  ///   - OptimizationRemarkMissed to report a failed optimization attempt
  ///   - OptimizationRemarkAnalysis to provide additional information about an
  ///     optimization attempt
  ///
  /// The remark is built using a callback function \p RemarkCB that takes a
  /// RemarkKind as input and returns a RemarkKind.

  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Instruction *I, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    Function *F = I->getParent()->getParent();
    auto &ORE = OREGetter(F);

    if (RemarkName.starts_with("OMP"))
      ORE.emit([&]() {
        return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
               << " [" << RemarkName << "]";
      });
    else
      ORE.emit(
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
  }

  /// Emit a remark on a function.
  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Function *F, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);

    if (RemarkName.starts_with("OMP"))
      ORE.emit([&]() {
        return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
               << " [" << RemarkName << "]";
      });
    else
      ORE.emit(
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
  }

  /// The underlying module.
  Module &M;

  /// The SCC we are operating on.
  SmallVectorImpl<Function *> &SCC;

  /// Callback to update the call graph, the first argument is a removed call,
  /// the second an optional replacement call.
  CallGraphUpdater &CGUpdater;

  /// Callback to get an OptimizationRemarkEmitter from a Function *.
  OptimizationRemarkGetter OREGetter;

  /// OpenMP-specific information cache. Also used for Attributor runs.
  OMPInformationCache &OMPInfoCache;

  /// Attributor instance.
  Attributor &A;

  /// Helper function to run Attributor on SCC.
  bool runAttributor(bool IsModulePass) {
    if (SCC.empty())
      return false;

    registerAAs(IsModulePass);

    ChangeStatus Changed = A.run();

    LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
                      << " functions, result: " << Changed << ".\n");

    if (Changed == ChangeStatus::CHANGED)
      OMPInfoCache.invalidateAnalyses();

    return Changed == ChangeStatus::CHANGED;
  }

  /// Populate the Attributor with abstract attribute opportunities in the
  /// functions.
  void registerAAs(bool IsModulePass);

public:
  /// Callback to register AAs for live functions, including internal functions
  /// marked live during the traversal.
  static void registerAAsForFunction(Attributor &A, const Function &F);
};

Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
      !OMPInfoCache.CGSCC->contains(&F))
    return nullptr;

  // Use a scope to keep the lifetime of the CachedKernel short.
  {
    std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    if (CachedKernel)
      return *CachedKernel;

    // TODO: We should use an AA to create an (optimistic and callback
    //       call-aware) call graph. For now we stick to simple patterns that
    //       are less powerful, basically the worst fixpoint.
    if (isOpenMPKernel(F)) {
      CachedKernel = Kernel(&F);
      return *CachedKernel;
    }

    CachedKernel = nullptr;
    if (!F.hasLocalLinkage()) {

      // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Potentially unknown OpenMP target region caller.";
      };
      emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);

      return nullptr;
    }
  }

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      // Allow use in equality comparisons.
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      return nullptr;
    }
    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      // Allow direct calls.
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
      // Allow the use in __kmpc_parallel_51 calls.
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);
      return nullptr;
    }
    // Disallow every other use.
    return nullptr;
  };

  // TODO: In the future we want to track more than just a unique kernel.
  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));
  });

  Kernel K = nullptr;
  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  // Cache the result.
  UniqueKernelMap[&F] = K;

  return K;
}
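/// Replace the function pointer uses of parallel regions in the GPU state
/// machine with a private global "ID": the custom state machine compares
/// against the ID and calls the parallel region function directly.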

bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

  bool Changed = false;
  if (!KernelParallelRFI)
    return Changed;

  // If we have disabled state machine rewrites, do not make a custom one.
  if (DisableOpenMPOptStateMachineRewrite)
    return Changed;

  for (Function *F : SCC) {

    // Check if the function is a use in a __kmpc_parallel_51 call at
    // all.
    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U)) {
          ++NumDirectCalls;
          return;
        }

      if (isa<ICmpInst>(U.getUser())) {
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }

      // Find wrapper functions that represent parallel kernels.
      CallInst *CI =
          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
          CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }
      UnknownUse = true;
    });

    // Do not emit a remark if we haven't seen a __kmpc_parallel_51
    // use.
    if (!KernelParallelUse)
      continue;

    // If this ever hits, we should investigate.
    // TODO: Checking the number of uses is not a necessary restriction and
    // should be lifted.
    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";
      };
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
      continue;
    }

    // Even if we have __kmpc_parallel_51 calls, we (for now) give
    // up if the function is not called from a unique kernel.
    Kernel K = getUniqueKernelFor(*F);
    if (!K) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";
      };
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
      continue;
    }

    // We now know F is a parallel body function called only from the kernel K.
    // We also identified the state machine uses in which we replace the
    // function pointer by a new global symbol for identification purposes.
    // This ensures only direct calls to the function are left.

    Module &M = *F->getParent();
    Type *Int8Ty = Type::getInt8Ty(M.getContext());

    auto *ID = new GlobalVariable(
        M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
        UndefValue::get(Int8Ty), F->getName() + ".ID");

    for (Use *U : ToBeReplacedStateMachineUses)
      U->set(ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;

    Changed = true;
  }

  return Changed;
}
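/// Abstract attributes for ICV tracking follow. They model the value of an
/// internal control variable (currently only nthreads) at program points so
/// that runtime getter calls can be folded to a known value.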

/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Returns true if value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if value is known to be tracked.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for specific \p ICV.
  virtual std::optional<Value *> getReplacementValue(InternalControlVar ICV,
                                                     const Instruction *I,
                                                     Attributor &A) const {
    return std::nullopt;
  }

  /// Return an assumed unique ICV value if a single candidate is found. If
  /// there cannot be one, return a nullptr. If it is not clear yet, return
  /// std::nullopt.
  virtual std::optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const = 0;

  // Currently only nthreads is being tracked.
  // This array will only grow with time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAICVTracker
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address).
  static const char ID;
};

struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with a better string.
  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunction";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program points.
  EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus HasChanged = ChangeStatus::UNCHANGED;

    Function *F = getAnchorScope();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    for (InternalControlVar ICV : TrackableICVs) {
      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
      auto TrackValues = [&](Use &U, Function &) {
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
        if (!CI)
          return false;

        // FIXME: handle setters with more than 1 argument.
        /// Track the new value.
        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        return false;
      };

      auto CallCheck = [&](Instruction &I) {
        std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
        if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

        return true;
      };

      // Track all changes of an ICV.
      SetterRFI.foreachUse(TrackValues, F);

      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                UsedAssumedInformation,
                                /* CheckBBLivenessOnly */ true);

      /// TODO: Figure out a way to avoid adding an entry in
      /// ICVReplacementValuesMap.
      Instruction *Entry = &F->getEntryBlock().front();
      if (HasChanged == ChangeStatus::CHANGED)
        ValuesMap.try_emplace(Entry);
    }

    return HasChanged;
  }

  /// Helper to check if \p I is a call and get the value for it if it is
  /// unique.
  std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
                                         InternalControlVar &ICV) const {

    const auto *CB = dyn_cast<CallBase>(&I);
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines") ||
        CB->hasFnAttr("no_openmp_constructs"))
      return std::nullopt;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    // Indirect call, assume ICV changes.
    if (CalledFunction == nullptr)
      return nullptr;
    if (CalledFunction == GetterRFI.Declaration)
      return std::nullopt;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(&I))
        return ICVReplacementValuesMap[ICV].lookup(&I);

      return nullptr;
    }

    // Since we don't know, assume it changes the ICV.
    if (CalledFunction->isDeclaration())
      return nullptr;

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);

    if (ICVTrackingAA->isAssumedTracked()) {
      std::optional<Value *> URV =
          ICVTrackingAA->getUniqueReplacementValue(ICV);
      if (!URV || (*URV && AA::isValidAtPosition(AA::ValueAndContext(**URV, I),
                                                 OMPInfoCache)))
        return URV;
    }

    // If we don't know, assume it changes.
    return nullptr;
  }

  // We don't check for a unique value for a function, so return std::nullopt.
  std::optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return std::nullopt;
  }

  /// Return the value with which \p I can be replaced for specific \p ICV.
  std::optional<Value *> getReplacementValue(InternalControlVar ICV,
                                             const Instruction *I,
                                             Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallVector<const Instruction *, 16> Worklist;
    SmallPtrSet<const Instruction *, 16> Visited;
    Worklist.push_back(I);

    std::optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      const Instruction *CurrInst = Worklist.pop_back_val();
      if (!Visited.insert(CurrInst).second)
        continue;

      const BasicBlock *CurrBB = CurrInst->getParent();

      // Go up and look for all potential setters/calls that might change the
      // ICV.
      while ((CurrInst = CurrInst->getPrevNode())) {
        if (ValuesMap.count(CurrInst)) {
          std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          // Unknown value, track new.
          if (!ReplVal) {
            ReplVal = NewReplVal;
            break;
          }

          // If we found a new value, we can't know the ICV value anymore.
          if (NewReplVal)
            if (ReplVal != NewReplVal)
              return nullptr;

          break;
        }

        std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
        if (!NewReplVal)
          continue;

        // Unknown value, track new.
        if (!ReplVal) {
          ReplVal = NewReplVal;
          break;
        }

        // We found a new value, we can't know the ICV value anymore.
        if (ReplVal != NewReplVal)
          return nullptr;
      }

      // If we are in the same BB and we have a value, we are done.
      if (CurrBB == I->getParent() && ReplVal)
        return ReplVal;

      // Go through all predecessors and add terminators for analysis.
      for (const BasicBlock *Pred : predecessors(CurrBB))
        if (const Instruction *Terminator = Pred->getTerminator())
          Worklist.push_back(Terminator);
    }

    return ReplVal;
  }
};

struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with a better string.
  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunctionReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program points.
  EnumeratedArray<std::optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the assumed unique ICV value if a single candidate is found.
  std::optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto *ICVTrackingAA = A.getAAFor<AAICVTrackerFunction>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> UniqueICVValue;

      auto CheckReturnInst = [&](Instruction &I) {
        std::optional<Value *> NewReplVal =
            ICVTrackingAA->getReplacementValue(ICV, &I, A);

        // If we found a second ICV value there is no unique returned value.
        if (UniqueICVValue && UniqueICVValue != NewReplVal)
          return false;

        UniqueICVValue = NewReplVal;

        return true;
      };

      bool UsedAssumedInformation = false;
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     UsedAssumedInformation,
                                     /* CheckBBLivenessOnly */ true))
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)
        continue;

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }
};

struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  void initialize(Attributor &A) override {
    assert(getAnchorScope() && "Expected anchor function");

    // We only initialize this AA for getters, so we need to know which ICV it
    // gets.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    for (InternalControlVar ICV : TrackableICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;
        return;
      }
    }

    /// Unknown ICV.
    indicatePessimisticFixpoint();
  }

  ChangeStatus manifest(Attributor &A) override {
    if (!ReplVal || !*ReplVal)
      return ChangeStatus::UNCHANGED;

    A.changeAfterManifest(IRPosition::inst(*getCtxI()), **ReplVal);
    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;
  }

  // FIXME: come up with a better string.
  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSite";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  InternalControlVar AssociatedICV;
  std::optional<Value *> ReplVal;

  ChangeStatus updateImpl(Attributor &A) override {
    const auto *ICVTrackingAA = A.getAAFor<AAICVTrackerFunction>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    std::optional<Value *> NewReplVal =
        ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;
  }

  // Return the value with which the associated value can be replaced for the
  // specific \p ICV.
  std::optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ReplVal;
  }
};

struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with a better string.
  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSiteReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program points.
  EnumeratedArray<std::optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which the associated value can be replaced for the
  /// specific \p ICV.
  std::optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::returned(*getAssociatedFunction()),
        DepClassTy::REQUIRED);

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> NewReplVal =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)
        continue;

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
    }
    return Changed;
  }
};

2681

2682

2683static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {

2685 return true;

2688 return false;

2689 return hasFunctionEndAsUniqueSuccessor(Successor);

2690}

2691

2692struct AAExecutionDomainFunction : public AAExecutionDomain {

2693 AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)

2694 : AAExecutionDomain(IRP, A) {}

2695

2696 ~AAExecutionDomainFunction() override { delete RPOT; }

2697

2698 void initialize(Attributor &A) override {

2700 assert(F && "Expected anchor function");

2701 RPOT = new ReversePostOrderTraversal<Function *>(F);

2702 }

2703

2704 const std::string getAsStr(Attributor *) const override {

2705 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;

2706 for (auto &It : BEDMap) {

2707 if (!It.getFirst())

2708 continue;

2709 TotalBlocks++;

2710 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;

2711 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&

2712 It.getSecond().IsReachingAlignedBarrierOnly;

2713 }

2714 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +

2715 std::to_string(AlignedBlocks) + " of " +

2716 std::to_string(TotalBlocks) +

2717 " executed by initial thread / aligned";

2718 }

2719

2720

2721 void trackStatistics() const override {}

2722

2725 for (const BasicBlock &BB : *getAnchorScope()) {

2726 if (!isExecutedByInitialThreadOnly(BB))

2727 continue;

2728 dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "

2729 << BB.getName() << " is executed by a single thread.\n";

2730 }

2731 });

2732

2734

2737

2738 SmallPtrSet<CallBase *, 16> DeletedBarriers;

2739 auto HandleAlignedBarrier = [&](CallBase *CB) {

2740 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];

2741 if (!ED.IsReachedFromAlignedBarrierOnly ||

2742 ED.EncounteredNonLocalSideEffect)

2743 return;

2744 if (!ED.EncounteredAssumes.empty() && A.isModulePass())

2745 return;

2746

2747

2748

2749

2750

2751

2752

2753

2754 if (CB) {

2755 DeletedBarriers.insert(CB);

2756 A.deleteAfterManifest(*CB);

2757 ++NumBarriersEliminated;

2758 Changed = ChangeStatus::CHANGED;

2759 } else if (!ED.AlignedBarriers.empty()) {

2760 Changed = ChangeStatus::CHANGED;

2762 ED.AlignedBarriers.end());

2763 SmallSetVector<CallBase *, 16> Visited;

2764 while (!Worklist.empty()) {

2766 if (!Visited.insert(LastCB))

2767 continue;

2768 if (LastCB->getFunction() != getAnchorScope())

2769 continue;

2770 if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))

2771 continue;

2772 if (!DeletedBarriers.count(LastCB)) {

2773 ++NumBarriersEliminated;

2774 A.deleteAfterManifest(*LastCB);

2775 continue;

2776 }

2777

2778

2779

2780 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];

2781 Worklist.append(LastED.AlignedBarriers.begin(),

2782 LastED.AlignedBarriers.end());

2783 }

2784 }

2785

2786

2787

2788 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))

2789 for (auto *AssumeCB : ED.EncounteredAssumes)

2790 A.deleteAfterManifest(*AssumeCB);

2791 };

2792

2793 for (auto *CB : AlignedBarriers)

2794 HandleAlignedBarrier(CB);

2795

2796

2798 HandleAlignedBarrier(nullptr);

2799

2801 }

2802

2803 bool isNoOpFence(const FenceInst &FI) const override {

2804 return getState().isValidState() && !NonNoOpFences.count(&FI);

2805 }

2806

2807

2808

2809 void

2810 mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,

2811 const ExecutionDomainTy &PredED);

2812

2813

2814

2815

2816 bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,

2817 const ExecutionDomainTy &PredED,

2818 bool InitialEdgeOnly = false);

2819

2820

2821 bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);

2822

2823

2824 ChangeStatus updateImpl(Attributor &A) override;

2825

2826

2827

2828 bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {

2829 if (!isValidState())

2830 return false;

2831 assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");

2832 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;

2833 }

2834

2835 bool isExecutedInAlignedRegion(Attributor &A,

2836 const Instruction &I) const override {

2837 assert(I.getFunction() == getAnchorScope() &&

2838 "Instruction is out of scope!");

2839 if (!isValidState())

2840 return false;

2841

2842 bool ForwardIsOk = true;

2844

2845

2846 CurI = &I;

2847 do {

2849 if (!CB)

2850 continue;

2851 if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))

2852 return true;

2853 const auto &It = CEDMap.find({CB, PRE});

2854 if (It == CEDMap.end())

2855 continue;

2856 if (!It->getSecond().IsReachingAlignedBarrierOnly)

2857 ForwardIsOk = false;

2858 break;

2860

2861 if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)

2862 ForwardIsOk = false;

2863

2864

2865 CurI = &I;

2866 do {

2868 if (!CB)

2869 continue;

2870 if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))

2871 return true;

2872 const auto &It = CEDMap.find({CB, POST});

2873 if (It == CEDMap.end())

2874 continue;

2875 if (It->getSecond().IsReachedFromAlignedBarrierOnly)

2876 break;

2877 return false;

2879

2880

2881

2882 if (!ForwardIsOk)

2883 return false;

2884

2885 if (!CurI) {

2888 return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;

2890 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;

2891 })) {

2892 return false;

2893 }

2894 }

2895

2896

2897 return true;

2898 }

2899

2900 ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {

2901 assert(isValidState() &&

2902 "No request should be made against an invalid state!");

2903 return BEDMap.lookup(&BB);

2904 }

2905 std::pair<ExecutionDomainTy, ExecutionDomainTy>

2906 getExecutionDomain(const CallBase &CB) const override {

2907 assert(isValidState() &&

2908 "No request should be made against an invalid state!");

2909 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};

2910 }

2911 ExecutionDomainTy getFunctionExecutionDomain() const override {

2912 assert(isValidState() &&

2913 "No request should be made against an invalid state!");

2914 return InterProceduralED;

2915 }

2916

2917

2918

2919

2920 static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,

2921 BasicBlock &SuccessorBB) {

2922 if (Edge || Edge->isConditional())

2923 return false;

2924 if (Edge->getSuccessor(0) != &SuccessorBB)

2925 return false;

2926

2928 if (!Cmp || Cmp->isTrueWhenEqual() || Cmp->isEquality())

2929 return false;

2930

2932 if (C)

2933 return false;

2934

2935

2936 if (C->isAllOnesValue()) {

2938 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

2939 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];

2940 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;

2941 if (!CB)

2942 return false;

2943 ConstantStruct *KernelEnvC =

2945 ConstantInt *ExecModeC =

2946 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);

2948 }

2949

2950 if (C->isZero()) {

2951

2953 if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)

2954 return true;

2955

2956

2958 if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)

2959 return true;

2960 }

2961

2962 return false;

2963 };

2964

2965

2966 ExecutionDomainTy InterProceduralED;

2967

2968 enum Direction { PRE = 0, POST = 1 };

2969

2970 DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;

2971 DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>

2972 CEDMap;

2973 SmallSetVector<CallBase *, 16> AlignedBarriers;

2974

2975 ReversePostOrderTraversal<Function *> *RPOT = nullptr;

2976

2977

2978 static bool setAndRecord(bool &R, bool V) {

2979 bool Eq = (R == V);

2980 R = V;

2981 return !Eq;

2982 }

2983

2984

2985

2986 SmallPtrSet<const FenceInst *, 8> NonNoOpFences;

2987};

2988

2989void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(

2990 Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {

2991 for (auto *EA : PredED.EncounteredAssumes)

2992 ED.addAssumeInst(A, *EA);

2993

2994 for (auto *AB : PredED.AlignedBarriers)

2995 ED.addAlignedBarrier(A, *AB);

2996}

2997

2998bool AAExecutionDomainFunction::mergeInPredecessor(

2999 Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,

3000 bool InitialEdgeOnly) {

3001

3004 setAndRecord(ED.IsExecutedByInitialThreadOnly,

3005 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&

3006 ED.IsExecutedByInitialThreadOnly));

3007

3008 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,

3009 ED.IsReachedFromAlignedBarrierOnly &&

3010 PredED.IsReachedFromAlignedBarrierOnly);

3011 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,

3012 ED.EncounteredNonLocalSideEffect |

3013 PredED.EncounteredNonLocalSideEffect);

3014

3015 if (ED.IsReachedFromAlignedBarrierOnly)

3016 mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);

3017 else

3018 ED.clearAssumeInstAndAlignedBarriers();

3020}

3021

3022bool AAExecutionDomainFunction::handleCallees(Attributor &A,

3023 ExecutionDomainTy &EntryBBED) {

3025 auto PredForCallSite = [&](AbstractCallSite ACS) {

3026 const auto *EDAA = A.getAAFor(

3028 DepClassTy::OPTIONAL);

3029 if (!EDAA || !EDAA->getState().isValidState())

3030 return false;

3032 EDAA->getExecutionDomain(*cast(ACS.getInstruction())));

3033 return true;

3034 };

3035

3036 ExecutionDomainTy ExitED;

3037 bool AllCallSitesKnown;

3038 if (A.checkForAllCallSites(PredForCallSite, *this,

3039 true,

3040 AllCallSitesKnown)) {

3041 for (const auto &[CSInED, CSOutED] : CallSiteEDs) {

3042 mergeInPredecessor(A, EntryBBED, CSInED);

3043 ExitED.IsReachingAlignedBarrierOnly &=

3044 CSOutED.IsReachingAlignedBarrierOnly;

3045 }

3046

3047 } else {

3048

3049

3051 EntryBBED.IsExecutedByInitialThreadOnly = false;

3052 EntryBBED.IsReachedFromAlignedBarrierOnly = true;

3053 EntryBBED.EncounteredNonLocalSideEffect = false;

3054 ExitED.IsReachingAlignedBarrierOnly = false;

3055 } else {

3056 EntryBBED.IsExecutedByInitialThreadOnly = false;

3057 EntryBBED.IsReachedFromAlignedBarrierOnly = false;

3058 EntryBBED.EncounteredNonLocalSideEffect = true;

3059 ExitED.IsReachingAlignedBarrierOnly = false;

3060 }

3061 }

3062

3064 auto &FnED = BEDMap[nullptr];

3065 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,

3066 FnED.IsReachedFromAlignedBarrierOnly &

3067 EntryBBED.IsReachedFromAlignedBarrierOnly);

3068 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,

3069 FnED.IsReachingAlignedBarrierOnly &

3070 ExitED.IsReachingAlignedBarrierOnly);

3071 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,

3072 EntryBBED.IsExecutedByInitialThreadOnly);

3074}

3075

3076ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {

3077

3079

3080

3081

3082

3083 auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {

3084 Changed |= AlignedBarriers.insert(&CB);

3085

3086 auto &CallInED = CEDMap[{&CB, PRE}];

3087 Changed |= mergeInPredecessor(A, CallInED, ED);

3088 CallInED.IsReachingAlignedBarrierOnly = true;

3089

3090 ED.EncounteredNonLocalSideEffect = false;

3091 ED.IsReachedFromAlignedBarrierOnly = true;

3092

3093 ED.clearAssumeInstAndAlignedBarriers();

3094 ED.addAlignedBarrier(A, CB);

3095 auto &CallOutED = CEDMap[{&CB, POST}];

3096 Changed |= mergeInPredecessor(A, CallOutED, ED);

3097 };

3098

3099 auto *LivenessAA =

3100 A.getAAFor(*this, getIRPosition(), DepClassTy::OPTIONAL);

3101

3103 BasicBlock &EntryBB = F->getEntryBlock();

3105

3107 for (auto &RIt : *RPOT) {

3109

3110 bool IsEntryBB = &BB == &EntryBB;

3111

3112

3113 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;

3114 bool IsExplicitlyAligned = IsEntryBB && IsKernel;

3115 ExecutionDomainTy ED;

3116

3117 if (IsEntryBB) {

3118 Changed |= handleCallees(A, ED);

3119 } else {

3120

3121

3122 if (LivenessAA && LivenessAA->isAssumedDead(&BB))

3123 continue;

3124

3126 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))

3127 continue;

3128 bool InitialEdgeOnly = isInitialThreadOnlyEdge(

3130 mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);

3131 }

3132 }

3133

3134

3135

3136 for (Instruction &I : BB) {

3137 bool UsedAssumedInformation;

3138 if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,

3139 false, DepClassTy::OPTIONAL,

3140 true))

3141 continue;

3142

3143

3144

3147 ED.addAssumeInst(A, *AI);

3148 continue;

3149 }

3150

3151 if (II->isAssumeLikeIntrinsic())

3152 continue;

3153 }

3154

3156 if (!ED.EncounteredNonLocalSideEffect) {

3157

3158 if (ED.IsReachedFromAlignedBarrierOnly)

3159 continue;

3160

3161

3163 case AtomicOrdering::NotAtomic:

3164 continue;

3165 case AtomicOrdering::Unordered:

3166 continue;

3167 case AtomicOrdering::Monotonic:

3168 continue;

3169 case AtomicOrdering::Acquire:

3170 break;

3171 case AtomicOrdering::Release:

3172 continue;

3173 case AtomicOrdering::AcquireRelease:

3174 break;

3175 case AtomicOrdering::SequentiallyConsistent:

3176 break;

3177 };

3178 }

3179 NonNoOpFences.insert(FI);

3180 }

3181

3184 bool IsAlignedBarrier =

3185 !IsNoSync && CB &&

3187

3188 AlignedBarrierLastInBlock &= IsNoSync;

3189 IsExplicitlyAligned &= IsNoSync;

3190

3191

3192

3193

3194 if (CB) {

3195 if (IsAlignedBarrier) {

3196 HandleAlignedBarrier(*CB, ED);

3197 AlignedBarrierLastInBlock = true;

3198 IsExplicitlyAligned = true;

3199 continue;

3200 }

3201

3202

3204 if (!ED.EncounteredNonLocalSideEffect &&

3206 ED.EncounteredNonLocalSideEffect = true;

3207 if (!IsNoSync) {

3208 ED.IsReachedFromAlignedBarrierOnly = false;

3210 }

3211 continue;

3212 }

3213

3214

3215

3216 auto &CallInED = CEDMap[{CB, PRE}];

3217 Changed |= mergeInPredecessor(A, CallInED, ED);

3218

3219

3220

3221

3223 if (!IsNoSync && Callee && Callee->isDeclaration()) {

3224 const auto *EDAA = A.getAAFor(

3226 if (EDAA && EDAA->getState().isValidState()) {

3227 const auto &CalleeED = EDAA->getFunctionExecutionDomain();

3228 ED.IsReachedFromAlignedBarrierOnly =

3229 CalleeED.IsReachedFromAlignedBarrierOnly;

3230 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;

3231 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)

3232 ED.EncounteredNonLocalSideEffect |=

3233 CalleeED.EncounteredNonLocalSideEffect;

3234 else

3235 ED.EncounteredNonLocalSideEffect =

3236 CalleeED.EncounteredNonLocalSideEffect;

3237 if (!CalleeED.IsReachingAlignedBarrierOnly) {

3239 setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

3241 }

3242 if (CalleeED.IsReachedFromAlignedBarrierOnly)

3243 mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);

3244 auto &CallOutED = CEDMap[{CB, POST}];

3245 Changed |= mergeInPredecessor(A, CallOutED, ED);

3246 continue;

3247 }

3248 }

3249 if (!IsNoSync) {

3250 ED.IsReachedFromAlignedBarrierOnly = false;

3251 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

3253 }

3254 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;

3256 auto &CallOutED = CEDMap[{CB, POST}];

3257 Changed |= mergeInPredecessor(A, CallOutED, ED);

3258 }

3259

3260 if (I.mayHaveSideEffects() && I.mayReadFromMemory())

3261 continue;

3262

3263

3264

3265 if (CB) {

3266 const auto *MemAA = A.getAAFor(

3268

3273 };

3274 if (MemAA && MemAA->getState().isValidState() &&

3275 MemAA->checkForAllAccessesToMemoryKind(

3277 continue;

3278 }

3279

3280 auto &InfoCache = A.getInfoCache();

3281 if (I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))

3282 continue;

3283

3285 if (LI->hasMetadata(LLVMContext::MD_invariant_load))

3286 continue;

3287

3288 if (!ED.EncounteredNonLocalSideEffect &&

3290 ED.EncounteredNonLocalSideEffect = true;

3291 }

3292

3293 bool IsEndAndNotReachingAlignedBarriersOnly = false;

3295 !BB.getTerminator()->getNumSuccessors()) {

3296

3297 Changed |= mergeInPredecessor(A, InterProceduralED, ED);

3298

3299 auto &FnED = BEDMap[nullptr];

3300 if (IsKernel && !IsExplicitlyAligned)

3301 FnED.IsReachingAlignedBarrierOnly = false;

3302 Changed |= mergeInPredecessor(A, FnED, ED);

3303

3304 if (!FnED.IsReachingAlignedBarrierOnly) {

3305 IsEndAndNotReachingAlignedBarriersOnly = true;

3306 SyncInstWorklist.push_back(BB.getTerminator());

3307 auto &BBED = BEDMap[&BB];

3308 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);

3309 }

3310 }

3311

3312 ExecutionDomainTy &StoredED = BEDMap[&BB];

3313 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &

3314 !IsEndAndNotReachingAlignedBarriersOnly;

3315

3316

3317

3318

3319

3320 if (ED.IsExecutedByInitialThreadOnly !=

3321 StoredED.IsExecutedByInitialThreadOnly ||

3322 ED.IsReachedFromAlignedBarrierOnly !=

3323 StoredED.IsReachedFromAlignedBarrierOnly ||

3324 ED.EncounteredNonLocalSideEffect !=

3325 StoredED.EncounteredNonLocalSideEffect)

3327

3328

3329 StoredED = std::move(ED);

3330 }

3331

3332

3333

3334 SmallSetVector<BasicBlock *, 16> Visited;

3335 while (!SyncInstWorklist.empty()) {

3338 bool HitAlignedBarrierOrKnownEnd = false;

3339 while ((CurInst = CurInst->getPrevNode())) {

3341 if (!CB)

3342 continue;

3343 auto &CallOutED = CEDMap[{CB, POST}];

3344 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);

3345 auto &CallInED = CEDMap[{CB, PRE}];

3346 HitAlignedBarrierOrKnownEnd =

3347 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;

3348 if (HitAlignedBarrierOrKnownEnd)

3349 break;

3350 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

3351 }

3352 if (HitAlignedBarrierOrKnownEnd)

3353 continue;

3356 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))

3357 continue;

3358 if (!Visited.insert(PredBB))

3359 continue;

3360 auto &PredED = BEDMap[PredBB];

3361 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {

3363 SyncInstWorklist.push_back(PredBB->getTerminator());

3364 }

3365 }

3366 if (SyncBB != &EntryBB)

3367 continue;

3369 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);

3370 }

3371

3372 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;

3373}

3374

3375

3376

3377struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {

3378 using Base = StateWrapper<BooleanState, AbstractAttribute>;

3379 AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

3380

3381

3382 static AAHeapToShared &createForPosition(const IRPosition &IRP,

3383 Attributor &A);

3384

3385

3386 virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;

3387

3388

3389

3390 virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;

3391

3392

3393 StringRef getName() const override { return "AAHeapToShared"; }

3394

3395

3396 const char *getIdAddr() const override { return &ID; }

3397

3398

3399

3400 static bool classof(const AbstractAttribute *AA) {

3402 }

3403

3404

3405 static const char ID;

3406};

3407

3408struct AAHeapToSharedFunction : public AAHeapToShared {

3409 AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)

3410 : AAHeapToShared(IRP, A) {}

3411

3412 const std::string getAsStr(Attributor *) const override {

3413 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +

3414 " malloc calls eligible.";

3415 }

3416

3417

3418 void trackStatistics() const override {}

3419

3420

3421

3422 void findPotentialRemovedFreeCalls(Attributor &A) {

3423 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3424 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];

3425

3426 PotentialRemovedFreeCalls.clear();

3427

3428 for (CallBase *CB : MallocCalls) {

3430 for (auto *U : CB->users()) {

3432 if (C && C->getCalledFunction() == FreeRFI.Declaration)

3434 }

3435

3436 if (FreeCalls.size() != 1)

3437 continue;

3438

3439 PotentialRemovedFreeCalls.insert(FreeCalls.front());

3440 }

3441 }

3442

3443 void initialize(Attributor &A) override {

3445 indicatePessimisticFixpoint();

3446 return;

3447 }

3448

3449 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3450 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

3451 if (!RFI.Declaration)

3452 return;

3453

3455 [](const IRPosition &, const AbstractAttribute *,

3456 bool &) -> std::optional<Value *> { return nullptr; };

3457

3459 for (User *U : RFI.Declaration->users())

3462 continue;

3463 MallocCalls.insert(CB);

3465 SCB);

3466 }

3467

3468 findPotentialRemovedFreeCalls(A);

3469 }

3470

3471 bool isAssumedHeapToShared(CallBase &CB) const override {

3472 return isValidState() && MallocCalls.count(&CB);

3473 }

3474

3475 bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {

3476 return isValidState() && PotentialRemovedFreeCalls.count(&CB);

3477 }

3478

3480 if (MallocCalls.empty())

3481 return ChangeStatus::UNCHANGED;

3482

3483 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3484 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];

3485

3488 DepClassTy::OPTIONAL);

3489

3491 for (CallBase *CB : MallocCalls) {

3492

3493 if (HS && HS->isAssumedHeapToStack(*CB))

3494 continue;

3495

3496

3498 for (auto *U : CB->users()) {

3500 if (C && C->getCalledFunction() == FreeCall.Declaration)

3502 }

3503 if (FreeCalls.size() != 1)

3504 continue;

3505

3507

3508 if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {

3510 << " with shared memory."

3511 << " Shared memory usage is limited to "

3513 continue;

3514 }

3515

3516 LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB

3517 << " with " << AllocSize->getZExtValue()

3518 << " bytes of shared memory\n");

3519

3520

3521

3523 Type *Int8Ty = Type::getInt8Ty(M->getContext());

3524 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());

3525 auto *SharedMem = new GlobalVariable(

3529 static_cast<unsigned>(AddressSpace::Shared));

3531 SharedMem, PointerType::getUnqual(M->getContext()));

3532

3533 auto Remark = [&](OptimizationRemark OR) {

3534 return OR << "Replaced globalized variable with "

3535 << ore::NV("SharedMemory", AllocSize->getZExtValue())

3536 << (AllocSize->isOne() ? " byte " : " bytes ")

3537 << "of shared memory.";

3538 };

3539 A.emitRemark(CB, "OMP111", Remark);

3540

3541 MaybeAlign Alignment = CB->getRetAlign();

3543 "HeapToShared on allocation without alignment attribute");

3544 SharedMem->setAlignment(*Alignment);

3545

3547 A.deleteAfterManifest(*CB);

3548 A.deleteAfterManifest(*FreeCalls.front());

3549

3550 SharedMemoryUsed += AllocSize->getZExtValue();

3551 NumBytesMovedToSharedMemory = SharedMemoryUsed;

3552 Changed = ChangeStatus::CHANGED;

3553 }

3554

3556 }

3557

3558 ChangeStatus updateImpl(Attributor &A) override {

3559 if (MallocCalls.empty())

3560 return indicatePessimisticFixpoint();

3561 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3562 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

3563 if (!RFI.Declaration)

3564 return ChangeStatus::UNCHANGED;

3565

3567

3568 auto NumMallocCalls = MallocCalls.size();

3569

3570

3571 for (User *U : RFI.Declaration->users()) {

3573 if (CB->getCaller() != F)

3574 continue;

3575 if (!MallocCalls.count(CB))

3576 continue;

3578 MallocCalls.remove(CB);

3579 continue;

3580 }

3581 const auto *ED = A.getAAFor(

3583 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))

3584 MallocCalls.remove(CB);

3585 }

3586 }

3587

3588 findPotentialRemovedFreeCalls(A);

3589

3590 if (NumMallocCalls != MallocCalls.size())

3591 return ChangeStatus::CHANGED;

3592

3593 return ChangeStatus::UNCHANGED;

3594 }

3595

3596

3597 SmallSetVector<CallBase *, 4> MallocCalls;

3598

3599 SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;

3600

3601 unsigned SharedMemoryUsed = 0;

3602};

3603

3604struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {

3605 using Base = StateWrapper<KernelInfoState, AbstractAttribute>;

3606 AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

3607

3608

3609

3610 static bool requiresCalleeForCallBase() { return false; }

3611

3612

3613 void trackStatistics() const override {}

3614

3615

3616 const std::string getAsStr(Attributor *) const override {

3617 if (!isValidState())

3618 return "";

3619 return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"

3620 : "generic") +

3621 std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"

3622 : "") +

3623 std::string(" #PRs: ") +

3624 (ReachedKnownParallelRegions.isValidState()

3625 ? std::to_string(ReachedKnownParallelRegions.size())

3626 : "") +

3627 ", #Unknown PRs: " +

3628 (ReachedUnknownParallelRegions.isValidState()

3629 ? std::to_string(ReachedUnknownParallelRegions.size())

3630 : "") +

3631 ", #Reaching Kernels: " +

3632 (ReachingKernelEntries.isValidState()

3633 ? std::to_string(ReachingKernelEntries.size())

3634 : "") +

3635 ", #ParLevels: " +

3636 (ParallelLevels.isValidState()

3637 ? std::to_string(ParallelLevels.size())

3638 : "") +

3639 ", NestedPar: " + (NestedParallelism ? "yes" : "no");

3640 }

3641

3642

3643 static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);

3644

3645

3646 StringRef getName() const override { return "AAKernelInfo"; }

3647

3648

3649 const char *getIdAddr() const override { return &ID; }

3650

3651

3652 static bool classof(const AbstractAttribute *AA) {

3654 }

3655

3656 static const char ID;

3657};

3658

3659

3660

3661struct AAKernelInfoFunction : AAKernelInfo {

3662 AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)

3663 : AAKernelInfo(IRP, A) {}

3664

3665 SmallPtrSet<Instruction *, 4> GuardedInstructions;

3666

3667 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {

3668 return GuardedInstructions;

3669 }

3670

3671 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {

3673 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});

3674 assert(NewKernelEnvC && "Failed to create new kernel environment");

3676 }

3677

3678#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \

3679 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \

3680 ConstantStruct *ConfigC = \

3681 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \

3682 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \

3683 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \

3684 assert(NewConfigC && "Failed to create new configuration environment"); \

3685 setConfigurationOfKernelEnvironment(cast(NewConfigC)); \

3686 }

3687

3695

3696#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER

3697

3698

3700

3701

3702

3703 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3704

3705 Function *Fn = getAnchorScope();

3706

3707 OMPInformationCache::RuntimeFunctionInfo &InitRFI =

3708 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];

3709 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =

3710 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];

3711

3712

3713

3714 auto StoreCallBase = [](Use &U,

3715 OMPInformationCache::RuntimeFunctionInfo &RFI,

3717 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);

3719 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");

3721 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");

3722 Storage = CB;

3723 return false;

3724 };

3725 InitRFI.foreachUse(

3727 StoreCallBase(U, InitRFI, KernelInitCB);

3728 return false;

3729 },

3730 Fn);

3731 DeinitRFI.foreachUse(

3733 StoreCallBase(U, DeinitRFI, KernelDeinitCB);

3734 return false;

3735 },

3736 Fn);

3737

3738

3739 if (!KernelInitCB || !KernelDeinitCB)

3740 return;

3741

3742

3743 ReachingKernelEntries.insert(Fn);

3744 IsKernelEntry = true;

3745

3746 KernelEnvC =

3750

3752 KernelConfigurationSimplifyCB =

3754 bool &UsedAssumedInformation) -> std::optional<Constant *> {

3755 if (!isAtFixpoint()) {

3756 if (AA)

3757 return nullptr;

3758 UsedAssumedInformation = true;

3760 }

3761 return KernelEnvC;

3762 };

3763

3764 A.registerGlobalVariableSimplificationCallback(

3765 *KernelEnvGV, KernelConfigurationSimplifyCB);

3766

3767

3768 bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(

3769 {OMPRTL___kmpc_get_hardware_thread_id_in_block,

3770 OMPRTL___kmpc_barrier_simple_spmd});

3771

3772

3774 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);

3775 ConstantInt *AssumedExecModeC = ConstantInt::get(

3779 SPMDCompatibilityTracker.indicateOptimisticFixpoint();

3781

3782

3783 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

3784 else

3785 setExecModeOfKernelEnvironment(AssumedExecModeC);

3786

3790 OpenMPIRBuilder::readThreadBoundsForKernel(T, *Fn);

3791 if (MinThreads)

3792 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));

3794 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxThreads));

3795 auto [MinTeams, MaxTeams] =

3796 OpenMPIRBuilder::readTeamBoundsForKernel(T, *Fn);

3797 if (MinTeams)

3798 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));

3799 if (MaxTeams)

3800 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));

3801

3803 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);

3804 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(

3805 MayUseNestedParallelismC->getIntegerType(), NestedParallelism);

3806 setMayUseNestedParallelismOfKernelEnvironment(

3807 AssumedMayUseNestedParallelismC);

3808

3811 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(

3812 KernelEnvC);

3813 ConstantInt *AssumedUseGenericStateMachineC =

3814 ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);

3815 setUseGenericStateMachineOfKernelEnvironment(

3816 AssumedUseGenericStateMachineC);

3817 }

3818

3819

3822 if (!OMPInfoCache.RFIs[RFKind].Declaration)

3823 return;

3824 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);

3825 };

3826

3827

3828 auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,

3830 if (QueryingAA) {

3832 }

3833 return true;

3834 };

3835

3838

3839

3840

3841

3842

3843

3844

3845 if (SPMDCompatibilityTracker.isValidState())

3846 return AddDependence(A, this, QueryingAA);

3847

3848 if (!ReachedKnownParallelRegions.isValidState())

3849 return AddDependence(A, this, QueryingAA);

3850 return false;

3851 };

3852

3853

3855 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,

3856 CustomStateMachineUseCB);

3857 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);

3858 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,

3859 CustomStateMachineUseCB);

3860 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,

3861 CustomStateMachineUseCB);

3862 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,

3863 CustomStateMachineUseCB);

3864 }

3865

3866

3867 if (SPMDCompatibilityTracker.isAtFixpoint())

3868 return;

3869

3872

3873

3874 if (!SPMDCompatibilityTracker.isValidState())

3875 return AddDependence(A, this, QueryingAA);

3876 return false;

3877 };

3878 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,

3879 HWThreadIdUseCB);

3880

3883

3884

3885

3886

3887 if (!SPMDCompatibilityTracker.isValidState())

3888 return AddDependence(A, this, QueryingAA);

3889 if (SPMDCompatibilityTracker.empty())

3890 return AddDependence(A, this, QueryingAA);

3891 if (!mayContainParallelRegion())

3892 return AddDependence(A, this, QueryingAA);

3893 return false;

3894 };

3895 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);

3896 }

3897

3898

3899 static std::string sanitizeForGlobalName(std::string S) {

3900 std::replace_if(

3901 S.begin(), S.end(),

3902 [](const char C) {

3903 return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||

3904 (C >= '0' && C <= '9') || C == '_');

3905 },

3906 '.');

3907 return S;

3908 }

3909

3910

3911

3913

3914

3915 if (!KernelInitCB || !KernelDeinitCB)

3916 return ChangeStatus::UNCHANGED;

3917

3919

3920 bool HasBuiltStateMachine = true;

3921 if (!changeToSPMDMode(A, Changed)) {

3923 HasBuiltStateMachine = buildCustomStateMachine(A, Changed);

3924 else

3925 HasBuiltStateMachine = false;

3926 }

3927

3928

3929 ConstantStruct *ExistingKernelEnvC =

3931 ConstantInt *OldUseGenericStateMachineVal =

3932 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(

3933 ExistingKernelEnvC);

3934 if (!HasBuiltStateMachine)

3935 setUseGenericStateMachineOfKernelEnvironment(

3936 OldUseGenericStateMachineVal);

3937

3938

3939 GlobalVariable *KernelEnvGV =

3943 Changed = ChangeStatus::CHANGED;

3944 }

3945

3947 }

3948

3949 void insertInstructionGuardsHelper(Attributor &A) {

3950 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

3951

3952 auto CreateGuardedRegion = [&](Instruction *RegionStartI,

3954 LoopInfo *LI = nullptr;

3955 DominatorTree *DT = nullptr;

3956 MemorySSAUpdater *MSU = nullptr;

3957 using InsertPointTy = OpenMPIRBuilder::InsertPointTy;

3958

3962

3963

3964

3965

3966

3967

3968

3969

3970

3971

3972

3973

3974

3975

3976

3977

3978

3979

3980

3981

3982

3983

3984

3986 DT, LI, MSU, "region.guarded.end");

3989 MSU, "region.barrier");

3992 DT, LI, MSU, "region.exit");

3994 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU, "region.guarded");

3995

3997 "Expected a different CFG");

3998

4000 ParentBB, ParentBB->getTerminator(), DT, LI, MSU, "region.check.tid");

4001

4002

4003 A.registerManifestAddedBasicBlock(*RegionEndBB);

4004 A.registerManifestAddedBasicBlock(*RegionBarrierBB);

4005 A.registerManifestAddedBasicBlock(*RegionExitBB);

4006 A.registerManifestAddedBasicBlock(*RegionStartBB);

4007 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);

4008

4009 bool HasBroadcastValues = false;

4010

4011

4012 for (Instruction &I : *RegionStartBB) {

4014 for (Use &U : I.uses()) {

4016 if (UsrI.getParent() != RegionStartBB)

4018 }

4019

4020 if (OutsideUses.empty())

4021 continue;

4022

4023 HasBroadcastValues = true;

4024

4025

4026

4027 auto *SharedMem = new GlobalVariable(

4028 M, I.getType(), false,

4030 sanitizeForGlobalName(

4031 (I.getName() + ".guarded.output.alloc").str()),

4033 static_cast<unsigned>(AddressSpace::Shared));

4034

4035

4036 new StoreInst(&I, SharedMem,

4038

4039 LoadInst *LoadI = new LoadInst(

4040 I.getType(), SharedMem, I.getName() + ".guarded.output.load",

4042

4043

4044 for (Use *U : OutsideUses)

4045 A.changeUseAfterManifest(*U, *LoadI);

4046 }

4047

4048 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4049

4050

4053 OpenMPIRBuilder::LocationDescription Loc(

4054 InsertPointTy(ParentBB, ParentBB->end()), DL);

4055 OMPInfoCache.OMPBuilder.updateToLocation(Loc);

4056 uint32_t SrcLocStrSize;

4057 auto *SrcLocStr =

4058 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);

4060 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);

4062

4063

4065 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(

4066 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->end()), DL);

4067 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);

4068 FunctionCallee HardwareTidFn =

4069 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4070 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);

4071 CallInst *Tid =

4072 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});

4074 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);

4075 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);

4076 OMPInfoCache.OMPBuilder.Builder

4077 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)

4078 ->setDebugLoc(DL);

4079

4080

4081

4082 FunctionCallee BarrierFn =

4083 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4084 M, OMPRTL___kmpc_barrier_simple_spmd);

4085 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(

4088 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});

4090 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);

4091

4092

4093 if (HasBroadcastValues) {

4096 RegionBarrierBB->getTerminator()->getIterator());

4098 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);

4099 }

4100 };

4101

4102 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

4103 SmallPtrSet<BasicBlock *, 8> Visited;

4104 for (Instruction *GuardedI : SPMDCompatibilityTracker) {

4105 BasicBlock *BB = GuardedI->getParent();

4106 if (!Visited.insert(BB).second)

4107 continue;

4108

4112 while (++IP != IPEnd) {

4113 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())

4114 continue;

4116 if (OpenMPOpt::getCallIfRegularCall(*I, &AllocSharedRFI))

4117 continue;

4118 if (I->user_empty() || !SPMDCompatibilityTracker.contains(I)) {

4119 LastEffect = nullptr;

4120 continue;

4121 }

4122 if (LastEffect)

4123 Reorders.push_back({I, LastEffect});

4124 LastEffect = &*IP;

4125 }

4126 for (auto &Reorder : Reorders)

4127 Reorder.first->moveBefore(Reorder.second->getIterator());

4128 }

4129

4131

4132 for (Instruction *GuardedI : SPMDCompatibilityTracker) {

4134 auto *CalleeAA = A.lookupAAFor(

4136 DepClassTy::NONE);

4137 assert(CalleeAA != nullptr && "Expected Callee AAKernelInfo");

4139

4140 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))

4141 continue;

4142

4143 Instruction *GuardedRegionStart = nullptr, *GuardedRegionEnd = nullptr;

4144 for (Instruction &I : *BB) {

4145

4146

4147 if (SPMDCompatibilityTracker.contains(&I)) {

4148 CalleeAAFunction.getGuardedInstructions().insert(&I);

4149 if (GuardedRegionStart)

4150 GuardedRegionEnd = &I;

4151 else

4152 GuardedRegionStart = GuardedRegionEnd = &I;

4153

4154 continue;

4155 }

4156

4157

4158

4159 if (GuardedRegionStart) {

4161 std::make_pair(GuardedRegionStart, GuardedRegionEnd));

4162 GuardedRegionStart = nullptr;

4163 GuardedRegionEnd = nullptr;

4164 }

4165 }

4166 }

4167

4168 for (auto &GR : GuardedRegions)

4169 CreateGuardedRegion(GR.first, GR.second);

4170 }

4171

4172 void forceSingleThreadPerWorkgroupHelper(Attributor &A) {

4173

4174

4175

4176

4177

4178

4179

4180

4181 auto &Ctx = getAnchorValue().getContext();

4183 assert(Kernel && "Expected an associated function!");

4184

4185

4188 KernelInitCB->getNextNode(), "main.thread.user_code");

4191

4192

4193 A.registerManifestAddedBasicBlock(*InitBB);

4194 A.registerManifestAddedBasicBlock(*UserCodeBB);

4195 A.registerManifestAddedBasicBlock(*ReturnBB);

4196

4197

4201

4202

4204 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4205 FunctionCallee ThreadIdInBlockFn =

4206 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4207 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);

4208

4209

4210 CallInst *ThreadIdInBlock =

4211 CallInst::Create(ThreadIdInBlockFn, "thread_id.in.block", InitBB);

4212 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);

4214

4215

4217 ICmpInst::Create(ICmpInst::ICmp, CmpInst::ICMP_NE, ThreadIdInBlock,

4218 ConstantInt::get(ThreadIdInBlock->getType(), 0),

4219 "thread.is_main", InitBB);

4222 }

4223

4225 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4226

4227 if (!SPMDCompatibilityTracker.isAssumed()) {

4228 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {

4229 if (!NonCompatibleI)

4230 continue;

4231

4232

4234 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))

4235 continue;

4236

4237 auto Remark = [&](OptimizationRemarkAnalysis ORA) {

4238 ORA << "Value has potential side effects preventing SPMD-mode "

4239 "execution";

4241 ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "

4242 "the called function to override";

4243 }

4244 return ORA << ".";

4245 };

4246 A.emitRemark(NonCompatibleI, "OMP121",

4248

4250 << *NonCompatibleI << "\n");

4251 }

4252

4253 return false;

4254 }

4255

4256

4257

4262 Kernel = CB->getCaller();

4263 }

4265

4266

4267 ConstantStruct *ExistingKernelEnvC =

4269 auto *ExecModeC =

4270 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);

4271 const int8_t ExecModeVal = ExecModeC->getSExtValue();

4273 return true;

4274

4275

4276 Changed = ChangeStatus::CHANGED;

4277

4278

4279

4280 if (mayContainParallelRegion())

4281 insertInstructionGuardsHelper(A);

4282 else

4283 forceSingleThreadPerWorkgroupHelper(A);

4284

4285

4286

4288 "Initially non-SPMD kernel has SPMD exec mode!");

4289 setExecModeOfKernelEnvironment(

4292

4293 ++NumOpenMPTargetRegionKernelsSPMD;

4294

4295 auto Remark = [&](OptimizationRemark OR) {

4296 return OR << "Transformed generic-mode kernel to SPMD-mode.";

4297 };

4298 A.emitRemark(KernelInitCB, "OMP120", Remark);

4299 return true;

4300 };

4301

4303

4305 return false;

4306

4307

4308 if (!ReachedKnownParallelRegions.isValidState())

4309 return false;

4310

4311 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4312 if (!OMPInfoCache.runtimeFnsAvailable(

4313 {OMPRTL___kmpc_get_hardware_num_threads_in_block,

4314 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,

4315 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))

4316 return false;

4317

4318 ConstantStruct *ExistingKernelEnvC =

4320

4321

4322

4323

4324

4325 ConstantInt *UseStateMachineC =

4326 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(

4327 ExistingKernelEnvC);

4328 ConstantInt *ModeC =

4329 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);

4330

4331

4332

4333

4334 if (UseStateMachineC->isZero() ||

4336 return false;

4337

4338 Changed = ChangeStatus::CHANGED;

4339

4340

4341 setUseGenericStateMachineOfKernelEnvironment(

4342 ConstantInt::get(UseStateMachineC->getIntegerType(), false));

4343

4344

4345

4346

4347

4348 if (!mayContainParallelRegion()) {

4349 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;

4350

4351 auto Remark = [&](OptimizationRemark OR) {

4352 return OR << "Removing unused state machine from generic-mode kernel.";

4353 };

4354 A.emitRemark(KernelInitCB, "OMP130", Remark);

4355

4356 return true;

4357 }

4358

4359

4360 if (ReachedUnknownParallelRegions.empty()) {

4361 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;

4362

4363 auto Remark = [&](OptimizationRemark OR) {

4364 return OR << "Rewriting generic-mode kernel with a customized state "

4365 "machine.";

4366 };

4367 A.emitRemark(KernelInitCB, "OMP131", Remark);

4368 } else {

4369 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;

4370

4371 auto Remark = [&](OptimizationRemarkAnalysis OR) {

4372 return OR << "Generic-mode kernel is executed with a customized state "

4373 "machine that requires a fallback.";

4374 };

4375 A.emitRemark(KernelInitCB, "OMP132", Remark);

4376

4377

4378 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {

4379 if (!UnknownParallelRegionCB)

4380 continue;

4381 auto Remark = [&](OptimizationRemarkAnalysis ORA) {

4382 return ORA << "Call may contain unknown parallel regions. Use "

4383 << "`[[omp::assume(\"omp_no_parallelism\")]]` to "

4384 "override.";

4385 };

4386 A.emitRemark(UnknownParallelRegionCB,

4388 }

4389 }

4390

4391

4392

4393

4394

4395

4396

4397

4398

4399

4400

4401

4402

4403

4404

4405

4406

4407

4408

4409

4410

4411

4412

4413

4414

4415

4416

4417

4418

4419

4420

4421 auto &Ctx = getAnchorValue().getContext();

4423 assert(Kernel && "Expected an associated function!");

4424

4425 BasicBlock *InitBB = KernelInitCB->getParent();

4427 KernelInitCB->getNextNode(), "thread.user_code.check");

4431 Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);

4433 Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);

4435 Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);

4436 BasicBlock *StateMachineIfCascadeCurrentBB =

4437 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",

4438 Kernel, UserCodeEntryBB);

4439 BasicBlock *StateMachineEndParallelBB =

4441 Kernel, UserCodeEntryBB);

4443 Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);

4444 A.registerManifestAddedBasicBlock(*InitBB);

4445 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);

4446 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);

4447 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);

4448 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);

4449 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);

4450 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);

4451 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);

4452 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);

4453

4454 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();

4457

4460 ConstantInt::get(KernelInitCB->getType(), -1),

4461 "thread.is_worker", InitBB);

4463 BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);

4464

4466 FunctionCallee BlockHwSizeFn =

4467 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4468 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);

4469 FunctionCallee WarpSizeFn =

4470 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4471 M, OMPRTL___kmpc_get_warp_size);

4472 CallInst *BlockHwSize =

4473 CallInst::Create(BlockHwSizeFn, "block.hw_size", IsWorkerCheckBB);

4474 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);

4476 CallInst *WarpSize =

4478 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);

4481 BlockHwSize, WarpSize, "block.size", IsWorkerCheckBB);

4483 Instruction *IsMainOrWorker = ICmpInst::Create(

4485 "thread.is_main_or_worker", IsWorkerCheckBB);

4488 IsMainOrWorker, IsWorkerCheckBB);

4489

4490

4491 const DataLayout &DL = M.getDataLayout();

4492 Type *VoidPtrTy = PointerType::getUnqual(Ctx);

4494 new AllocaInst(VoidPtrTy, DL.getAllocaAddrSpace(), nullptr,

4497

4498 OMPInfoCache.OMPBuilder.updateToLocation(

4499 OpenMPIRBuilder::LocationDescription(

4500 IRBuilder<>::InsertPoint(StateMachineBeginBB,

4501 StateMachineBeginBB->end()),

4502 DLoc));

4503

4504 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);

4505 Value *GTid = KernelInitCB;

4506

4507 FunctionCallee BarrierFn =

4508 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4509 M, OMPRTL___kmpc_barrier_simple_generic);

4511 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);

4512 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);

4513 Barrier->setDebugLoc(DLoc);

4514

4516 (unsigned int)AddressSpace::Generic) {

4517 WorkFnAI = new AddrSpaceCastInst(

4518 WorkFnAI, PointerType::get(Ctx, (unsigned int)AddressSpace::Generic),

4519 WorkFnAI->getName() + ".generic", StateMachineBeginBB);

4521 }

4522

4523 FunctionCallee KernelParallelFn =

4524 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4525 M, OMPRTL___kmpc_kernel_parallel);

4527 KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);

4528 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);

4530 Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",

4531 StateMachineBeginBB);

4533

4534 FunctionType *ParallelRegionFnTy = FunctionType::get(

4535 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},

4536 false);

4537

4541 StateMachineBeginBB);

4542 IsDone->setDebugLoc(DLoc);

4543 BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,

4544 IsDone, StateMachineBeginBB)

4546

4548 StateMachineDoneBarrierBB, IsActiveWorker,

4549 StateMachineIsActiveCheckBB)

4551

4552 Value *ZeroArg =

4554

4555 const unsigned int WrapperFunctionArgNo = 6;

4556

4557

4558

4559

4560 for (int I = 0, E = ReachedKnownParallelRegions.size(); I < E; ++I) {

4561 auto *CB = ReachedKnownParallelRegions[I];

4563 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());

4565 Ctx, "worker_state_machine.parallel_region.execute", Kernel,

4566 StateMachineEndParallelBB);

4567 CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)

4568 ->setDebugLoc(DLoc);

4571

4573 BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",

4574 Kernel, StateMachineEndParallelBB);

4575 A.registerManifestAddedBasicBlock(*PRExecuteBB);

4576 A.registerManifestAddedBasicBlock(*PRNextBB);

4577

4578

4579

4581 if (I + 1 < E || !ReachedUnknownParallelRegions.empty()) {

4584 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);

4586 IsPR = CmpI;

4587 } else {

4589 }

4590

4592 StateMachineIfCascadeCurrentBB)

4594 StateMachineIfCascadeCurrentBB = PRNextBB;

4595 }

4596

4597

4598

4599

4600 if (!ReachedUnknownParallelRegions.empty()) {

4601 StateMachineIfCascadeCurrentBB->setName(

4602 "worker_state_machine.parallel_region.fallback.execute");

4603 CallInst::Create(ParallelRegionFnTy, WorkFn, {ZeroArg, GTid}, "",

4604 StateMachineIfCascadeCurrentBB)

4605 ->setDebugLoc(DLoc);

4606 }

4608 StateMachineIfCascadeCurrentBB)

4610

4611 FunctionCallee EndParallelFn =

4612 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(

4613 M, OMPRTL___kmpc_kernel_end_parallel);

4614 CallInst *EndParallel =

4615 CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);

4616 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);

4618 BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)

4620

4621 CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)

4622 ->setDebugLoc(DLoc);

4625

4626 return true;

4627 }

4628

4629

4630

4631 ChangeStatus updateImpl(Attributor &A) override {

4632 KernelInfoState StateBefore = getState();

4633

4634

4635

4636

4637

4638 struct UpdateKernelEnvCRAII {

4639 AAKernelInfoFunction &AA;

4640

4641 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}

4642

4643 ~UpdateKernelEnvCRAII() {

4644 if (!AA.KernelEnvC)

4645 return;

4646

4647 ConstantStruct *ExistingKernelEnvC =

4649

4650 if (!AA.isValidState()) {

4651 AA.KernelEnvC = ExistingKernelEnvC;

4652 return;

4653 }

4654

4655 if (!AA.ReachedKnownParallelRegions.isValidState())

4656 AA.setUseGenericStateMachineOfKernelEnvironment(

4657 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(

4658 ExistingKernelEnvC));

4659

4660 if (!AA.SPMDCompatibilityTracker.isValidState())

4661 AA.setExecModeOfKernelEnvironment(

4662 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));

4663

4664 ConstantInt *MayUseNestedParallelismC =

4665 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(

4666 AA.KernelEnvC);

4667 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(

4668 MayUseNestedParallelismC->getIntegerType(), AA.NestedParallelism);

4669 AA.setMayUseNestedParallelismOfKernelEnvironment(

4670 NewMayUseNestedParallelismC);

4671 }

4672 } RAII(*this);

4673

4674

4676

4678 return true;

4679

4680 if (I.mayWriteToMemory())

4681 return true;

4683 const auto *UnderlyingObjsAA = A.getAAFor(

4685 DepClassTy::OPTIONAL);

4686 auto *HS = A.getAAFor(

4688 DepClassTy::OPTIONAL);

4689 if (UnderlyingObjsAA &&

4690 UnderlyingObjsAA->forallUnderlyingObjects([&](Value &Obj) {

4691 if (AA::isAssumedThreadLocalObject(A, Obj, *this))

4692 return true;

4693

4694

4695 auto *CB = dyn_cast(&Obj);

4696 return CB && HS && HS->isAssumedHeapToStack(*CB);

4697 }))

4698 return true;

4699 }

4700

4701

4702 SPMDCompatibilityTracker.insert(&I);

4703 return true;

4704 };

4705

4706 bool UsedAssumedInformationInCheckRWInst = false;

4707 if (!SPMDCompatibilityTracker.isAtFixpoint())

4708 if (A.checkForAllReadWriteInstructions(

4709 CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))

4710 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4711

4712 bool UsedAssumedInformationFromReachingKernels = false;

4713 if (!IsKernelEntry) {

4714 updateParallelLevels(A);

4715

4716 bool AllReachingKernelsKnown = true;

4717 updateReachingKernelEntries(A, AllReachingKernelsKnown);

4718 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;

4719

4720 if (!SPMDCompatibilityTracker.empty()) {

4721 if (!ParallelLevels.isValidState())

4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4723 else if (!ReachingKernelEntries.isValidState())

4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4725 else {

4726

4727

4728

4729 int SPMD = 0, Generic = 0;

4730 for (auto *Kernel : ReachingKernelEntries) {

4731 auto *CBAA = A.getAAFor(

4733 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&

4734 CBAA->SPMDCompatibilityTracker.isAssumed())

4735 ++SPMD;

4736 else

4738 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())

4739 UsedAssumedInformationFromReachingKernels = true;

4740 }

4741 if (SPMD != 0 && Generic != 0)

4742 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4743 }

4744 }

4745 }

4746

4747

4748 bool AllParallelRegionStatesWereFixed = true;

4749 bool AllSPMDStatesWereFixed = true;

4752 auto *CBAA = A.getAAFor(

4754 if (!CBAA)

4755 return false;

4756 getState() ^= CBAA->getState();

4757 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();

4758 AllParallelRegionStatesWereFixed &=

4759 CBAA->ReachedKnownParallelRegions.isAtFixpoint();

4760 AllParallelRegionStatesWereFixed &=

4761 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();

4762 return true;

4763 };

4764

4765 bool UsedAssumedInformationInCheckCallInst = false;

4766 if (A.checkForAllCallLikeInstructions(

4767 CheckCallInst, *this, UsedAssumedInformationInCheckCallInst)) {

4769 << "Failed to visit all call-like instructions!\n";);

4770 return indicatePessimisticFixpoint();

4771 }

4772

4773

4774

4775 if (!UsedAssumedInformationInCheckCallInst &&

4776 AllParallelRegionStatesWereFixed) {

4777 ReachedKnownParallelRegions.indicateOptimisticFixpoint();

4778 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();

4779 }

4780

4781

4782

4783 if (!UsedAssumedInformationInCheckRWInst &&

4784 !UsedAssumedInformationInCheckCallInst &&

4785 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)

4786 SPMDCompatibilityTracker.indicateOptimisticFixpoint();

4787

4788 return StateBefore == getState() ? ChangeStatus::UNCHANGED

4789 : ChangeStatus::CHANGED;

4790 }

4791

4792private:

4793

4794 void updateReachingKernelEntries(Attributor &A,

4795 bool &AllReachingKernelsKnown) {

4796 auto PredCallSite = [&](AbstractCallSite ACS) {

4797 Function *Caller = ACS.getInstruction()->getFunction();

4798

4799 assert(Caller && "Caller is nullptr");

4800

4801 auto *CAA = A.getOrCreateAAFor(

4803 if (CAA && CAA->ReachingKernelEntries.isValidState()) {

4804 ReachingKernelEntries ^= CAA->ReachingKernelEntries;

4805 return true;

4806 }

4807

4808

4809

4810 ReachingKernelEntries.indicatePessimisticFixpoint();

4811

4812 return true;

4813 };

4814

4815 if (A.checkForAllCallSites(PredCallSite, *this,

4816 true ,

4817 AllReachingKernelsKnown))

4818 ReachingKernelEntries.indicatePessimisticFixpoint();

4819 }

4820

4821

4822 void updateParallelLevels(Attributor &A) {

4823 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4824 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =

4825 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

4826

4827 auto PredCallSite = [&](AbstractCallSite ACS) {

4828 Function *Caller = ACS.getInstruction()->getFunction();

4829

4830 assert(Caller && "Caller is nullptr");

4831

4832 auto *CAA =

4833 A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));

4834 if (CAA && CAA->ParallelLevels.isValidState()) {

4835 // Any function called by __kmpc_parallel_51 is not folded: its parallel

4836 // level is updated by the runtime, so a correct analysis would depend on

4837 // the runtime implementation, and any future change to that

4838 // implementation could silently invalidate the result. Be conservative

4839 // here.

4840 if (Caller == Parallel51RFI.Declaration) {

4841 ParallelLevels.indicatePessimisticFixpoint();

4842 return true;

4843 }

4844

4845 ParallelLevels ^= CAA->ParallelLevels;

4846

4847 return true;

4848 }

4849

4850 // We lost track of the caller of the kernel; be conservative and treat

4851 // the parallel levels as unknown.

4852 ParallelLevels.indicatePessimisticFixpoint();

4853

4854 return true;

4855 };

4856

4857 bool AllCallSitesKnown = true;

4858 if (!A.checkForAllCallSites(PredCallSite, *this,

4859 true /* RequireAllCallSites */,

4860 AllCallSitesKnown))

4861 ParallelLevels.indicatePessimisticFixpoint();

4862 }

4863};
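// Editor's note (not part of the original source): the ReachingKernelEntries
// and ParallelLevels sets maintained by the helpers above are what the
// runtime-call folding further below (__kmpc_is_spmd_exec_mode,
// __kmpc_parallel_level) consumes.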

4864

4865 /// The call site kernel info abstract attribute: what can we say about a

4866 /// call site with regards to the KernelInfoState. For now this simply

4867 /// forwards the information from the callee.

4868struct AAKernelInfoCallSite : AAKernelInfo {

4869 AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)

4870 : AAKernelInfo(IRP, A) {}

4871

4872 /// See AbstractAttribute::initialize(...).

4873 void initialize(Attributor &A) override {

4874 AAKernelInfo::initialize(A);

4875

4876 CallBase &CB = cast<CallBase>(getAssociatedValue());

4877 auto *AssumptionAA = A.getAAFor<AAAssumptionInfo>(

4878 *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);

4879

4880

4881 if (AssumptionAA && AssumptionAA->hasAssumption("ompx_spmd_amenable")) {

4882 indicateOptimisticFixpoint();

4883 return;

4884 }

4885

4886 // First weed out calls we do not care about: readnone/readonly calls and

4887 // intrinsics cannot reach a parallel region or anything else we are

4888 // looking for.

4889 if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {

4890 indicateOptimisticFixpoint();

4891 return;

4892 }

4893

4894 // Next, check if we know the callee. Known OpenMP runtime functions are

4895 // handled explicitly in the switch below; for anything else an

4896 // AAKernelInfo object on the callee is used to gather information that

4897 // is merged into the current state in updateImpl.

4898 auto CheckCallee = [&](Function *Callee, unsigned NumCallees) {

4899 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

4900 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);

4901 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {

4902 // Unknown callers or declarations are not analyzable, we give up.

4903 if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {

4904

4905

4906

4907 if (!AssumptionAA ||

4908 !(AssumptionAA->hasAssumption("omp_no_openmp") ||

4909 AssumptionAA->hasAssumption("omp_no_parallelism")))

4910 ReachedUnknownParallelRegions.insert(&CB);

4911

4912

4913

4914 if (!SPMDCompatibilityTracker.isAtFixpoint()) {

4915 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4916 SPMDCompatibilityTracker.insert(&CB);

4917 }

4918

4919 // We have updated the state for this unknown call properly, there

4920 // won't be any change so we indicate a fixpoint.

4921 indicateOptimisticFixpoint();

4922 }

4923

4924

4925 return;

4926 }

4927 if (NumCallees > 1) {

4928 indicatePessimisticFixpoint();

4929 return;

4930 }

4931

4932 RuntimeFunction RF = It->getSecond();

4933 switch (RF) {

4934 // All the functions we know are compatible with SPMD mode.

4935 case OMPRTL___kmpc_is_spmd_exec_mode:

4936 case OMPRTL___kmpc_distribute_static_fini:

4937 case OMPRTL___kmpc_for_static_fini:

4938 case OMPRTL___kmpc_global_thread_num:

4939 case OMPRTL___kmpc_get_hardware_num_threads_in_block:

4940 case OMPRTL___kmpc_get_hardware_num_blocks:

4941 case OMPRTL___kmpc_single:

4942 case OMPRTL___kmpc_end_single:

4943 case OMPRTL___kmpc_master:

4944 case OMPRTL___kmpc_end_master:

4945 case OMPRTL___kmpc_barrier:

4946 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:

4947 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:

4948 case OMPRTL___kmpc_error:

4949 case OMPRTL___kmpc_flush:

4950 case OMPRTL___kmpc_get_hardware_thread_id_in_block:

4951 case OMPRTL___kmpc_get_warp_size:

4952 case OMPRTL_omp_get_thread_num:

4953 case OMPRTL_omp_get_num_threads:

4954 case OMPRTL_omp_get_max_threads:

4955 case OMPRTL_omp_in_parallel:

4956 case OMPRTL_omp_get_dynamic:

4957 case OMPRTL_omp_get_cancellation:

4958 case OMPRTL_omp_get_nested:

4959 case OMPRTL_omp_get_schedule:

4960 case OMPRTL_omp_get_thread_limit:

4961 case OMPRTL_omp_get_supported_active_levels:

4962 case OMPRTL_omp_get_max_active_levels:

4963 case OMPRTL_omp_get_level:

4964 case OMPRTL_omp_get_ancestor_thread_num:

4965 case OMPRTL_omp_get_team_size:

4966 case OMPRTL_omp_get_active_level:

4967 case OMPRTL_omp_in_final:

4968 case OMPRTL_omp_get_proc_bind:

4969 case OMPRTL_omp_get_num_places:

4970 case OMPRTL_omp_get_num_procs:

4971 case OMPRTL_omp_get_place_proc_ids:

4972 case OMPRTL_omp_get_place_num:

4973 case OMPRTL_omp_get_partition_num_places:

4974 case OMPRTL_omp_get_partition_place_nums:

4975 case OMPRTL_omp_get_wtime:

4976 break;

4977 case OMPRTL___kmpc_distribute_static_init_4:

4978 case OMPRTL___kmpc_distribute_static_init_4u:

4979 case OMPRTL___kmpc_distribute_static_init_8:

4980 case OMPRTL___kmpc_distribute_static_init_8u:

4981 case OMPRTL___kmpc_for_static_init_4:

4982 case OMPRTL___kmpc_for_static_init_4u:

4983 case OMPRTL___kmpc_for_static_init_8:

4984 case OMPRTL___kmpc_for_static_init_8u: {

4985 // Check the schedule: only static schedules are allowed in SPMD mode.

4986 unsigned ScheduleArgOpNo = 2;

4987 auto *ScheduleTypeCI =

4988 dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));

4989 unsigned ScheduleTypeVal =

4990 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;

4991 switch (OMPScheduleType(ScheduleTypeVal)) {

4992 case OMPScheduleType::UnorderedStatic:

4993 case OMPScheduleType::UnorderedStaticChunked:

4994 case OMPScheduleType::OrderedDistribute:

4995 case OMPScheduleType::OrderedDistributeChunked:

4996 break;

4997 default:

4998 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

4999 SPMDCompatibilityTracker.insert(&CB);

5000 break;

5001 };

5002 } break;

5003 case OMPRTL___kmpc_target_init:

5004 KernelInitCB = &CB;

5005 break;

5006 case OMPRTL___kmpc_target_deinit:

5007 KernelDeinitCB = &CB;

5008 break;

5009 case OMPRTL___kmpc_parallel_51:

5010 if (!handleParallel51(A, CB))

5011 indicatePessimisticFixpoint();

5012 return;

5013 case OMPRTL___kmpc_omp_task:

5014 // We do not look into tasks right now, just give up.

5015 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

5016 SPMDCompatibilityTracker.insert(&CB);

5017 ReachedUnknownParallelRegions.insert(&CB);

5018 break;

5019 case OMPRTL___kmpc_alloc_shared:

5020 case OMPRTL___kmpc_free_shared:

5021 // Return without setting a fixpoint, to be resolved in updateImpl.

5022 return;

5023 default:

5024 // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,

5025 // generally. However, they do not hide parallel regions.

5026 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

5027 SPMDCompatibilityTracker.insert(&CB);

5028 break;

5029 }

5030

5031 // All other OpenMP runtime calls will not reach parallel regions; they

5032 // are known calls whose effects are fully modeled, so no update is needed.

5033 indicateOptimisticFixpoint();

5034 };

5035

5036 const auto *AACE =

5037 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);

5038 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {

5039 CheckCallee(getAssociatedFunction(), 1);

5040 return;

5041 }

5042 const auto &OptimisticEdges = AACE->getOptimisticEdges();

5043 for (auto *Callee : OptimisticEdges) {

5044 CheckCallee(Callee, OptimisticEdges.size());

5045 if (isAtFixpoint())

5046 break;

5047 }

5048 }
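// Editor's note (not part of the original source): when AACallEdges provides
// a valid set of optimistic callees, each candidate is checked in turn;
// otherwise the statically associated callee is treated as the only one.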

5049

5050 ChangeStatus updateImpl(Attributor &A) override {

5051 // TODO: Once we have call site specific value information we can provide

5052 //       call site specific liveness information and then it makes

5053 //       sense to specialize attributes for call sites arguments instead

5054 //       of redirecting requests to the callee argument.

5055 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

5056 KernelInfoState StateBefore = getState();

5057

5058 auto CheckCallee = [&](Function *F, int NumCallees) {

5059 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(F);

5060

5061 // If F is not a known runtime function, propagate the AAKernelInfo of

5062 // the callee.

5063 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {

5064 const IRPosition &FnPos = IRPosition::function(*F);

5065 auto *FnAA =

5066 A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);

5067 if (!FnAA)

5068 return indicatePessimisticFixpoint();

5069 if (getState() == FnAA->getState())

5070 return ChangeStatus::UNCHANGED;

5071 getState() = FnAA->getState();

5072 return ChangeStatus::CHANGED;

5073 }

5074 if (NumCallees > 1)

5075 return indicatePessimisticFixpoint();

5076

5077 CallBase &CB = cast<CallBase>(getAssociatedValue());

5078 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {

5079 if (!handleParallel51(A, CB))

5080 return indicatePessimisticFixpoint();

5081 return StateBefore == getState() ? ChangeStatus::UNCHANGED

5082 : ChangeStatus::CHANGED;

5083 }

5084

5085 // F is a runtime function that allocates or frees memory, check

5086 // AAHeapToStack and AAHeapToShared.

5087 assert(

5088 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||

5089 It->getSecond() == OMPRTL___kmpc_free_shared) &&

5090 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");

5091

5092 auto *HeapToStackAA = A.getAAFor<AAHeapToStack>(

5093 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);

5094 auto *HeapToSharedAA = A.getAAFor<AAHeapToShared>(

5095 *this, IRPosition::function(*CB.getCaller()), DepClassTy::OPTIONAL);

5096

5097 RuntimeFunction RF = It->getSecond();

5098

5099 switch (RF) {

5100 // If neither HeapToStack nor HeapToShared assume the call is removed,

5101 // assume SPMD incompatibility.

5102 case OMPRTL___kmpc_alloc_shared:

5103 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&

5104 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))

5105 SPMDCompatibilityTracker.insert(&CB);

5106 break;

5107 case OMPRTL___kmpc_free_shared:

5108 if ((!HeapToStackAA ||

5109 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&

5110 (!HeapToSharedAA ||

5111 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))

5112 SPMDCompatibilityTracker.insert(&CB);

5113 break;

5114 default:

5115 SPMDCompatibilityTracker.indicatePessimisticFixpoint();

5116 SPMDCompatibilityTracker.insert(&CB);

5117 }

5118 return ChangeStatus::CHANGED;

5119 };

5120

5121 const auto *AACE =

5122 A.getAAFor<AACallEdges>(*this, getIRPosition(), DepClassTy::OPTIONAL);

5123 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {

5124 if (Function *F = getAssociatedFunction())

5125 CheckCallee(F, 1);

5126 } else {

5127 const auto &OptimisticEdges = AACE->getOptimisticEdges();

5128 for (auto *Callee : OptimisticEdges) {

5129 CheckCallee(Callee, OptimisticEdges.size());

5130 if (isAtFixpoint())

5131 break;

5132 }

5133 }

5134

5135 return StateBefore == getState() ? ChangeStatus::UNCHANGED

5136 : ChangeStatus::CHANGED;

5137 }
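// Editor's note (not part of the original source): for __kmpc_alloc_shared
// and __kmpc_free_shared the SPMD compatibility decision is deferred to
// updateImpl so AAHeapToStack/AAHeapToShared can first prove the calls
// removable.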

5138

5139 /// Deal with a __kmpc_parallel_51 call (\p CB). Returns true if the call

5140 /// was handled; if a problem occurred, false is returned.

5141 bool handleParallel51(Attributor &A, CallBase &CB) {

5142 const unsigned int NonWrapperFunctionArgNo = 5;

5143 const unsigned int WrapperFunctionArgNo = 6;

5144 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()

5145 ? NonWrapperFunctionArgNo

5146 : WrapperFunctionArgNo;

5147

5148 auto *ParallelRegion = dyn_cast<Function>(

5149 CB.getArgOperand(ParallelRegionOpArgNo)->stripPointerCasts());

5150 if (!ParallelRegion)

5151 return false;

5152

5153 ReachedKnownParallelRegions.insert(&CB);

5154

5155 auto *FnAA = A.getAAFor<AAKernelInfo>(

5156 *this, IRPosition::function(*ParallelRegion), DepClassTy::OPTIONAL);

5157 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||

5158 !FnAA->ReachedKnownParallelRegions.empty() ||

5159 !FnAA->ReachedKnownParallelRegions.isValidState() ||

5160 !FnAA->ReachedUnknownParallelRegions.isValidState() ||

5161 !FnAA->ReachedUnknownParallelRegions.empty();

5162 return true;

5163 }

5164};
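// Editor's note (not part of the original source): __kmpc_parallel_51
// receives the outlined parallel region either directly (argument 5) or via
// a wrapper (argument 6) depending on the assumed execution mode;
// handleParallel51 records the region and flags potential nested parallelism.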

5165

5166struct AAFoldRuntimeCall

5167 : public StateWrapper<BooleanState, AbstractAttribute> {

5168 using Base = StateWrapper<BooleanState, AbstractAttribute>;

5169

5170 AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

5171

5172 /// Statistics are tracked as part of manifest for now.

5173 void trackStatistics() const override {}

5174

5175 /// Create an abstract attribute view for the position \p IRP.

5176 static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,

5177 Attributor &A);

5178

5179 /// See AbstractAttribute::getName().

5180 StringRef getName() const override { return "AAFoldRuntimeCall"; }

5181

5182 /// See AbstractAttribute::getIdAddr().

5183 const char *getIdAddr() const override { return &ID; }

5184

5185 /// This function should return true if the type of the \p AA is

5186 /// AAFoldRuntimeCall.

5187 static bool classof(const AbstractAttribute *AA) {

5188 return (AA->getIdAddr() == &ID);

5189 }

5190 /// Unique ID (due to the unique address).

5191 static const char ID;

5192};

5193

5194struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {

5195 AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)

5196 : AAFoldRuntimeCall(IRP, A) {}

5197

5198 /// See AbstractAttribute::getAsStr().

5199 const std::string getAsStr(Attributor *) const override {

5200 if (!isValidState())

5201 return "";

5202

5203 std::string Str("simplified value: ");

5204

5205 if (!SimplifiedValue)

5206 return Str + std::string("none");

5207

5208 if (!*SimplifiedValue)

5209 return Str + std::string("nullptr");

5210

5211 if (auto *CI = dyn_cast<ConstantInt>(*SimplifiedValue))

5212 return Str + std::to_string(CI->getSExtValue());

5213

5214 return Str + std::string("unknown");

5215 }

5216

5217 void initialize(Attributor &A) override {

5218 if (DisableOpenMPOptFolding)

5219 indicatePessimisticFixpoint();

5220

5221 Function *Callee = getAssociatedFunction();

5222

5223 auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

5224 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);

5225 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&

5226 "Expected a known OpenMP runtime function");

5227

5228 RFKind = It->getSecond();

5229

5230 CallBase &CB = cast<CallBase>(getAssociatedValue());

5231 A.registerSimplificationCallback(

5232 IRPosition::callsite_returned(CB),

5233 [&](const IRPosition &IRP, const AbstractAttribute *AA,

5234 bool &UsedAssumedInformation) -> std::optional<Value *> {

5235 assert((isValidState() || SimplifiedValue == nullptr) &&

5236 "Unexpected invalid state!");

5237

5238 if (!isAtFixpoint()) {

5239 UsedAssumedInformation = true;

5240 if (AA)

5241 A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);

5242 }

5243 return SimplifiedValue;

5244 });

5245 }

5246

5247 ChangeStatus updateImpl(Attributor &A) override {

5248 ChangeStatus Changed = ChangeStatus::UNCHANGED;

5249 switch (RFKind) {

5250 case OMPRTL___kmpc_is_spmd_exec_mode:

5251 Changed |= foldIsSPMDExecMode(A);

5252 break;

5253 case OMPRTL___kmpc_parallel_level:

5254 Changed |= foldParallelLevel(A);

5255 break;

5256 case OMPRTL___kmpc_get_hardware_num_threads_in_block:

5257 Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");

5258 break;

5259 case OMPRTL___kmpc_get_hardware_num_blocks:

5260 Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");

5261 break;

5262 default:

5263 llvm_unreachable("Unhandled OpenMP runtime function!");

5264 }

5265

5266 return Changed;

5267 }

5268

5269 ChangeStatus manifest(Attributor &A) override {

5270 ChangeStatus Changed = ChangeStatus::UNCHANGED;

5271

5272 if (SimplifiedValue && *SimplifiedValue) {

5273 Instruction &I = *getCtxI();

5274 A.changeAfterManifest(IRPosition::inst(I), **SimplifiedValue);

5275 A.deleteAfterManifest(I);

5276

5277 CallBase *CB = dyn_cast<CallBase>(&I);

5278 auto Remark = [&](OptimizationRemark OR) {

5279 if (auto *C = dyn_cast<ConstantInt>(*SimplifiedValue))

5280 return OR << "Replacing OpenMP runtime call "

5281 << CB->getCalledFunction()->getName() << " with "

5282 << ore::NV("FoldedValue", C->getZExtValue()) << ".";

5283 return OR << "Replacing OpenMP runtime call "

5284 << CB->getCalledFunction()->getName() << ".";

5285 };

5286

5287 if (CB && EnableVerboseRemarks)

5288 A.emitRemark<OptimizationRemark>(CB, "OMP180", Remark);

5289

5290 LLVM_DEBUG(dbgs() << TAG << "Replacing runtime call: " << I << " with "

5291 << **SimplifiedValue << "\n");

5292

5293 Changed = ChangeStatus::CHANGED;

5294 }

5295

5296 return Changed;

5297 }

5298

5299 ChangeStatus indicatePessimisticFixpoint() override {

5300 SimplifiedValue = nullptr;

5301 return AAFoldRuntimeCall::indicatePessimisticFixpoint();

5302 }

5303

5304private:

5305 /// Fold __kmpc_is_spmd_exec_mode into a constant if possible.

5306 ChangeStatus foldIsSPMDExecMode(Attributor &A) {

5307 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;

5308

5309 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;

5310 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;

5311 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(

5312 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

5313

5314 if (!CallerKernelInfoAA ||

5315 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())

5316 return indicatePessimisticFixpoint();

5317

5318 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {

5319 auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),

5320 DepClassTy::REQUIRED);

5321

5322 if (!AA || !AA->isValidState()) {

5323 SimplifiedValue = nullptr;

5324 return indicatePessimisticFixpoint();

5325 }

5326

5327 if (AA->SPMDCompatibilityTracker.isAssumed()) {

5328 if (AA->SPMDCompatibilityTracker.isAtFixpoint())

5329 ++KnownSPMDCount;

5330 else

5331 ++AssumedSPMDCount;

5332 } else {

5333 if (AA->SPMDCompatibilityTracker.isAtFixpoint())

5334 ++KnownNonSPMDCount;

5335 else

5336 ++AssumedNonSPMDCount;

5337 }

5338 }

5339

5340 if ((AssumedSPMDCount + KnownSPMDCount) &&

5341 (AssumedNonSPMDCount + KnownNonSPMDCount))

5342 return indicatePessimisticFixpoint();

5343

5344 auto &Ctx = getAnchorValue().getContext();

5345 if (KnownSPMDCount || AssumedSPMDCount) {

5346 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&

5347 "Expected only SPMD kernels!");

5348 // All reaching kernels are in SPMD mode. Update all function calls to

5349 // __kmpc_is_spmd_exec_mode to 1.

5350 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);

5351 } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {

5352 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&

5353 "Expected only non-SPMD kernels!");

5354 // All reaching kernels are in non-SPMD mode. Update all function calls

5355 // to __kmpc_is_spmd_exec_mode to 0.

5356 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);

5357 } else {

5358 // We have no information about the reaching kernels, hence we cannot

5359 // tell if the associated call site can be folded. At this point,

5360 // SimplifiedValue must be none.

5361 assert(!SimplifiedValue && "SimplifiedValue should be none");

5362 }

5363

5364 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED

5365 : ChangeStatus::CHANGED;

5366 }
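// Illustrative example (editorial, not part of the original source): if
// every reaching kernel is known or assumed to run in SPMD mode, a call
// such as
//   %mode = call i8 @__kmpc_is_spmd_exec_mode()
// folds to the constant i8 1, and the call is erased when the attribute is
// manifested.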

5367

5368 /// Fold __kmpc_parallel_level into a constant if possible.

5369 ChangeStatus foldParallelLevel(Attributor &A) {

5370 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;

5371

5372 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(

5373 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

5374

5375 if (!CallerKernelInfoAA ||

5376 !CallerKernelInfoAA->ParallelLevels.isValidState())

5377 return indicatePessimisticFixpoint();

5378

5379 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())

5380 return indicatePessimisticFixpoint();

5381

5382 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {

5383 assert(!SimplifiedValue &&

5384 "SimplifiedValue should keep none at this point");

5385 return ChangeStatus::UNCHANGED;

5386 }

5387

5388 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;

5389 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;

5390 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {

5391 auto *AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),

5392 DepClassTy::REQUIRED);

5393 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())

5394 return indicatePessimisticFixpoint();

5395

5396 if (AA->SPMDCompatibilityTracker.isAssumed()) {

5397 if (AA->SPMDCompatibilityTracker.isAtFixpoint())

5398 ++KnownSPMDCount;

5399 else

5400 ++AssumedSPMDCount;

5401 } else {

5402 if (AA->SPMDCompatibilityTracker.isAtFixpoint())

5403 ++KnownNonSPMDCount;

5404 else

5405 ++AssumedNonSPMDCount;

5406 }

5407 }

5408

5409 if ((AssumedSPMDCount + KnownSPMDCount) &&

5410 (AssumedNonSPMDCount + KnownNonSPMDCount))

5411 return indicatePessimisticFixpoint();

5412

5413 auto &Ctx = getAnchorValue().getContext();

5414 // If the caller can only be reached by SPMD kernel entries, the parallel

5415 // level is 1. Similarly, if the caller can only be reached by non-SPMD

5416 // kernel entries, the level is 0.

5417 if (AssumedSPMDCount || KnownSPMDCount) {

5418 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&

5419 "Expected only SPMD kernels!");

5420 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);

5421 } else {

5422 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&

5423 "Expected only non-SPMD kernels!");

5424 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);

5425 }

5426 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED

5427 : ChangeStatus::CHANGED;

5428 }
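// Illustrative example (editorial, not part of the original source): a
// __kmpc_parallel_level call reachable only from SPMD kernels folds to i8 1,
// one reachable only from generic-mode kernels folds to i8 0, and mixed
// reachability leaves the call alone.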

5429

5430 ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {

5431 // Specialize only if all the calls agree with the attribute.

5432 int32_t CurrentAttrValue = -1;

5433 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;

5434

5435 auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(

5436 *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

5437

5438 if (!CallerKernelInfoAA ||

5439 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())

5440 return indicatePessimisticFixpoint();

5441

5442 // Iterate over the kernels that reach this function.

5443 for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {

5444 int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);

5445

5446 if (NextAttrVal == -1 ||

5447 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))

5448 return indicatePessimisticFixpoint();

5449 CurrentAttrValue = NextAttrVal;

5450 }

5451

5452 if (CurrentAttrValue != -1) {

5453 auto &Ctx = getAnchorValue().getContext();

5454 SimplifiedValue =

5455 ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);

5456 }

5457 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED

5458 : ChangeStatus::CHANGED;

5459 }
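// Editor's note (not part of the original source): the fold above succeeds
// only if every reaching kernel carries the same integer value for the
// queried attribute, e.g. "omp_target_thread_limit" or
// "omp_target_num_teams".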

5460

5461 /// An optional value the associated value is assumed to fold to. That is,

5462 /// we assume the associated value (which is a call) can be replaced by

5463 /// this simplified value.

5464 std::optional<Value *> SimplifiedValue;

5465

5466 /// The runtime function kind of the callee of the associated call site.

5467 RuntimeFunction RFKind;

5468 };

5469

5470}

5471

5472

5473void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {

5474 auto &RFI = OMPInfoCache.RFIs[RF];

5475 RFI.foreachUse(SCC, [&](Use &U, Function &F) {

5476 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);

5477 if (!CI)

5478 return false;

5479 A.getOrCreateAAFor<AAFoldRuntimeCall>(

5480 IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,

5481 DepClassTy::NONE, /* ForceUpdate */ false,

5482 /* UpdateAfterInit */ false);

5483 return false;

5484 });

5485}
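// Editor's note (not part of the original source): registering the
// AAFoldRuntimeCall AAs with DepClassTy::NONE and no forced update ensures
// their simplification callbacks exist before any other AA queries these
// call sites.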

5486

5487void OpenMPOpt::registerAAs(bool IsModulePass) {

5488 if (SCC.empty())

5489 return;

5490

5491 if (IsModulePass) {

5492 // Ensure we create the AAKernelInfo AAs first and without triggering an

5493 // update. This will make sure we register all value simplification

5494 // callbacks before any other AA has the chance to create an

5495 // AAValueSimplify or similar.

5496 auto CreateKernelInfoCB = [&](Use &, Function &Kernel) {

5497 A.getOrCreateAAFor<AAKernelInfo>(

5498 IRPosition::function(Kernel), /* QueryingAA */ nullptr,

5499 DepClassTy::NONE, /* ForceUpdate */ false,

5500 /* UpdateAfterInit */ false);

5501 return false;

5502 };

5503 OMPInformationCache::RuntimeFunctionInfo &InitRFI =

5504 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];

5505 InitRFI.foreachUse(SCC, CreateKernelInfoCB);

5506

5507 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);

5508 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);

5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);

5510 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);

5511 }

5512

5513 // Create CallSite AA for all Getters.

5514 if (DeduceICVValues) {

5515 for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {

5516 auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

5517

5518 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

5519

5520 auto CreateAA = [&](Use &U, Function &Caller) {

5521 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);

5522 if (!CI)

5523 return false;

5524

5525 auto &CB = cast<CallBase>(*CI);

5526

5527 IRPosition CBPos = IRPosition::callsite_function(CB);

5528 A.getOrCreateAAFor<AAICVTracker>(CBPos);

5529 return false;

5530 };

5531

5532 GetterRFI.foreachUse(SCC, CreateAA);

5533 }

5534 }

5535

5536 // Create an ExecutionDomain AA for every function and a HeapToStack AA

5537 // for every function if there is a device kernel.

5538 if (!isOpenMPDevice(M))

5539 return;

5540

5541 for (auto *F : SCC) {

5542 if (F->isDeclaration())

5543 continue;

5544

5545 // We look at internal functions only on-demand but if any use is not a

5546 // direct call or outside the current set of analyzed functions, we have

5547 // to do it eagerly.

5548 if (F->hasLocalLinkage()) {

5549 if (llvm::all_of(F->uses(), [this](const Use &U) {

5550 const auto *CB = dyn_cast(U.getUser());

5551 return CB && CB->isCallee(&U) &&

5552 A.isRunOn(const_cast<Function *>(CB->getCaller()));

5553 }))

5554 continue;

5555 }

5556 registerAAsForFunction(A, *F);

5557 }

5558}

5559

5560 void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {

5561 if (!DisableOpenMPOptDeglobalization)

5562 A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));

5563 A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));

5564 if (!DisableOpenMPOptDeglobalization)

5565 A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(F));

5566 if (F.hasFnAttribute(Attribute::Convergent))

5567 A.getOrCreateAAFor<AANonConvergent>(IRPosition::function(F));

5568

5569 for (auto &I : instructions(F)) {

5570 if (auto *LI = dyn_cast<LoadInst>(&I)) {

5571 bool UsedAssumedInformation = false;

5572 A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,

5573 UsedAssumedInformation, AA::Interprocedural);

5574 A.getOrCreateAAFor<AAAddressSpace>(

5575 IRPosition::value(*LI->getPointerOperand()));

5576 continue;

5577 }

5578 if (auto *CI = dyn_cast<CallBase>(&I)) {

5579 if (CI->isIndirectCall())

5580 A.getOrCreateAAFor<AAIndirectCallInfo>(

5581 IRPosition::callsite_function(*CI));

5582 }

5583 if (auto *SI = dyn_cast<StoreInst>(&I)) {

5584 A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));

5585 A.getOrCreateAAFor<AAAddressSpace>(

5586 IRPosition::value(*SI->getPointerOperand()));

5587 continue;

5588 }

5589 if (auto *FI = dyn_cast<FenceInst>(&I)) {

5590 A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI));

5591 continue;

5592 }

5593 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {

5594 if (II->getIntrinsicID() == Intrinsic::assume) {

5595 A.getOrCreateAAFor<AAPotentialValues>(

5596 IRPosition::value(*II->getArgOperand(0)));

5597 continue;

5598 }

5599 }

5600 }

5601 }

5602

5603const char AAICVTracker::ID = 0;

5604 const char AAKernelInfo::ID = 0;

5605 const char AAExecutionDomain::ID = 0;

5606 const char AAHeapToShared::ID = 0;

5607const char AAFoldRuntimeCall::ID = 0;

5608

5609AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,

5610 Attributor &A) {

5611 AAICVTracker *AA = nullptr;

5612 switch (IRP.getPositionKind()) {

5613 case IRPosition::IRP_INVALID:

5614 case IRPosition::IRP_FLOAT:

5615 case IRPosition::IRP_ARGUMENT:

5616 case IRPosition::IRP_CALL_SITE_ARGUMENT:

5617 llvm_unreachable("ICVTracker can only be created for function position!");

5618 case IRPosition::IRP_RETURNED:

5619 AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);

5620 break;

5621 case IRPosition::IRP_CALL_SITE_RETURNED:

5622 AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);

5623 break;

5624 case IRPosition::IRP_CALL_SITE:

5625 AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);

5626 break;

5627 case IRPosition::IRP_FUNCTION:

5628 AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);

5629 break;

5630 }

5631

5632 return *AA;

5633}

5634

5635 AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,

5636 Attributor &A) {

5637 AAExecutionDomainFunction *AA = nullptr;

5638 switch (IRP.getPositionKind()) {

5639 case IRPosition::IRP_INVALID:

5640 case IRPosition::IRP_FLOAT:

5641 case IRPosition::IRP_ARGUMENT:

5642 case IRPosition::IRP_CALL_SITE_ARGUMENT:

5643 case IRPosition::IRP_RETURNED:

5644 case IRPosition::IRP_CALL_SITE_RETURNED:

5645 case IRPosition::IRP_CALL_SITE:

5646 llvm_unreachable(

5647 "AAExecutionDomain can only be created for function position!");

5648 case IRPosition::IRP_FUNCTION:

5649 AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);

5650 break;

5651 }

5652

5653 return *AA;

5654}

5655

5656 AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,

5657 Attributor &A) {

5658 AAHeapToSharedFunction *AA = nullptr;

5659 switch (IRP.getPositionKind()) {

5660 case IRPosition::IRP_INVALID:

5661 case IRPosition::IRP_FLOAT:

5662 case IRPosition::IRP_ARGUMENT:

5663 case IRPosition::IRP_CALL_SITE_ARGUMENT:

5664 case IRPosition::IRP_RETURNED:

5665 case IRPosition::IRP_CALL_SITE_RETURNED:

5666 case IRPosition::IRP_CALL_SITE:

5667 llvm_unreachable(

5668 "AAHeapToShared can only be created for function position!");

5669 case IRPosition::IRP_FUNCTION:

5670 AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);

5671 break;

5672 }

5673

5674 return *AA;

5675}

5676

5677AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,

5678 Attributor &A) {

5679 AAKernelInfo *AA = nullptr;

5680 switch (IRP.getPositionKind()) {

5681 case IRPosition::IRP_INVALID:

5682 case IRPosition::IRP_FLOAT:

5683 case IRPosition::IRP_ARGUMENT:

5684 case IRPosition::IRP_RETURNED:

5685 case IRPosition::IRP_CALL_SITE_RETURNED:

5686 case IRPosition::IRP_CALL_SITE_ARGUMENT:

5687 llvm_unreachable("KernelInfo can only be created for function position!");

5688 case IRPosition::IRP_CALL_SITE:

5689 AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);

5690 break;

5691 case IRPosition::IRP_FUNCTION:

5692 AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);

5693 break;

5694 }

5695

5696 return *AA;

5697}

5698

5699AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,

5700 Attributor &A) {

5701 AAFoldRuntimeCall *AA = nullptr;

5702 switch (IRP.getPositionKind()) {

5703 case IRPosition::IRP_INVALID:

5704 case IRPosition::IRP_FLOAT:

5705 case IRPosition::IRP_ARGUMENT:

5706 case IRPosition::IRP_RETURNED:

5707 case IRPosition::IRP_FUNCTION:

5708 case IRPosition::IRP_CALL_SITE:

5709 case IRPosition::IRP_CALL_SITE_ARGUMENT:

5710 llvm_unreachable("KernelInfo can only be created for call site position!");

5711 case IRPosition::IRP_CALL_SITE_RETURNED:

5712 AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);

5713 break;

5714 }

5715

5716 return *AA;

5717}

5718

5719 PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {

5720 if (!containsOpenMP(M))

5721 return PreservedAnalyses::all();

5722 if (DisableOpenMPOptimizations)

5723 return PreservedAnalyses::all();

5724

5725 FunctionAnalysisManager &FAM =

5726 AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

5727 KernelSet Kernels = getDeviceKernels(M);

5728

5729 if (PrintModuleBeforeOptimizations)

5730 LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt Module Pass:\n" << M);

5731

5732 auto IsCalled = [&](Function &F) {

5733 if (Kernels.contains(&F))

5734 return true;

5735 return !F.use_empty();

5736 };

5737

5738 auto EmitRemark = [&](Function &F) {

5739 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);

5740 ORE.emit([&]() {

5741 OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);

5742 return ORA << "Could not internalize function. "

5743 << "Some optimizations may not be possible. [OMP140]";

5744 });

5745 };

5746

5747 bool Changed = false;

5748

5749 // Create internal copies of each function if this is a kernel Module. This

5750 // allows interprocedural passes to see every call edge.

5751 DenseMap<Function *, Function *> InternalizedMap;

5752 if (isOpenMPDevice(M)) {

5753 SmallPtrSet<Function *, 16> InternalizeFns;

5754 for (Function &F : M)

5755 if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&

5756 !DisableInternalization) {

5757 if (Attributor::isInternalizable(F)) {

5758 InternalizeFns.insert(&F);

5759 } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {

5760 EmitRemark(F);

5761 }

5762 }

5763

5764 Changed |=

5765 Attributor::internalizeFunctions(InternalizeFns, InternalizedMap);

5766 }

5767

5768 // Look at every function in the Module unless it was internalized.

5769 SetVector<Function *> Functions;

5770 SmallVector<Function *, 16> SCC;

5771 for (Function &F : M)

5772 if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {

5773 SCC.push_back(&F);

5774 Functions.insert(&F);

5775 }

5776

5777 if (SCC.empty())

5778 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();

5779

5780 AnalysisGetter AG(FAM);

5781

5782 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {

5783 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);

5784 };

5785

5786 BumpPtrAllocator Allocator;

5787 CallGraphUpdater CGUpdater;

5788

5789 bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||

5790 LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink ||

5791 LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;

5792 OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);

5793

5794 unsigned MaxFixpointIterations =

5795 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;

5796

5797 AttributorConfig AC(CGUpdater);

5798 AC.DefaultInitializeLiveInternals = false;

5799 AC.IsModulePass = true;

5800 AC.RewriteSignatures = false;

5801 AC.MaxFixpointIterations = MaxFixpointIterations;

5802 AC.OREGetter = OREGetter;

5803 AC.PassName = DEBUG_TYPE;

5804 AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;

5805 AC.IPOAmendableCB = [](const Function &F) {

5806 return F.hasFnAttribute("kernel");

5807 };

5808

5809 Attributor A(Functions, InfoCache, AC);

5810

5811 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);

5812 Changed |= OMPOpt.run(true);

5813

5814 // Optionally inline device functions for potentially better performance.

5815 if (AlwaysInlineDeviceFunctions && isOpenMPDevice(M))

5816 for (Function &F : M)

5817 if (!F.isDeclaration() && !Kernels.contains(&F) &&

5818 !F.hasFnAttribute(Attribute::NoInline))

5819 F.addFnAttr(Attribute::AlwaysInline);

5820

5821 if (PrintModuleAfterOptimizations)

5822 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt Module Pass:\n" << M);

5823

5824 if (Changed)

5825 return PreservedAnalyses::none();

5826

5827 return PreservedAnalyses::all();

5828 }

5829

5830 PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,

5831 CGSCCAnalysisManager &AM,

5832 LazyCallGraph &CG,

5833 CGSCCUpdateResult &UR) {

5834 if (!containsOpenMP(*C.begin()->getFunction().getParent()))

5835 return PreservedAnalyses::all();

5836 if (DisableOpenMPOptimizations)

5837 return PreservedAnalyses::all();

5838

5839 SmallVector<Function *, 16> SCC;

5840 // If there are kernels in the module, we have to run on all SCC's.

5841 for (LazyCallGraph::Node &N : C) {

5842 Function *Fn = &N.getFunction();

5843 SCC.push_back(Fn);

5844 }

5845

5846 if (SCC.empty())

5847 return PreservedAnalyses::all();

5848

5849 Module &M = *C.begin()->getFunction().getParent();

5850

5851 if (PrintModuleBeforeOptimizations)

5852 LLVM_DEBUG(dbgs() << TAG << "Module before OpenMPOpt CGSCC Pass:\n" << M);

5853

5854 FunctionAnalysisManager &FAM =

5855 AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();

5856

5857 AnalysisGetter AG(FAM);

5858

5859 auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {

5860 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);

5861 };

5862

5863 BumpPtrAllocator Allocator;

5864 CallGraphUpdater CGUpdater;

5865 CGUpdater.initialize(CG, C, AM, UR);

5866

5867 bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||

5868 LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink ||

5869 LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;

5870 SetVector<Function *> Functions(SCC.begin(), SCC.end());

5871 OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,

5872 &Functions, PostLink);

5873

5874 unsigned MaxFixpointIterations =

5875 (isOpenMPDevice(M)) ? SetFixpointIterations : 32;

5876

5877 AttributorConfig AC(CGUpdater);

5878 AC.DefaultInitializeLiveInternals = false;

5879 AC.IsModulePass = false;

5880 AC.RewriteSignatures = false;

5881 AC.MaxFixpointIterations = MaxFixpointIterations;

5882 AC.OREGetter = OREGetter;

5883 AC.PassName = DEBUG_TYPE;

5884 AC.InitializationCallback = OpenMPOpt::registerAAsForFunction;

5885

5886 Attributor A(Functions, InfoCache, AC);

5887

5888 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);

5889 bool Changed = OMPOpt.run(false);

5890

5891 if (PrintModuleAfterOptimizations)

5892 LLVM_DEBUG(dbgs() << TAG << "Module after OpenMPOpt CGSCC Pass:\n" << M);

5893

5894 if (Changed)

5895 return PreservedAnalyses::none();

5896

5897 return PreservedAnalyses::all();

5898 }

5899

5900 bool llvm::omp::isOpenMPKernel(Function &Fn) {

5901 return Fn.hasFnAttribute("kernel");

5902 }

5903

5904 KernelSet llvm::omp::getDeviceKernels(Module &M) {

5905 KernelSet Kernels;

5906

5907 for (Function &F : M)

5908 if (F.hasKernelCallingConv()) {

5909 // We are only interested in OpenMP target regions. Others, such as

5910 // kernels generated by CUDA but linked together, are not interesting to

5911 // this pass.

5912 if (isOpenMPKernel(F)) {

5913 ++NumOpenMPTargetRegionKernels;

5914 Kernels.insert(&F);

5915 } else

5916 ++NumNonOpenMPTargetRegionKernels;

5917 }

5918

5919 return Kernels;

5920}

5921

5922 bool llvm::omp::containsOpenMP(Module &M) {

5923 Metadata *MD = M.getModuleFlag("openmp");

5924 if (!MD)

5925 return false;

5926

5927 return true;

5928}
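// Illustrative example (editorial, not part of the original source): Clang
// marks OpenMP modules with a module flag, e.g.
//   !llvm.module.flags = !{!0}
//   !0 = !{i32 7, !"openmp", i32 50}
// which is what the getModuleFlag("openmp") check above detects.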

5929

5930 bool llvm::omp::isOpenMPDevice(Module &M) {

5931 Metadata *MD = M.getModuleFlag("openmp-device");

5932 if (!MD)

5933 return false;

5934

5935 return true;

5936}
