LLVM: lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp Source File
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;

  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  Function &F;
  const UniformityInfo &UA;
  const DataLayout &DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget &ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;
  SmallVector<ReplacementInfo> ToReplace;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
                            ScanOptions ScanImpl)
      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
        ScanImpl(ScanImpl) {}

  bool run();

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run() {
  // The None strategy disables the pass entirely.
  if (ScanImpl == ScanOptions::None)
    return false;

  visit(F);
  if (ToReplace.empty())
    return false;

  for (auto &[I, Op, ValIdx, ValDivergent] : ToReplace)
    optimizeAtomic(*I, Op, ValIdx, ValDivergent);
  ToReplace.clear();
  return true;
}
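
// Illustrative sketch of the overall transformation (simplified, not taken
// from any particular test): for a wavefront executing
//
//   %old = atomicrmw add ptr addrspace(1) %p, i32 %v monotonic
//
// with a uniform %p and uniform %v, the rewrite performed by optimizeAtomic()
// below is roughly
//
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %count  = call i64 @llvm.ctpop.i64(i64 %ballot)
//   ; exactly one lane performs: atomicrmw add ptr %p, i32 (%v * %count)
//   ; every lane then recovers its own "old" value as
//   ;   readfirstlane(single atomic result) + %v * mbcnt(active lanes below)
//
// so the wavefront issues one atomic instead of up to 64.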

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}
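
// Example (illustrative): an atomicrmw qualifies only when its pointer operand
// is uniform across the wavefront, e.g.
//
//   %r = atomicrmw add ptr addrspace(3) @lds_counter, i32 %per_lane_val monotonic
//
// is collected (uniform LDS pointer, possibly divergent value), whereas
//
//   %r = atomicrmw add ptr addrspace(1) %per_lane_ptr, i32 1 monotonic
//
// is skipped because each lane may be updating a different memory location.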

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA.isDivergentUse(I.getOperandUse(Idx)))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}

static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}
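
// For example (illustrative), a signed max is emitted as a plain
// compare-and-select rather than an atomic instruction:
//
//   %cmp = icmp sgt i32 %lhs, %rhs
//   %max = select i1 %cmp, i32 %lhs, i32 %rhs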

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy,
                          {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST.hasPermLaneX16());
  Value *Permlanex16Call =
      B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                        {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                         B.getInt32(0), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST.isWave32()) {
    return V;
  }

  if (ST.hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
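
// Rough shape of the reduction (illustrative) for a wave64 target without
// permlane64: four row_xmask DPP steps combine lanes at distances 1, 2, 4 and
// 8 within each row of 16 lanes, permlanex16 folds the two rows of each
// 32-lane half together, and a final readlane of lanes 0 and 32 combines the
// two halves, after which every lane holds the full wavefront reduction.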

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST.hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST.hasPermLaneX16());
    Value *PermX =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                          {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
                           B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST.isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}
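
// Worked example (illustrative, one row of 16 lanes, integer add): starting
// from per-lane values v0..v15, the row_shr 1/2/4/8 steps produce
//
//   after shr 1: pairwise sums           (lane n holds v[n-1] + v[n])
//   after shr 2: prefix sums over 4 lanes
//   after shr 4: prefix sums over 8 lanes
//   after shr 8: lane n holds v0 + ... + vn for every n in the row
//
// i.e. an inclusive prefix sum within the row; the broadcast/permlane steps
// then propagate row totals to the higher rows of the wavefront.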

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST.hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST.isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}
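
// Example (illustrative): with inclusive sums s0, s1, s2, ... in lanes
// 0, 1, 2, ..., the wavefront shift-right-by-one leaves
//
//   lane 0: Identity, lane 1: s0, lane 2: s1, ...
//
// which is exactly the exclusive scan each lane needs as its offset into the
// combined atomic result.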

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This is needed when DPP operations are
// not available, and it works for both wave32 and wave64.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the exclusive-scan output, and the
  // still-to-be-processed lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform writelane if intermediate scan results are required later in the
  // kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the lane that has just been processed so the loop moves
  // on to the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}
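
// Sketch of the generated control flow (illustrative):
//
//   entry:        %ballot = ballot(true); br label %ComputeLoop
//   ComputeLoop:  pick the lowest set bit of the remaining-lanes mask (cttz),
//                 readlane that lane's value, writelane the running total into
//                 that lane's slot of the exclusive-scan result, accumulate,
//                 clear the bit, loop until the mask is zero
//   ComputeEnd:   the accumulator holds the full wavefront reduction
//
// The loop runs once per active lane, so it is slower than the DPP path but
// only relies on readlane/writelane support.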

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // atomicrmw fmin/fmax behave like llvm.minnum/maxnum, so NaN is the
    // closest thing they have to an identity.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

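// Identity values used above, for reference (illustrative summary): add, sub,
// or, xor and umax use 0; and and umin use ~0; smax uses the signed minimum;
// smin uses the signed maximum; fadd uses -0.0; fsub uses +0.0; and fmin/fmax
// fall back to NaN, since minnum/maxnum ignore a NaN input when the other
// operand is a number.
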
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator->getIterator());
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  const bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST.isWave32()) {
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo,
                              {ExtractLo, B.getInt32(0)});
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with an add operation and let a single
  // lane subtract the reduced value afterwards.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the values
  // using a scan.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value,
      // so that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  //   entry --> single_lane --> exit
  //        \--------------------->^
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. For the iterative strategy we
  // also need to move the branch created by SplitBlockAndInsertIfThen from
  // I's block to the ComputeEnd block, and to remember which block now
  // precedes the single-lane check.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, make sure we
    // move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single_lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront.
    Value *ReadlaneVal = PHI;
    if (TyBitWidth < 32)
      ReadlaneVal = B.CreateZExt(PHI, B.getInt32Ty());

    Value *BroadcastI = B.CreateIntrinsic(
        ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
    if (TyBitWidth < 32)
      BroadcastI = B.CreateTrunc(BroadcastI, Ty);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane to get our lane's share of the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub), but the value we calculated
      // is V * +0.0, which might have the wrong sign or might be NaN (if V is
      // inf or NaN).
      //
      // For all floating point ops, if the in-memory value was a NaN then the
      // binop we just built might return the wrong sign or a different NaN.
      //
      // Because we don't want to have to track this through the whole
      // transformation, in these cases we simply use whatever the first active
      // lane's value was.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}
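
// Resulting IR shape for the DPP path (illustrative, details elided):
//
//   entry:
//     %scan   = ...strict_wwm scan of the per-lane values...
//     %cond   = icmp eq i32 %mbcnt, 0
//     br i1 %cond, label %single_lane, label %exit
//   single_lane:                        ; exactly one lane reaches this block
//     %old = atomicrmw add ptr %p, i32 %wave_total monotonic
//     br label %exit
//   exit:
//     %phi    = phi i32 [ poison, %entry ], [ %old, %single_lane ]
//     %bcast  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %phi)
//     %result = add i32 %bcast, %lane_offset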

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}
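
// Usage note (assumption, based on how the target wires this pass up
// elsewhere): the ScanOptions value handed to createAMDGPUAtomicOptimizerPass
// is normally selected by the AMDGPU target from the
// -amdgpu-atomic-optimizer-strategy option (DPP, Iterative or None), with
// None disabling the pass in run().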