LLVM: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp Source File
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
29#include "llvm/IR/IntrinsicsAMDGPU.h"
38
39#define DEBUG_TYPE "amdgpu-codegenprepare"
40
41using namespace llvm;
43
44namespace {
45
47 "amdgpu-codegenprepare-widen-constant-loads",
48 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51
52static cl::opt<bool>
53 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
54 cl::desc("Break large PHI nodes for DAGISel"),
56
57static cl::opt<bool>
58 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
59 cl::desc("For testing purposes, always break large "
60 "PHIs even if it isn't profitable."),
62
64 "amdgpu-codegenprepare-break-large-phis-threshold",
65 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
67
69 "amdgpu-codegenprepare-mul24",
70 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73
74
76 "amdgpu-codegenprepare-expand-div64",
77 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80
81
82
84 "amdgpu-codegenprepare-disable-idiv-expansion",
85 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88
89
91 "amdgpu-codegenprepare-disable-fdiv-expansion",
92 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95
96class AMDGPUCodeGenPrepareImpl
97 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
98public:
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
112
114
119 DT(DT), UA(UA), DL(F.getDataLayout()),
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
128 SqrtF32 = Intrinsic::getOrInsertDeclaration(
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
138 LdexpF32 = Intrinsic::getOrInsertDeclaration(
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146
147 bool isLegalFloatingTy(const Type *T) const;
148
149
153 }
154
155 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
156 return HasFP32DenormalFlush ||
158 }
159
160
161
162
163 unsigned numBitsUnsigned(Value *Op) const;
164
165
166
167
168 unsigned numBitsSigned(Value *Op) const;
169
170
171
173
174
175
177
181 unsigned MaxDivBits, bool Signed) const;
182
183
186 bool IsDiv, bool IsSigned) const;
187
189 Value *Num, Value *Den, unsigned NumBits,
190 bool IsDiv, bool IsSigned) const;
191
192
195
199
200
201
202
203
204
205
206
207
208 bool canWidenScalarExtLoad(LoadInst &I) const;
209
212
215
219
223 float ReqdAccuracy) const;
224
228 float ReqdAccuracy) const;
229
230 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
231 Value *Src) const;
232
234 bool IsNegative) const;
239
240 bool tryNarrowMathIfNoOverflow(Instruction *I);
241
242public:
244
245 bool visitInstruction(Instruction &I) { return false; }
249 bool visitPHINode(PHINode &I);
251
255 bool run();
256};
257
258class AMDGPUCodeGenPrepare : public FunctionPass {
259public:
260 static char ID;
262 void getAnalysisUsage(AnalysisUsage &AU) const override {
266
267
268 if (!ExpandDiv64InIR)
270 }
272 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
273};
274
275}
276
277bool AMDGPUCodeGenPrepareImpl::run() {
278 BreakPhiNodesCache.clear();
279 bool MadeChange = false;
280
281
282
283
284 for (BasicBlock &BB : reverse(F)) {
288 }
289 }
290
291 while (!DeadVals.empty()) {
294 }
295
296 return MadeChange;
297}
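// Driver note: the visitors below never erase instructions directly; they
// push replaced instructions onto DeadVals, and the loop above then deletes
// whatever ends up trivially dead once the whole function has been walked,
// which keeps iterator invalidation out of the per-block visit loop.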
298
299bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
300 return Ty->isFloatTy() || Ty->isDoubleTy() ||
301 (Ty->isHalfTy() && ST.has16BitInsts());
302}
303
304bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
305 Type *Ty = I.getType();
306 int TySize = DL.getTypeSizeInBits(Ty);
307 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
308
309 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
310}
311
312unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
313 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
314}
315
316unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
317 return ComputeMaxSignificantBits(Op, DL, AC);
318}
319
323 if (!VT) {
325 return;
326 }
327
328 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
329 Values.push_back(Builder.CreateExtractElement(V, I));
330}
331
335 if (!Ty->isVectorTy()) {
337 return Values[0];
338 }
339
341 for (int I = 0, E = Values.size(); I != E; ++I)
342 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
343
344 return NewVal;
345}
346
347bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
348 if (I.getOpcode() != Instruction::Mul)
349 return false;
350
354 return false;
355
356
358 return false;
359
363 Builder.SetCurrentDebugLocation(I.getDebugLoc());
364
365 unsigned LHSBits = 0, RHSBits = 0;
366 bool IsSigned = false;
367
368 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
369 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
370 IsSigned = false;
371
372 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
373 (RHSBits = numBitsSigned(RHS)) <= 24) {
374 IsSigned = true;
375
376 } else
377 return false;
378
384
385 IntegerType *I32Ty = Builder.getInt32Ty();
386 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
387 Type *DstTy = LHSVals[0]->getType();
388
389 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
390 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
391 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
392 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
393 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
395 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
397 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
398 : Builder.CreateZExtOrTrunc(Result, DstTy);
400 }
401
404 I.replaceAllUsesWith(NewVal);
405 DeadVals.push_back(&I);
406
407 return true;
408}
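// Illustrative effect of the transform above (example IR only; the value
// names are invented, not taken from a real test):
//   %r = mul i32 %a, %b            ; both operands known to fit in 24 bits
// becomes
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
// Narrower and wider types are handled by zext/sext'ing the operands to i32
// first and trunc/extending the (i32 or i64) intrinsic result back.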
409
410
411
413 Cast = nullptr;
415 return Sel;
416
419 return Sel;
420 }
421
422 return nullptr;
423}
424
425bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
426
427
428 int SelOpNo = 0;
429
430 CastInst *CastOp;
431
432
433
436 SelOpNo = 1;
438 }
439
441 return false;
442
446 if (!CBO || !CT || !CF)
447 return false;
448
449 if (CastOp) {
451 return false;
454 }
455
456
457
462 return false;
463
468 return false;
469
471 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
473 Builder.setFastMathFlags(FPOp->getFastMathFlags());
474
476 FoldedT, FoldedF);
479 DeadVals.push_back(&BO);
480 if (CastOp)
481 DeadVals.push_back(CastOp);
482 DeadVals.push_back(Sel);
483 return true;
484}
485
486std::pair<Value *, Value *>
487AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
488 Value *Src) const {
489 Type *Ty = Src->getType();
493
494
495
496
497
498 Value *FrexpExp =
502 : Builder.CreateExtractValue(Frexp, {1});
503 return {FrexpMant, FrexpExp};
504}
505
506
507Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
509 bool IsNegative) const {
510
511
512 if (IsNegative)
514
515
516
517
518
519
520
521
522
523
524 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
527 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
528}
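// In effect, 1.0 / x (or -1.0 / x when IsNegative) is computed above as
//   {mant, exp} = frexp(x);  ldexp(amdgcn.rcp(mant), -exp)
// so the hardware rcp only ever sees a mantissa in [0.5, 1.0) and the
// exponent is reapplied exactly by ldexp, which is what makes the expansion
// safe for denormal inputs.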
529
530
533 FastMathFlags FMF) const {
534
535
536
539 return nullptr;
540
541
542
543 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
544
547
548 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
550
551
552
553 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
554 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
555}
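// Rough shape of the expansion above: with a = ma * 2^ea and b = mb * 2^eb
// (both mantissas from frexp, so in [0.5, 1.0)),
//   a / b ~= ldexp(ma * amdgcn.rcp(mb), ea - eb)
// which keeps the rcp and the multiply away from denormal values and
// reapplies the exponent difference exactly via ldexp.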
556
557
558Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
560 FastMathFlags FMF) const {
561 Type *Ty = Src->getType();
564 Value *NeedScale =
565 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
566
568 Value *InputScaleFactor =
570
572
574
575 Value *OutputScaleFactor =
577 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
578}
579
580
582 bool IsNegative) {
583
584
585
586
587
588 Type *Ty = Src->getType();
591 Value *NeedScale =
592 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
593 Constant *One = ConstantFP::get(Ty, 1.0);
594 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
596 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
597
598 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
599
600 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
601 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
602 Value *OutputScaleFactor = Builder.CreateSelect(
603 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
604
605 return Builder.CreateFMul(Rsq, OutputScaleFactor);
606}
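// The scaling above compensates for rsq not handling denormal inputs: for
// x below the smallest normal, rsq(x * 2^24) equals rsq(x) * 2^-12, so the
// result is multiplied back by 2^12 (or -2^12 for the -1/sqrt(x) form).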
607
608bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
609 FastMathFlags DivFMF,
610 FastMathFlags SqrtFMF) const {
611
613 return false;
614
615
617}
618
619Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
621 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
622
624
625
626
627
629 if (!CLHS)
630 return nullptr;
631
633
634 bool IsNegative = false;
635
636
638
639 IRBuilder<>::FastMathFlagGuard Guard(Builder);
641
643 canIgnoreDenormalInput(Den, CtxI)) {
645
647 }
648
650 }
651
652 return nullptr;
653}
654
655
656
657
658
659
660
662AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
663 Value *Den, FastMathFlags FMF,
664 const Instruction *CtxI) const {
665
666
667
669
671 bool IsNegative = false;
674 Value *Src = Den;
675
676 if (HasFP32DenormalFlush || FMF.approxFunc()) {
677
678 if (IsNegative)
680
681
682
683
684
685
686
687
688
689
690
691
693 }
694
695
696
697 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
698 }
699 }
700
702
703
704
705
706 if (HasFP32DenormalFlush || FMF.approxFunc()) {
708 return Builder.CreateFMul(Num, Recip);
709 }
710
711 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
712 return Builder.CreateFMul(Num, Recip);
713 }
714
715 return nullptr;
716}
717
718
719
720
721
722
723
724
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
727
728 if (ReqdAccuracy < 2.5f)
729 return nullptr;
730
731
733
734 bool NumIsOne = false;
736 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
737 NumIsOne = true;
738 }
739
740
741
742
743
744 if (!HasFP32DenormalFlush && !NumIsOne)
745 return nullptr;
746
747 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
748}
749
750Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
752 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
753 float ReqdDivAccuracy) const {
754 if (RsqOp) {
756 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
757 if (Rsq)
758 return Rsq;
759 }
760
761 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
762 if (Rcp)
763 return Rcp;
764
765
766
767
768
769 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
770 if (FDivFast)
771 return FDivFast;
772
773 return emitFrexpDiv(Builder, Num, Den, DivFMF);
774}
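// Strategy order per element, as implemented above: amdgcn.rsq for
// 1/sqrt(x) shapes, then an rcp-based expansion, then amdgcn.fdiv.fast when
// 2.5 ulp of error is acceptable, and finally the frexp/ldexp expansion as
// the general fallback.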
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
792 if (DisableFDivExpand)
793 return false;
794
797 return false;
798
799
800
801
804 const float ReqdAccuracy = FPOp->getFPAccuracy();
805
806 FastMathFlags SqrtFMF;
807
810
811 Value *RsqOp = nullptr;
813 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
814 DenII->hasOneUse()) {
817 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
819 }
820
821
822
823
824
825
826
827
828
829
830 const bool AllowInaccurateRcp = DivFMF.approxFunc();
831 if (!RsqOp && AllowInaccurateRcp)
832 return false;
833
834
835 if (ReqdAccuracy < 1.0f)
836 return false;
837
841
847
848 if (RsqOp)
850
852 for (int I = 0, E = NumVals.size(); I != E; ++I) {
853 Value *NumElt = NumVals[I];
854 Value *DenElt = DenVals[I];
855 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
856
858 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
860 if (!NewElt) {
861
862
863
864
865 NewElt = Builder.CreateFDiv(NumElt, DenElt);
867 NewEltInst->copyMetadata(FDiv);
868 }
869
870 ResultVals[I] = NewElt;
871 }
872
874
875 if (NewVal) {
878 DeadVals.push_back(&FDiv);
879 }
880
881 return true;
882}
883
884static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS,
885 Value *RHS) {
886 Type *I32Ty = Builder.getInt32Ty();
887 Type *I64Ty = Builder.getInt64Ty();
888
889 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
890 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
891 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
892 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
893 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
894 Hi = Builder.CreateTrunc(Hi, I32Ty);
895 return std::pair(Lo, Hi);
896}
897
901
902
903
904
905
906unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
908 unsigned MaxDivBits,
909 bool IsSigned) const {
913 if (IsSigned) {
915
916 unsigned DivBits = SSBits - RHSSignBits + 1;
917 if (DivBits > MaxDivBits)
918 return SSBits;
919
921
922 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
923 DivBits = SSBits - SignBits + 1;
924 return DivBits;
925 }
926
927
928
931 return SSBits;
933 unsigned DivBits = SSBits - RHSSignBits;
934 if (DivBits > MaxDivBits)
935 return SSBits;
936
939 return SSBits;
941
942 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
943 DivBits = SSBits - SignBits;
944 return DivBits;
945}
946
947
948
949Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
950 BinaryOperator &I, Value *Num,
951 Value *Den, bool IsDiv,
952 bool IsSigned) const {
953 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
954 if (DivBits > 24)
955 return nullptr;
956 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
957}
958
959Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
961 unsigned DivBits, bool IsDiv, bool IsSigned) const {
965
967 ConstantInt *One = Builder.getInt32(1);
969
970 if (IsSigned) {
971
972 JQ = Builder.CreateXor(Num, Den);
973
974
976
977
978 JQ = Builder.CreateOr(JQ, One);
979 }
980
981
983
984
986
987
990
991
994
998
999
1002
1003
1005
1006
1008 ? Intrinsic::fma
1011 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1012
1013
1016
1017
1019
1020
1022
1023
1025
1026
1028
1029
1031
1032 Value *Res = Div;
1033 if (!IsDiv) {
1034
1036 Res = Builder.CreateSub(Num, Rem);
1037 }
1038
1039 if (DivBits != 0 && DivBits < 32) {
1040
1041 if (IsSigned) {
1042 int InRegBits = 32 - DivBits;
1043
1044 Res = Builder.CreateShl(Res, InRegBits);
1045 Res = Builder.CreateAShr(Res, InRegBits);
1046 } else {
1047 ConstantInt *TruncMask
1048 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1049 Res = Builder.CreateAnd(Res, TruncMask);
1050 }
1051 }
1052
1053 return Res;
1054}
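// Sketch of the 24-bit expansion above: the quotient is first approximated
// in f32 as fq = trunc(fa * amdgcn.rcp(fb)), then corrected by checking the
// residual fr = fma(-fq, fb, fa) against fb; when |fr| >= |fb| the value JQ
// prepared at the top (+/-1 for the signed case) is added to the integer
// quotient. The result is finally sign-extended or masked back down to
// DivBits bits.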
1055
1056
1057
1058
1059
1060bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1062 Value *Den) const {
1064
1065
1066 if (C->getType()->getScalarSizeInBits() <= 32)
1067 return true;
1068
1069
1070
1071
1072
1073
1075 return true;
1076
1077 return false;
1078 }
1079
1081
1082 if (BinOpDen->getOpcode() == Instruction::Shl &&
1085 return true;
1086 }
1087 }
1088
1089 return false;
1090}
1091
1093
1099 return Builder.CreateAShr(V, Builder.getInt32(31));
1100}
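// getSign32 materializes sign(V) as a 32-bit mask: all-ones for negative
// values and zero otherwise (folding to a constant when known-bits analysis
// already determines the sign), via an arithmetic shift right by 31.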
1101
1102Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1103 BinaryOperator &I, Value *X,
1106 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1107 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1108
1109 FastMathFlags FMF;
1112
1113 if (divHasSpecialOptimization(I, X, Y))
1114 return nullptr;
1115
1116 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1117 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1118
1122
1124 if (IsSigned) {
1127 } else {
1130 }
1131 }
1132
1133 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1136 }
1137
1139 ConstantInt *One = Builder.getInt32(1);
1140
1141 Value *Sign = nullptr;
1142 if (IsSigned) {
1145
1146 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1147
1150
1153 }
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1188 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1192
1193
1197
1198
1201
1202
1204 if (IsDiv)
1207
1208
1211 if (IsDiv)
1213 else
1215
1216 if (IsSigned) {
1217 Res = Builder.CreateXor(Res, Sign);
1218 Res = Builder.CreateSub(Res, Sign);
1220 } else {
1222 }
1223 return Res;
1224}
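// High-level shape of the 32-bit expansion above (several steps sit on the
// elided lines): for the signed case, take absolute values and remember the
// result sign (sign(X) ^ sign(Y) for division, sign(X) for remainder); build
// an approximate reciprocal of Y from uitofp + amdgcn.rcp + fptoui, refine
// it and the quotient with mulhi-based correction steps, compute the
// remainder as X - Q*Y with at most two conditional fix-ups, and finally
// reapply the sign with the xor/sub pair at the end.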
1225
1226Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1227 BinaryOperator &I, Value *Num,
1228 Value *Den) const {
1229 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1230 return nullptr;
1231
1233
1234 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1235 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1236
1237 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1238 if (NumDivBits > 32)
1239 return nullptr;
1240
1241 Value *Narrowed = nullptr;
1242 if (NumDivBits <= 24) {
1243 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1244 IsDiv, IsSigned);
1245 } else if (NumDivBits <= 32) {
1246 Narrowed = expandDivRem32(Builder, I, Num, Den);
1247 }
1248
1249 if (Narrowed) {
1250 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1252 }
1253
1254 return nullptr;
1255}
1256
1257void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1259
1260 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1262 return;
1263 }
1264
1265 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1267 return;
1268 }
1269
1271}
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1286 unsigned Opc = I->getOpcode();
1287 Type *OldType = I->getType();
1288
1289 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1290 return false;
1291
1293
1294 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1295 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1296 "Instruction::Mul.");
1297
1299
1300 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1301 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1302 if (!NewType)
1303 return false;
1305 if (NewBit >= OrigBit)
1306 return false;
1308
1309
1313
1316
1317 int NumOfNonConstOps = 2;
1319
1320 NumOfNonConstOps = 1;
1321 }
1323 NewType, OldType,
1326
1327 NewCost +=
1330 if (NewCost >= OldCost)
1331 return false;
1332
1338
1341 DeadVals.push_back(I);
1342 return true;
1343}
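// Example of the narrowing above: a 64-bit multiply whose operands are known
// (via known-bits) to produce a result that fits in 32 bits is rewritten as
// trunc + 32-bit mul + zext, but only when the TTI cost of the narrow
// operation plus the casts beats the cost of the original wide operation.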
1344
1345bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1346 if (foldBinOpIntoSelect(I))
1347 return true;
1348
1349 if (UseMul24Intrin && replaceMulWithMul24(I))
1350 return true;
1351 if (tryNarrowMathIfNoOverflow(&I))
1352 return true;
1353
1357 Value *NewDiv = nullptr;
1359
1361
1362 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1363 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1364 ScalarSize <= 64 &&
1365 !DisableIDivExpand) {
1366 Value *Num = I.getOperand(0);
1367 Value *Den = I.getOperand(1);
1370
1373
1374 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1377
1379 if (ScalarSize <= 32) {
1380 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1381 if (!NewElt)
1382 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1383 } else {
1384
1385
1386 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1387 if (!NewElt) {
1388
1389
1390
1391 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1392
1393
1395 Div64ToExpand.push_back(NewEltBO);
1396 }
1397 }
1398
1400 NewEltI->copyIRFlags(&I);
1401
1403 }
1404 } else {
1405 if (ScalarSize <= 32)
1406 NewDiv = expandDivRem32(Builder, I, Num, Den);
1407 else {
1408 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1409 if (!NewDiv)
1411 }
1412 }
1413
1414 if (NewDiv) {
1415 I.replaceAllUsesWith(NewDiv);
1416 DeadVals.push_back(&I);
1418 }
1419 }
1420
1421 if (ExpandDiv64InIR) {
1422
1423 for (BinaryOperator *Div : Div64ToExpand) {
1424 expandDivRem64(*Div);
1425 FlowChanged = true;
1427 }
1428 }
1429
1431}
1432
1433bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1435 return false;
1436
1439 canWidenScalarExtLoad(I)) {
1442
1446
1447
1448
1449 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1450 ConstantInt *Lower =
1452
1453 if (Lower->isNullValue()) {
1454 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1455 } else {
1458
1460 };
1461
1462 WidenLoad->setMetadata(LLVMContext::MD_range,
1464 }
1465 }
1466
1467 int TySize = DL.getTypeSizeInBits(I.getType());
1472 DeadVals.push_back(&I);
1473 return true;
1474 }
1475
1476 return false;
1477}
1478
1479bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1484 CmpPredicate Pred;
1485
1486
1488 return false;
1489
1491 if (!FPOp)
1492 return false;
1493
1496
1499
1500 Value *Fract = nullptr;
1501 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1502 CmpVal == matchFractPat(*IIFalse)) {
1503
1504 Fract = applyFractPat(Builder, CmpVal);
1505 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1506 CmpVal == matchFractPat(*IITrue)) {
1507
1508 Fract = applyFractPat(Builder, CmpVal);
1509 } else
1510 return false;
1511
1513 I.replaceAllUsesWith(Fract);
1514 DeadVals.push_back(&I);
1515 return true;
1516}
1517
1518static bool areInSameBB(const Value *A, const Value *B) {
1519 const auto *IA = dyn_cast<Instruction>(A);
1520 const auto *IB = dyn_cast<Instruction>(B);
1521 return IA && IB && IA->getParent() == IB->getParent();
1522}
1523
1524
1525
1528 if (!FVT)
1529 return false;
1530
1531 const Value *CurVal = V;
1532
1533
1534 BitVector EltsCovered(FVT->getNumElements());
1537
1538
1539
1540
1541 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1542 return false;
1543
1544 const auto *VecSrc = IE->getOperand(0);
1545
1546
1547
1548
1550 return false;
1551
1552 CurVal = VecSrc;
1553 EltsCovered.set(Idx->getZExtValue());
1554
1555
1556 if (EltsCovered.all())
1557 return true;
1558 }
1559
1560
1561
1562
1563
1564
1566 return true;
1567
1568
1569
1570
1571
1576 }
1577
1578 return false;
1579}
1580
1583 const auto [It, Inserted] = SeenPHIs.insert(&I);
1584 if (!Inserted)
1585 return;
1586
1587 for (const Value *Inc : I.incoming_values()) {
1590 }
1591
1592 for (const User *U : I.users()) {
1595 }
1596}
1597
1598bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1599
1600 if (const auto It = BreakPhiNodesCache.find(&I);
1601 It != BreakPhiNodesCache.end())
1602 return It->second;
1603
1604
1605
1606
1607
1608
1609
1610 SmallPtrSet<const PHINode *, 8> WorkList;
1611 collectPHINodes(I, WorkList);
1612
1613#ifndef NDEBUG
1614
1615
1616 for (const PHINode *WLP : WorkList) {
1617 assert(BreakPhiNodesCache.count(WLP) == 0);
1618 }
1619#endif
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1633 unsigned NumBreakablePHIs = 0;
1634 bool CanBreak = false;
1635 for (const PHINode *Cur : WorkList) {
1636
1637
1638
1639
1640
1641
1643 if (++NumBreakablePHIs >= Threshold) {
1644 CanBreak = true;
1645 break;
1646 }
1647 }
1648 }
1649
1650 for (const PHINode *Cur : WorkList)
1651 BreakPhiNodesCache[Cur] = CanBreak;
1652
1653 return CanBreak;
1654}
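// Profitability heuristic: the whole web of PHIs connected to I (collected
// above) is treated as a unit. Breaking is only done when roughly two thirds
// of the PHIs in the web have an "interesting" incoming value, i.e. one that
// is expected to be free or folded away (see isInterestingPHIIncomingValue),
// and the verdict is cached for every PHI in the web so the analysis runs
// once per web.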
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669class VectorSlice {
1670public:
1673
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1700 Value *&Res = SlicedVals[{BB, Inc}];
1701 if (Res)
1702 return Res;
1703
1706 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1707
1710 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1711 Mask.push_back(K);
1712 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1713 } else
1714 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1715
1716 return Res;
1717 }
1718
1719private:
1721};
1722
1723bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1735 return false;
1736
1739 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1740 return false;
1741
1742 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1743 return false;
1744
1745 std::vector<VectorSlice> Slices;
1746
1748 {
1749 unsigned Idx = 0;
1750
1751
1752 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1754 if (EltSize == 8 || EltSize == 16) {
1755 const unsigned SubVecSize = (32 / EltSize);
1757 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1758 Idx += SubVecSize)
1759 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1760 }
1761
1762
1763 for (; Idx < NumElts; ++Idx)
1764 Slices.emplace_back(EltTy, Idx, 1);
1765 }
1766
1767 assert(Slices.size() > 1);
1768
1769
1770
1771
1773 B.SetCurrentDebugLocation(I.getDebugLoc());
1774
1775 unsigned IncNameSuffix = 0;
1776 for (VectorSlice &S : Slices) {
1777
1778
1779 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1780 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1781
1782 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1783 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1784 "largephi.extractslice" +
1785 std::to_string(IncNameSuffix++)),
1786 BB);
1787 }
1788 }
1789
1790
1792 unsigned NameSuffix = 0;
1793 for (VectorSlice &S : Slices) {
1794 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1795 if (S.NumElts > 1)
1796 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1797 else
1798 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1799 }
1800
1801 I.replaceAllUsesWith(Vec);
1802 DeadVals.push_back(&I);
1803 return true;
1804}
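// Illustrative outcome (value names invented): a PHI such as
//   %v = phi <4 x float> [ %a, %bb0 ], [ %b, %bb1 ]
// above the size threshold is rebuilt as four scalar float PHIs whose
// incoming values are extractelements in the predecessor blocks and whose
// results are stitched back together with insertelement; 8- and 16-bit
// element types are instead grouped into 32-bit subvector slices using
// shufflevector and insertvector.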
1805
1806
1807
1808
1809
1810
1813
1814
1815
1817 return true;
1818
1819
1820 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1821 return true;
1822
1823
1824 if (const auto *Load = dyn_cast<LoadInst>(V);
1825 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1826 return true;
1827
1828
1829
1831 return false;
1832
1833
1834
1835
1836
1837
1838
1839
1840
1843
1844 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1845 assert((NullVal == 0 || NullVal == -1) &&
1846 "don't know how to check for this null value!");
1847 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1848}
1849
1850bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1851
1852
1853
1854 if (I.getType()->isVectorTy())
1855 return false;
1856
1857
1858
1859 const unsigned SrcAS = I.getSrcAddressSpace();
1860 const unsigned DstAS = I.getDestAddressSpace();
1861
1862 bool CanLower = false;
1869 if (!CanLower)
1870 return false;
1871
1874 if (!all_of(WorkList, [&](const Value *V) {
1876 }))
1877 return false;
1878
1880 auto *Intrin = B.CreateIntrinsic(
1881 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1882 I.replaceAllUsesWith(Intrin);
1883 DeadVals.push_back(&I);
1884 return true;
1885}
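// Casts between the flat address space and local/private are rewritten to
// llvm.amdgcn.addrspacecast.nonnull when every underlying object of the
// source pointer is provably not the (address-space-specific) null value,
// which lets instruction selection drop the null-check compare/select from
// the usual addrspacecast lowering.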
1886
1887bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1888 switch (I.getIntrinsicID()) {
1889 case Intrinsic::minnum:
1890 case Intrinsic::minimumnum:
1891 case Intrinsic::minimum:
1892 return visitFMinLike(I);
1893 case Intrinsic::sqrt:
1894 return visitSqrt(I);
1895 default:
1896 return false;
1897 }
1898}
1899
1900
1901
1902
1903
1904
1905
1906
1907Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
1909 return nullptr;
1910
1912
1913
1914
1915 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
1916 IID != Intrinsic::minimumnum)
1917 return nullptr;
1918
1921 return nullptr;
1922
1923 Value *Arg0 = I.getArgOperand(0);
1924 Value *Arg1 = I.getArgOperand(1);
1925
1928 return nullptr;
1929
1931 bool LosesInfo;
1932 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
1933
1934
1935 One.next(true);
1936 if (One != *C)
1937 return nullptr;
1938
1939 Value *FloorSrc;
1942 return FloorSrc;
1943 return nullptr;
1944}
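// The pattern recognized above is, in IR terms (illustrative only):
//   %f = call float @llvm.floor.f32(float %x)
//   %d = fsub float %x, %f
//   %r = call float @llvm.minnum.f32(float %d, float 0x3FEFFFFFE0000000)
// i.e. min(x - floor(x), nextafter(1.0, 0.0)), which matches what
// llvm.amdgcn.fract computes for non-NaN inputs; the NaN case is handled by
// the callers before the rewrite is applied.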
1945
1946Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
1947 Value *FractArg) {
1950
1952
1954 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
1955 ResultVals[I] =
1956 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
1957 }
1958
1960}
1961
1962bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
1963 Value *FractArg = matchFractPat(I);
1964 if (!FractArg)
1965 return false;
1966
1967
1968
1969 if (!I.hasNoNaNs() &&
 !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
1970 return false;
1971
1973 FastMathFlags FMF = I.getFastMathFlags();
1976
1977 Value *Fract = applyFractPat(Builder, FractArg);
1979 I.replaceAllUsesWith(Fract);
1980 DeadVals.push_back(&I);
1981 return true;
1982}
1983
1984
1985bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
1988 return false;
1989
1992
1993
1994
1996 return false;
1997
1998 const float ReqdAccuracy = FPOp->getFPAccuracy();
1999
2000
2001 if (ReqdAccuracy < 1.0f)
2002 return false;
2003
2005 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2006
2007
2008
2009 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2010 return false;
2011
2015
2017 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2018 if (CanTreatAsDAZ)
2019 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2020 else
2021 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2022 }
2023
2027 DeadVals.push_back(&Sqrt);
2028 return true;
2029}
2030
2031bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2032 if (skipFunction(F))
2033 return false;
2034
2035 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2036 if (!TPC)
2037 return false;
2038
2039 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2040 const TargetLibraryInfo *TLI =
2041 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2042 AssumptionCache *AC =
2043 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2044 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2045 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2046 const UniformityInfo &UA =
2047 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2048 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2049}
2050
2058 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2059 if (!Impl.run())
2062 if (!Impl.FlowChanged)
2064 return PA;
2065}
2066
2068 "AMDGPU IR optimizations", false, false)
2074
2075char AMDGPUCodeGenPrepare::ID = 0;
2076
2077FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2078 return new AMDGPUCodeGenPrepare();
2079}
Return true if it's known this can never be a subnormal.