LLVM: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp Source File
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
29#include "llvm/IR/IntrinsicsAMDGPU.h"
38
39#define DEBUG_TYPE "amdgpu-codegenprepare"
40
41using namespace llvm;
43
44namespace {
45
47 "amdgpu-codegenprepare-widen-constant-loads",
48 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51
52static cl::opt<bool>
53 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
54 cl::desc("Break large PHI nodes for DAGISel"),
56
57static cl::opt<bool>
58 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
59 cl::desc("For testing purposes, always break large "
60 "PHIs even if it isn't profitable."),
62
64 "amdgpu-codegenprepare-break-large-phis-threshold",
65 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
67
69 "amdgpu-codegenprepare-mul24",
70 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73
74
76 "amdgpu-codegenprepare-expand-div64",
77 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80
81
82
84 "amdgpu-codegenprepare-disable-idiv-expansion",
85 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88
89
91 "amdgpu-codegenprepare-disable-fdiv-expansion",
92 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95
96class AMDGPUCodeGenPrepareImpl
97 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
98public:
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
112
114
119 DT(DT), UA(UA), DL(F.getDataLayout()),
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
128 SqrtF32 = Intrinsic::getOrInsertDeclaration(
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
138 LdexpF32 = Intrinsic::getOrInsertDeclaration(
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146
147 bool isLegalFloatingTy(const Type *T) const;
148
149
153 }
154
155 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
156 return HasFP32DenormalFlush ||
158 }
159
160
161
162
163 unsigned numBitsUnsigned(Value *Op) const;
164
165
166
167
168 unsigned numBitsSigned(Value *Op) const;
169
170
171
173
174
175
177
181 unsigned MaxDivBits, bool Signed) const;
182
183
186 bool IsDiv, bool IsSigned) const;
187
189 Value *Num, Value *Den, unsigned NumBits,
190 bool IsDiv, bool IsSigned) const;
191
192
195
199
200
201
202
203
204
205
206
207
208 bool canWidenScalarExtLoad(LoadInst &I) const;
209
212
215
219
223 float ReqdAccuracy) const;
224
228 float ReqdAccuracy) const;
229
230 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
231 Value *Src) const;
232
234 bool IsNegative) const;
239
240 bool tryNarrowMathIfNoOverflow(Instruction *I);
241
242public:
244
245 bool visitInstruction(Instruction &I) { return false; }
249 bool visitPHINode(PHINode &I);
251
255 bool run();
256};
257
258class AMDGPUCodeGenPrepare : public FunctionPass {
259public:
260 static char ID;
262 void getAnalysisUsage(AnalysisUsage &AU) const override {
266
267
268 if (!ExpandDiv64InIR)
270 }
272 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
273};
274
275}
276
277bool AMDGPUCodeGenPrepareImpl::run() {
278 BreakPhiNodesCache.clear();
279 bool MadeChange = false;
280
281
282
283
284 for (BasicBlock &BB : reverse(F)) {
288 }
289 }
290
291 while (!DeadVals.empty()) {
294 }
295
296 return MadeChange;
297}
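// Driver note: the visitors below never erase instructions directly; they
// push replaced instructions onto DeadVals, and the loop above then deletes
// whatever ends up trivially dead once the whole function has been walked,
// which keeps iterator invalidation out of the per-block visit loop.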
298
299bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
300 return Ty->isFloatTy() || Ty->isDoubleTy() ||
301 (Ty->isHalfTy() && ST.has16BitInsts());
302}
303
304bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
305 Type *Ty = I.getType();
306 int TySize = DL.getTypeSizeInBits(Ty);
307 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
308
309 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
310}
311
312unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
313 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
314}
315
316unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
317 return ComputeMaxSignificantBits(Op, DL, AC);
318}
319
323 if (!VT) {
325 return;
326 }
327
328 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
329 Values.push_back(Builder.CreateExtractElement(V, I));
330}
331
335 if (!Ty->isVectorTy()) {
337 return Values[0];
338 }
339
341 for (int I = 0, E = Values.size(); I != E; ++I)
342 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
343
344 return NewVal;
345}
346
347bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
348 if (I.getOpcode() != Instruction::Mul)
349 return false;
350
354 return false;
355
356
358 return false;
359
363 Builder.SetCurrentDebugLocation(I.getDebugLoc());
364
365 unsigned LHSBits = 0, RHSBits = 0;
366 bool IsSigned = false;
367
368 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
369 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
370 IsSigned = false;
371
372 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
373 (RHSBits = numBitsSigned(RHS)) <= 24) {
374 IsSigned = true;
375
376 } else
377 return false;
378
384
385 IntegerType *I32Ty = Builder.getInt32Ty();
386 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
387 Type *DstTy = LHSVals[0]->getType();
388
389 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
390 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
391 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
392 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
393 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
395 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
397 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
398 : Builder.CreateZExtOrTrunc(Result, DstTy);
400 }
401
404 I.replaceAllUsesWith(NewVal);
405 DeadVals.push_back(&I);
406
407 return true;
408}
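// Illustrative effect of the transform above (example IR only; the value
// names are invented, not taken from a real test):
//   %r = mul i32 %a, %b            ; both operands known to fit in 24 bits
// becomes
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
// Narrower and wider types are handled by zext/sext'ing the operands to i32
// first and trunc/extending the (i32 or i64) intrinsic result back.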
409
410
411
413 Cast = nullptr;
415 return Sel;
416
419 return Sel;
420 }
421
422 return nullptr;
423}
424
425bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
426
427
428 int SelOpNo = 0;
429
430 CastInst *CastOp;
431
432
433
436 SelOpNo = 1;
438 }
439
441 return false;
442
446 if (!CBO || !CT || !CF)
447 return false;
448
449 if (CastOp) {
451 return false;
454 }
455
456
457
462 return false;
463
468 return false;
469
471 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
473 Builder.setFastMathFlags(FPOp->getFastMathFlags());
474
476 FoldedT, FoldedF);
479 DeadVals.push_back(&BO);
480 if (CastOp)
481 DeadVals.push_back(CastOp);
482 DeadVals.push_back(Sel);
483 return true;
484}
485
486std::pair<Value *, Value *>
487AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
488 Value *Src) const {
489 Type *Ty = Src->getType();
493
494
495
496
497
498 Value *FrexpExp =
502 : Builder.CreateExtractValue(Frexp, {1});
503 return {FrexpMant, FrexpExp};
504}
505
506
507Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
509 bool IsNegative) const {
510
511
512 if (IsNegative)
514
515
516
517
518
519
520
521
522
523
524 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
527 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
528}
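// In effect, 1.0 / x (or -1.0 / x when IsNegative) is computed above as
//   {mant, exp} = frexp(x);  ldexp(amdgcn.rcp(mant), -exp)
// so the hardware rcp only ever sees a mantissa in [0.5, 1.0) and the
// exponent is reapplied exactly by ldexp, which is what makes the expansion
// safe for denormal inputs.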
529
530
533 FastMathFlags FMF) const {
534
535
536
539 return nullptr;
540
541
542
543 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
544
547
548 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
550
551
552
553 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
554 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
555}
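// Rough shape of the expansion above: with a = ma * 2^ea and b = mb * 2^eb
// (both mantissas from frexp, so in [0.5, 1.0)),
//   a / b ~= ldexp(ma * amdgcn.rcp(mb), ea - eb)
// which keeps the rcp and the multiply away from denormal values and
// reapplies the exponent difference exactly via ldexp.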
556
557
558Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
560 FastMathFlags FMF) const {
561 Type *Ty = Src->getType();
564 Value *NeedScale =
565 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
566
568 Value *InputScaleFactor =
570
572
574
575 Value *OutputScaleFactor =
577 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
578}
579
580
582 bool IsNegative) {
583
584
585
586
587
588 Type *Ty = Src->getType();
591 Value *NeedScale =
592 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
593 Constant *One = ConstantFP::get(Ty, 1.0);
594 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
596 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
597
598 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
599
600 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
601 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
602 Value *OutputScaleFactor = Builder.CreateSelect(
603 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
604
605 return Builder.CreateFMul(Rsq, OutputScaleFactor);
606}
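// The scaling above compensates for rsq not handling denormal inputs: for
// x below the smallest normal, rsq(x * 2^24) equals rsq(x) * 2^-12, so the
// result is multiplied back by 2^12 (or -2^12 for the -1/sqrt(x) form).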
607
608bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
609 FastMathFlags DivFMF,
610 FastMathFlags SqrtFMF) const {
611
613 return false;
614
615
617}
618
619Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
621 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
622
624
625
626
627
629 if (!CLHS)
630 return nullptr;
631
633
634 bool IsNegative = false;
635
636
638
639 IRBuilder<>::FastMathFlagGuard Guard(Builder);
641
643 canIgnoreDenormalInput(Den, CtxI)) {
645
647 }
648
650 }
651
652 return nullptr;
653}
654
655
656
657
658
659
660
662AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
663 Value *Den, FastMathFlags FMF,
664 const Instruction *CtxI) const {
665
666
667
669
671 bool IsNegative = false;
674 Value *Src = Den;
675
676 if (HasFP32DenormalFlush || FMF.approxFunc()) {
677
678 if (IsNegative)
680
681
682
683
684
685
686
687
688
689
690
691
693 }
694
695
696
697 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
698 }
699 }
700
702
703
704
705
706 if (HasFP32DenormalFlush || FMF.approxFunc()) {
708 return Builder.CreateFMul(Num, Recip);
709 }
710
711 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
712 return Builder.CreateFMul(Num, Recip);
713 }
714
715 return nullptr;
716}
717
718
719
720
721
722
723
724
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
727
728 if (ReqdAccuracy < 2.5f)
729 return nullptr;
730
731
733
734 bool NumIsOne = false;
736 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
737 NumIsOne = true;
738 }
739
740
741
742
743
744 if (!HasFP32DenormalFlush && !NumIsOne)
745 return nullptr;
746
747 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
748}
749
750Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
752 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
753 float ReqdDivAccuracy) const {
754 if (RsqOp) {
756 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
757 if (Rsq)
758 return Rsq;
759 }
760
761 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
762 if (Rcp)
763 return Rcp;
764
765
766
767
768
769 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
770 if (FDivFast)
771 return FDivFast;
772
773 return emitFrexpDiv(Builder, Num, Den, DivFMF);
774}
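// Strategy order per element, as implemented above: amdgcn.rsq for
// 1/sqrt(x) shapes, then an rcp-based expansion, then amdgcn.fdiv.fast when
// 2.5 ulp of error is acceptable, and finally the frexp/ldexp expansion as
// the general fallback.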
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
792 if (DisableFDivExpand)
793 return false;
794
797 return false;
798
799
800
801
804 const float ReqdAccuracy = FPOp->getFPAccuracy();
805
806 FastMathFlags SqrtFMF;
807
810
811 Value *RsqOp = nullptr;
813 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
814 DenII->hasOneUse()) {
817 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
819 }
820
821
822
823
824
825
826
827
828
829
830 const bool AllowInaccurateRcp = DivFMF.approxFunc();
831 if (!RsqOp && AllowInaccurateRcp)
832 return false;
833
834
835 if (ReqdAccuracy < 1.0f)
836 return false;
837
841
847
848 if (RsqOp)
850
852 for (int I = 0, E = NumVals.size(); I != E; ++I) {
853 Value *NumElt = NumVals[I];
854 Value *DenElt = DenVals[I];
855 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
856
858 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
860 if (!NewElt) {
861
862
863
864
865 NewElt = Builder.CreateFDiv(NumElt, DenElt);
867 NewEltInst->copyMetadata(FDiv);
868 }
869
870 ResultVals[I] = NewElt;
871 }
872
874
875 if (NewVal) {
878 DeadVals.push_back(&FDiv);
879 }
880
881 return true;
882}
883
884static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS,
885 Value *RHS) {
886 Type *I32Ty = Builder.getInt32Ty();
887 Type *I64Ty = Builder.getInt64Ty();
888
889 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
890 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
891 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
892 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
893 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
894 Hi = Builder.CreateTrunc(Hi, I32Ty);
895 return std::pair(Lo, Hi);
896}
897
901
902
903
904
905
906unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
908 unsigned MaxDivBits,
909 bool IsSigned) const {
913 if (IsSigned) {
915
916 unsigned DivBits = SSBits - RHSSignBits + 1;
917 if (DivBits > MaxDivBits)
918 return SSBits;
919
921
922 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
923 DivBits = SSBits - SignBits + 1;
924 return DivBits;
925 }
926
927
928
931 return SSBits;
933 unsigned DivBits = SSBits - RHSSignBits;
934 if (DivBits > MaxDivBits)
935 return SSBits;
936
939 return SSBits;
941
942 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
943 DivBits = SSBits - SignBits;
944 return DivBits;
945}
946
947
948
949Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
950 BinaryOperator &I, Value *Num,
951 Value *Den, bool IsDiv,
952 bool IsSigned) const {
953 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
954 if (DivBits > 24)
955 return nullptr;
956 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
957}
958
959Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
961 unsigned DivBits, bool IsDiv, bool IsSigned) const {
965
967 ConstantInt *One = Builder.getInt32(1);
969
970 if (IsSigned) {
971
972 JQ = Builder.CreateXor(Num, Den);
973
974
976
977
978 JQ = Builder.CreateOr(JQ, One);
979 }
980
981
983
984
986
987
990
991
994
998
999
1002
1003
1005
1006
1008 ? Intrinsic::fma
1011 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1012
1013
1016
1017
1019
1020
1022
1023
1025
1026
1028
1029
1031
1032 Value *Res = Div;
1033 if (!IsDiv) {
1034
1036 Res = Builder.CreateSub(Num, Rem);
1037 }
1038
1039 if (DivBits != 0 && DivBits < 32) {
1040
1041 if (IsSigned) {
1042 int InRegBits = 32 - DivBits;
1043
1044 Res = Builder.CreateShl(Res, InRegBits);
1045 Res = Builder.CreateAShr(Res, InRegBits);
1046 } else {
1047 ConstantInt *TruncMask
1048 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1049 Res = Builder.CreateAnd(Res, TruncMask);
1050 }
1051 }
1052
1053 return Res;
1054}
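// Sketch of the 24-bit expansion above: the quotient is first approximated
// in f32 as fq = trunc(fa * amdgcn.rcp(fb)), then corrected by checking the
// residual fr = fma(-fq, fb, fa) against fb; when |fr| >= |fb| the value JQ
// prepared at the top (+/-1 for the signed case) is added to the integer
// quotient. The result is finally sign-extended or masked back down to
// DivBits bits.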
1055
1056
1057
1058
1059
1060bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1062 Value *Den) const {
1064
1065
1066 if (C->getType()->getScalarSizeInBits() <= 32)
1067 return true;
1068
1069
1070
1071
1072
1073
1075 return true;
1076
1077 return false;
1078 }
1079
1081
1082 if (BinOpDen->getOpcode() == Instruction::Shl &&
1085 return true;
1086 }
1087 }
1088
1089 return false;
1090}
1091
1093
1099 return Builder.CreateAShr(V, Builder.getInt32(31));
1100}
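// getSign32 materializes sign(V) as a 32-bit mask: all-ones for negative
// values and zero otherwise (folding to a constant when known-bits analysis
// already determines the sign), via an arithmetic shift right by 31.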
1101
1102Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1103 BinaryOperator &I, Value *X,
1106 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1107 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1108
1109 FastMathFlags FMF;
1112
1113 if (divHasSpecialOptimization(I, X, Y))
1114 return nullptr;
1115
1116 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1117 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1118
1122
1124 if (IsSigned) {
1127 } else {
1130 }
1131 }
1132
1133 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1136 }
1137
1139 ConstantInt *One = Builder.getInt32(1);
1140
1141 Value *Sign = nullptr;
1142 if (IsSigned) {
1145
1146 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1147
1150
1153 }
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1188 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1192
1193
1197
1198
1201
1202
1204 if (IsDiv)
1207
1208
1211 if (IsDiv)
1213 else
1215
1216 if (IsSigned) {
1217 Res = Builder.CreateXor(Res, Sign);
1218 Res = Builder.CreateSub(Res, Sign);
1220 } else {
1222 }
1223 return Res;
1224}
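// High-level shape of the 32-bit expansion above (several steps sit on the
// elided lines): for the signed case, take absolute values and remember the
// result sign (sign(X) ^ sign(Y) for division, sign(X) for remainder); build
// an approximate reciprocal of Y from uitofp + amdgcn.rcp + fptoui, refine
// it and the quotient with mulhi-based correction steps, compute the
// remainder as X - Q*Y with at most two conditional fix-ups, and finally
// reapply the sign with the xor/sub pair at the end.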
1225
1226Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1227 BinaryOperator &I, Value *Num,
1228 Value *Den) const {
1229 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1230 return nullptr;
1231
1233
1234 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1235 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1236
1237 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1238 if (NumDivBits > 32)
1239 return nullptr;
1240
1241 Value *Narrowed = nullptr;
1242 if (NumDivBits <= 24) {
1243 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1244 IsDiv, IsSigned);
1245 } else if (NumDivBits <= 32) {
1246 Narrowed = expandDivRem32(Builder, I, Num, Den);
1247 }
1248
1249 if (Narrowed) {
1250 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1252 }
1253
1254 return nullptr;
1255}
1256
1257void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1259
1260 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1262 return;
1263 }
1264
1265 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1267 return;
1268 }
1269
1271}
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1286 unsigned Opc = I->getOpcode();
1287 Type *OldType = I->getType();
1288
1289 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1290 return false;
1291
1293
1294 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1295 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1296 "Instruction::Mul.");
1297
1299
1300 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1301 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1302 if (!NewType)
1303 return false;
1305 if (NewBit >= OrigBit)
1306 return false;
1308
1309
1313
1316
1317 int NumOfNonConstOps = 2;
1319
1320 NumOfNonConstOps = 1;
1321 }
1323 NewType, OldType,
1326
1327 NewCost +=
1330 if (NewCost >= OldCost)
1331 return false;
1332
1338
1341 DeadVals.push_back(I);
1342 return true;
1343}
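// Example of the narrowing above: a 64-bit multiply whose operands are known
// (via known-bits) to produce a result that fits in 32 bits is rewritten as
// trunc + 32-bit mul + zext, but only when the TTI cost of the narrow
// operation plus the casts beats the cost of the original wide operation.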
1344
1345bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1346 if (foldBinOpIntoSelect(I))
1347 return true;
1348
1349 if (UseMul24Intrin && replaceMulWithMul24(I))
1350 return true;
1351 if (tryNarrowMathIfNoOverflow(&I))
1352 return true;
1353
1357 Value *NewDiv = nullptr;
1359
1361
1362 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1363 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1364 ScalarSize <= 64 &&
1365 !DisableIDivExpand) {
1366 Value *Num = I.getOperand(0);
1367 Value *Den = I.getOperand(1);
1370
1373
1374 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1377
1379 if (ScalarSize <= 32) {
1380 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1381 if (!NewElt)
1382 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1383 } else {
1384
1385
1386 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1387 if (!NewElt) {
1388
1389
1390
1391 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1392
1393
1395 Div64ToExpand.push_back(NewEltBO);
1396 }
1397 }
1398
1400 NewEltI->copyIRFlags(&I);
1401
1403 }
1404 } else {
1405 if (ScalarSize <= 32)
1406 NewDiv = expandDivRem32(Builder, I, Num, Den);
1407 else {
1408 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1409 if (!NewDiv)
1411 }
1412 }
1413
1414 if (NewDiv) {
1415 I.replaceAllUsesWith(NewDiv);
1416 DeadVals.push_back(&I);
1418 }
1419 }
1420
1421 if (ExpandDiv64InIR) {
1422
1423 for (BinaryOperator *Div : Div64ToExpand) {
1424 expandDivRem64(*Div);
1425 FlowChanged = true;
1427 }
1428 }
1429
1431}
1432
1433bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1435 return false;
1436
1439 canWidenScalarExtLoad(I)) {
1442
1446
1447
1448
1449 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1450 ConstantInt *Lower =
1452
1453 if (Lower->isNullValue()) {
1454 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1455 } else {
1458
1460 };
1461
1462 WidenLoad->setMetadata(LLVMContext::MD_range,
1464 }
1465 }
1466
1467 int TySize = DL.getTypeSizeInBits(I.getType());
1472 DeadVals.push_back(&I);
1473 return true;
1474 }
1475
1476 return false;
1477}
1478
1479bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1484 CmpPredicate Pred;
1485
1486
1488 return false;
1489
1491 if (!FPOp)
1492 return false;
1493
1496
1499
1500 Value *Fract = nullptr;
1501 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1502 CmpVal == matchFractPat(*IIFalse)) {
1503
1504 Fract = applyFractPat(Builder, CmpVal);
1505 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1506 CmpVal == matchFractPat(*IITrue)) {
1507
1508 Fract = applyFractPat(Builder, CmpVal);
1509 } else
1510 return false;
1511
1513 I.replaceAllUsesWith(Fract);
1514 DeadVals.push_back(&I);
1515 return true;
1516}
1517
1518static bool areInSameBB(const Value *A, const Value *B) {
1519 const auto *IA = dyn_cast<Instruction>(A);
1520 const auto *IB = dyn_cast<Instruction>(B);
1521 return IA && IB && IA->getParent() == IB->getParent();
1522}
1523
1524
1525
1528 if (!FVT)
1529 return false;
1530
1531 const Value *CurVal = V;
1532
1533
1534 BitVector EltsCovered(FVT->getNumElements());
1537
1538
1539
1540
1541 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1542 return false;
1543
1544 const auto *VecSrc = IE->getOperand(0);
1545
1546
1547
1548
1550 return false;
1551
1552 CurVal = VecSrc;
1553 EltsCovered.set(Idx->getZExtValue());
1554
1555
1556 if (EltsCovered.all())
1557 return true;
1558 }
1559
1560
1561
1562
1563
1564
1566 return true;
1567
1568
1569
1570
1571
1576 }
1577
1578 return false;
1579}
1580
1583 const auto [It, Inserted] = SeenPHIs.insert(&I);
1584 if (!Inserted)
1585 return;
1586
1587 for (const Value *Inc : I.incoming_values()) {
1590 }
1591
1592 for (const User *U : I.users()) {
1595 }
1596}
1597
1598bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1599
1600 if (const auto It = BreakPhiNodesCache.find(&I);
1601 It != BreakPhiNodesCache.end())
1602 return It->second;
1603
1604
1605
1606
1607
1608
1609
1610 SmallPtrSet<const PHINode *, 8> WorkList;
1611 collectPHINodes(I, WorkList);
1612
1613#ifndef NDEBUG
1614
1615
1616 for (const PHINode *WLP : WorkList) {
1617 assert(BreakPhiNodesCache.count(WLP) == 0);
1618 }
1619#endif
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1633 unsigned NumBreakablePHIs = 0;
1634 bool CanBreak = false;
1635 for (const PHINode *Cur : WorkList) {
1636
1637
1638
1639
1640
1641
1643 if (++NumBreakablePHIs >= Threshold) {
1644 CanBreak = true;
1645 break;
1646 }
1647 }
1648 }
1649
1650 for (const PHINode *Cur : WorkList)
1651 BreakPhiNodesCache[Cur] = CanBreak;
1652
1653 return CanBreak;
1654}
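// Profitability heuristic: the whole web of PHIs connected to I (collected
// above) is treated as a unit. Breaking is only done when roughly two thirds
// of the PHIs in the web have an "interesting" incoming value, i.e. one that
// is expected to be free or folded away (see isInterestingPHIIncomingValue),
// and the verdict is cached for every PHI in the web so the analysis runs
// once per web.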
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669class VectorSlice {
1670public:
1673
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1700 Value *&Res = SlicedVals[{BB, Inc}];
1701 if (Res)
1702 return Res;
1703
1706 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1707
1710 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1711 Mask.push_back(K);
1712 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1713 } else
1714 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1715
1716 return Res;
1717 }
1718
1719private:
1721};
1722
1723bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1735 return false;
1736
1739 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1740 return false;
1741
1742 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1743 return false;
1744
1745 std::vector<VectorSlice> Slices;
1746
1748 {
1749 unsigned Idx = 0;
1750
1751
1752 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1754 if (EltSize == 8 || EltSize == 16) {
1755 const unsigned SubVecSize = (32 / EltSize);
1757 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1758 Idx += SubVecSize)
1759 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1760 }
1761
1762
1763 for (; Idx < NumElts; ++Idx)
1764 Slices.emplace_back(EltTy, Idx, 1);
1765 }
1766
1767 assert(Slices.size() > 1);
1768
1769
1770
1771
1773 B.SetCurrentDebugLocation(I.getDebugLoc());
1774
1775 unsigned IncNameSuffix = 0;
1776 for (VectorSlice &S : Slices) {
1777
1778
1779 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1780 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1781
1782 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1783 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1784 "largephi.extractslice" +
1785 std::to_string(IncNameSuffix++)),
1786 BB);
1787 }
1788 }
1789
1790
1792 unsigned NameSuffix = 0;
1793 for (VectorSlice &S : Slices) {
1794 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1795 if (S.NumElts > 1)
1796 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1797 else
1798 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1799 }
1800
1801 I.replaceAllUsesWith(Vec);
1802 DeadVals.push_back(&I);
1803 return true;
1804}
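// Illustrative outcome (value names invented): a PHI such as
//   %v = phi <4 x float> [ %a, %bb0 ], [ %b, %bb1 ]
// above the size threshold is rebuilt as four scalar float PHIs whose
// incoming values are extractelements in the predecessor blocks and whose
// results are stitched back together with insertelement; 8- and 16-bit
// element types are instead grouped into 32-bit subvector slices using
// shufflevector and insertvector.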
1805
1806
1807
1808
1809
1810
1813
1814
1815
1817 return true;
1818
1819
1820 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1821 return true;
1822
1823
1824 if (const auto *Load = dyn_cast<LoadInst>(V);
1825 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1826 return true;
1827
1828
1829
1831 return false;
1832
1833
1834
1835
1836
1837
1838
1839
1840
1843
1844 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1845 assert((NullVal == 0 || NullVal == -1) &&
1846 "don't know how to check for this null value!");
1847 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1848}
1849
1850bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1851
1852
1853
1854 if (I.getType()->isVectorTy())
1855 return false;
1856
1857
1858
1859 const unsigned SrcAS = I.getSrcAddressSpace();
1860 const unsigned DstAS = I.getDestAddressSpace();
1861
1862 bool CanLower = false;
1869 if (!CanLower)
1870 return false;
1871
1874 if (!all_of(WorkList, [&](const Value *V) {
1876 }))
1877 return false;
1878
1880 auto *Intrin = B.CreateIntrinsic(
1881 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1882 I.replaceAllUsesWith(Intrin);
1883 DeadVals.push_back(&I);
1884 return true;
1885}
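// Casts between the flat address space and local/private are rewritten to
// llvm.amdgcn.addrspacecast.nonnull when every underlying object of the
// source pointer is provably not the (address-space-specific) null value,
// which lets instruction selection drop the null-check compare/select from
// the usual addrspacecast lowering.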
1886
1887bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1888 switch (I.getIntrinsicID()) {
1889 case Intrinsic::minnum:
1890 case Intrinsic::minimumnum:
1891 case Intrinsic::minimum:
1892 return visitFMinLike(I);
1893 case Intrinsic::sqrt:
1894 return visitSqrt(I);
1895 default:
1896 return false;
1897 }
1898}
1899
1900
1901
1902
1903
1904
1905
1906
1907Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
1909 return nullptr;
1910
1912
1913
1914
1915 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
1916 IID != Intrinsic::minimumnum)
1917 return nullptr;
1918
1921 return nullptr;
1922
1923 Value *Arg0 = I.getArgOperand(0);
1924 Value *Arg1 = I.getArgOperand(1);
1925
1928 return nullptr;
1929
1931 bool LosesInfo;
1932 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
1933
1934
1935 One.next(true);
1936 if (One != *C)
1937 return nullptr;
1938
1939 Value *FloorSrc;
1942 return FloorSrc;
1943 return nullptr;
1944}
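// The pattern recognized above is, in IR terms (illustrative only):
//   %f = call float @llvm.floor.f32(float %x)
//   %d = fsub float %x, %f
//   %r = call float @llvm.minnum.f32(float %d, float 0x3FEFFFFFE0000000)
// i.e. min(x - floor(x), nextafter(1.0, 0.0)), which matches what
// llvm.amdgcn.fract computes for non-NaN inputs; the NaN case is handled by
// the callers before the rewrite is applied.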
1945
1946Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
1947 Value *FractArg) {
1950
1952
1954 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
1955 ResultVals[I] =
1956 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
1957 }
1958
1960}
1961
1962bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
1963 Value *FractArg = matchFractPat(I);
1964 if (!FractArg)
1965 return false;
1966
1967
1968
1969 if (!I.hasNoNaNs() &&
 !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
1970 return false;
1971
1973 FastMathFlags FMF = I.getFastMathFlags();
1976
1977 Value *Fract = applyFractPat(Builder, FractArg);
1979 I.replaceAllUsesWith(Fract);
1980 DeadVals.push_back(&I);
1981 return true;
1982}
1983
1984
1985bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
1988 return false;
1989
1992
1993
1994
1996 return false;
1997
1998 const float ReqdAccuracy = FPOp->getFPAccuracy();
1999
2000
2001 if (ReqdAccuracy < 1.0f)
2002 return false;
2003
2005 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2006
2007
2008
2009 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2010 return false;
2011
2015
2017 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2018 if (CanTreatAsDAZ)
2019 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2020 else
2021 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2022 }
2023
2027 DeadVals.push_back(&Sqrt);
2028 return true;
2029}
2030
2031bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2032 if (skipFunction(F))
2033 return false;
2034
2035 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2036 if (!TPC)
2037 return false;
2038
2039 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2040 const TargetLibraryInfo *TLI =
2041 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2042 AssumptionCache *AC =
2043 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2044 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2045 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2046 const UniformityInfo &UA =
2047 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2048 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2049}
2050
2058 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2059 if (!Impl.run())
2062 if (!Impl.FlowChanged)
2064 return PA;
2065}
2066
2068 "AMDGPU IR optimizations", false, false)
2074
2075char AMDGPUCodeGenPrepare::ID = 0;
2076
2077FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2078 return new AMDGPUCodeGenPrepare();
2079}
Return true if it's known this can never be a subnormal.