LLVM: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp Source File


27#include "llvm/IR/IntrinsicsAMDGPU.h"

30#include <optional>

31

32using namespace llvm;

33

34#define DEBUG_TYPE "AMDGPUtti"

35

37 "amdgpu-unroll-threshold-private",

38 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),

40

42 "amdgpu-unroll-threshold-local",

43 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),

45

47 "amdgpu-unroll-threshold-if",

48 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),

50

52 "amdgpu-unroll-runtime-local",

53 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),

55

57 "amdgpu-unroll-max-block-to-analyze",

58 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),

60

63 cl::desc("Cost of alloca argument"));

64

65

66

67

71 cl::desc("Maximum alloca size to use for inline cost"));

72

73

76 cl::desc("Maximum number of BBs allowed in a function after inlining"

77 " (compile time constraint)"));

78

79

81 "amdgpu-memcpy-loop-unroll",

82 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "

83 "operations when lowering memcpy as a loop"),

85
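// Usage sketch (illustrative, not part of the original source): the knobs above
// are ordinary LLVM cl::opt flags, so they can be overridden when invoking the
// compiler, for example:
//   clang ... -mllvm -amdgpu-unroll-threshold-private=1000
// The value shown is only an example; the defaults are the cl::init values.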

87 unsigned Depth = 0) {

88 const Instruction *I = dyn_cast<Instruction>(Cond);
89 if (!I)

90 return false;

91

92 for (const Value *V : I->operand_values()) {

93 if (!L->contains(I))

94 continue;

97 return SubLoop->contains(PHI); }))

98 return true;

100 return true;

101 }

102 return false;

103}
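// Illustrative note (not part of the original source): dependsOnLocalPhi()
// reports whether a branch condition is computed from a PHI that lives in this
// loop rather than in a contained sub-loop. getUnrollingPreferences() below
// applies it to conditional branches so that each such "if" inside the loop can
// raise the unroll threshold by UnrollThresholdIf.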

104

107 TargetTriple(TM->getTargetTriple()),

109 TLI(ST->getTargetLowering()) {}

110

114 const Function &F = *L->getHeader()->getParent();

116 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);

117 UP.MaxCount = std::numeric_limits<unsigned>::max();

119

120

121

123

124

126

127

128

129

130 const unsigned MaxAlloca = (256 - 16) * 4;

133

134

135

136 if (MDNode *LoopUnrollThreshold =

138 if (LoopUnrollThreshold->getNumOperands() == 2) {

140 LoopUnrollThreshold->getOperand(1));

141 if (MetaThresholdValue) {

142

143

144

147 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);

148 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);

149 }

150 }

151 }

152

153 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);

154 for (const BasicBlock *BB : L->getBlocks()) {

156 unsigned LocalGEPsSeen = 0;

157

158 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {

159 return SubLoop->contains(BB); }))

160 continue;

161

163

164

165

166

167

169 if (UP.Threshold < MaxBoost && Br->isConditional()) {

170 BasicBlock *Succ0 = Br->getSuccessor(0);

171 BasicBlock *Succ1 = Br->getSuccessor(1);

172 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||

173 (L->contains(Succ1) && L->isLoopExiting(Succ1)))

174 continue;

178 << " for loop:\n"

179 << *L << " due to " << *Br << '\n');

181 return;

182 }

183 }

184 continue;

185 }

186

188 if (!GEP)

189 continue;

190

191 unsigned AS = GEP->getAddressSpace();

192 unsigned Threshold = 0;

194 Threshold = ThresholdPrivate;

196 Threshold = ThresholdLocal;

197 else

198 continue;

199

201 continue;

202

204 const Value *Ptr = GEP->getPointerOperand();

208 continue;

210 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;

211 if (AllocaSize > MaxAlloca)

212 continue;

215 LocalGEPsSeen++;

216

217

218

219

220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)

221 continue;

222

225 continue;

226

227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"

228 << *L << " due to LDS use.\n");

230 }

231

232

233 bool HasLoopDef = false;

234 for (const Value *Op : GEP->operands()) {

236 if (!Inst || L->isLoopInvariant(Op))

237 continue;

238

239 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {

240 return SubLoop->contains(Inst); }))

241 continue;

242 HasLoopDef = true;

243 break;

244 }

245 if (!HasLoopDef)

246 continue;

247

248

249

250

251

252

253

254

255

256

257

258

259

260

262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold

263 << " for loop:\n"

264 << *L << " due to " << *GEP << '\n');

266 return;

267 }

268

269

270

273 }

274}

275

280

284

285const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {

286

287 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,

288 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,

289 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,

290 AMDGPU::FeatureUnalignedAccessMode,

291

292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,

293

294

295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,

296 AMDGPU::FeatureTrapHandler,

297

298

299

300 AMDGPU::FeatureSRAMECC,

301

302

303 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};

304

308 TLI(ST->getTargetLowering()), CommonTTI(TM, F),

309 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {

312 HasFP64FP16Denormals =

314}

315

317 return !F || !ST->isSingleLaneExecution(*F);

318}

319

321

322

323

324

325

326

327

328 return 4;

329}

330

333 switch (K) {

340 }

342}

343

347

349 if (Opcode == Instruction::Load || Opcode == Instruction::Store)

350 return 32 * 4 / ElemWidth;

351

352

353 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4

354 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2

355 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2

356 : 1;

357}
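// Worked example (illustrative): for loads and stores the maximum VF is
// 32 * 4 / ElemWidth, i.e. 16 lanes of i8, 8 of i16, or 4 of i32. For other
// opcodes the result is 4 for 8-bit and 2 for 16-bit elements when the
// subtarget has 16-bit instructions, 2 for 32-bit elements with packed FP32
// ops, and 1 otherwise, mirroring the ternary chain above.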

358

360 unsigned ChainSizeInBytes,

362 unsigned VecRegBitWidth = VF * LoadSize;

364

365 return 128 / LoadSize;

366

367 return VF;

368}

369

371 unsigned ChainSizeInBytes,

373 unsigned VecRegBitWidth = VF * StoreSize;

374 if (VecRegBitWidth > 128)

375 return 128 / StoreSize;

376

377 return VF;

378}
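// Worked example (illustrative) for getStoreVectorFactor above: VF = 8 with
// StoreSize = 32 gives VecRegBitWidth = 256 > 128, so the chain is clamped to
// 128 / 32 = 4 elements; a chain that already fits in 128 bits keeps its VF.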

379

387 return 512;

388 }

389

391 return 8 * ST->getMaxPrivateElementSize();

392

393

394 return 128;

395}

396

399 unsigned AddrSpace) const {

400

401

402

404 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&

405 ChainSizeInBytes <= ST->getMaxPrivateElementSize();

406 }

407 return true;

408}

409

412 unsigned AddrSpace) const {

414}

415

418 unsigned AddrSpace) const {

420}

421

425

428 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,

429 std::optional<uint32_t> AtomicElementSize) const {

430

431 if (AtomicElementSize)

433

434

435

436

437

438

439

440

441

442

443

444

445 unsigned I32EltsInVector = 4;

449

451}

452

455 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,

457 std::optional<uint32_t> AtomicCpySize) const {

458

459 if (AtomicCpySize)

461 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,

462 DestAlign, AtomicCpySize);

463

465 while (RemainingBytes >= 16) {

467 RemainingBytes -= 16;

468 }

469

471 while (RemainingBytes >= 8) {

473 RemainingBytes -= 8;

474 }

475

477 while (RemainingBytes >= 4) {

479 RemainingBytes -= 4;

480 }

481

483 while (RemainingBytes >= 2) {

485 RemainingBytes -= 2;

486 }

487

489 while (RemainingBytes) {

491 --RemainingBytes;

492 }

493}
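// Worked example (illustrative): 11 residual bytes fall through the loops
// above as one 8-byte operation (3 bytes left), one 2-byte operation (1 byte
// left), and one 1-byte operation, so the tail of the memcpy is covered by
// progressively narrower accesses.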

494

496

497

499 return 1;

500

501 return 8;

502}

503

507 case Intrinsic::amdgcn_ds_ordered_add:

508 case Intrinsic::amdgcn_ds_ordered_swap: {

511 if (!Ordering || !Volatile)

512 return false;

513

514 unsigned OrderingVal = Ordering->getZExtValue();

516 return false;

517

519 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);

520 Info.ReadMem = true;

521 Info.WriteMem = true;

522 Info.IsVolatile = !Volatile->isZero();

523 return true;

524 }

525 default:

526 return false;

527 }

528}

529

534

535

536 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

537 int ISD = TLI->InstructionOpcodeToISD(Opcode);

538

539

540

541 unsigned NElts = LT.second.isVector() ?

542 LT.second.getVectorNumElements() : 1;

543

545

546 switch (ISD) {

550 if (SLT == MVT::i64)

551 return get64BitInstrCost(CostKind) * LT.first * NElts;

552

553 if (ST->has16BitInsts() && SLT == MVT::i16)

554 NElts = (NElts + 1) / 2;

555

556

557 return getFullRateInstrCost() * LT.first * NElts;

563 if (SLT == MVT::i64) {

564

565 return 2 * getFullRateInstrCost() * LT.first * NElts;

566 }

567

568 if (ST->has16BitInsts() && SLT == MVT::i16)

569 NElts = (NElts + 1) / 2;

570

571 return LT.first * NElts * getFullRateInstrCost();

573 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);

574 if (SLT == MVT::i64) {

575 const int FullRateCost = getFullRateInstrCost();

576 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;

577 }

578

579 if (ST->has16BitInsts() && SLT == MVT::i16)

580 NElts = (NElts + 1) / 2;

581

582

583 return QuarterRateCost * NElts * LT.first;

584 }

586

587

588

591 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());

593 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)

595 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)

597

598

603 }

604 }

605 [[fallthrough]];

608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)

609 NElts = (NElts + 1) / 2;

610 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)

611 NElts = (NElts + 1) / 2;

612 if (SLT == MVT::f64)

613 return LT.first * NElts * get64BitInstrCost(CostKind);

614

615 if (ST->has16BitInsts() && SLT == MVT::f16)

616 NElts = (NElts + 1) / 2;

617

618 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)

619 return LT.first * NElts * getFullRateInstrCost();

620 break;

623

624

625 if (SLT == MVT::f64) {

627 getQuarterRateInstrCost(CostKind) +

628 3 * getHalfRateInstrCost(CostKind);

629

630 if (!ST->hasUsableDivScaleConditionOutput())

631 Cost += 3 * getFullRateInstrCost();

632

633 return LT.first * Cost * NElts;

634 }

635

637

638 if ((SLT == MVT::f32 && !HasFP32Denormals) ||

639 (SLT == MVT::f16 && ST->has16BitInsts())) {

640 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;

641 }

642 }

643

644 if (SLT == MVT::f16 && ST->has16BitInsts()) {

645

646

647

648

649

651 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);

652 return LT.first * Cost * NElts;

653 }

654

655 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {

656

657

658

659 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();

660 return LT.first * Cost * NElts;

661 }

662

663 if (SLT == MVT::f32 || SLT == MVT::f16) {

664

665 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +

666 1 * getQuarterRateInstrCost(CostKind);

667

668 if (!HasFP32Denormals) {

669

670 Cost += 2 * getFullRateInstrCost();

671 }

672

673 return LT.first * NElts * Cost;

674 }

675 break;

676 case ISD::FNEG:

677

678

679 return TLI->isFNegFree(SLT) ? 0 : NElts;

680 default:

681 break;

682 }

683

685 Args, CxtI);

686}
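// Illustrative query (a sketch, not from the source; assumes a
// TargetTransformInfo instance TTI wrapping this implementation and an
// LLVMContext Ctx):
//   InstructionCost C = TTI.getArithmeticInstrCost(
//       Instruction::FAdd, FixedVectorType::get(Type::getHalfTy(Ctx), 2),
//       TargetTransformInfo::TCK_RecipThroughput);
// With 16-bit instructions available, the FADD handling above halves NElts for
// f16, so the two lanes are costed as a single full-rate packed operation.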

687

688

689

691 switch (ID) {

692 case Intrinsic::fma:

693 case Intrinsic::fmuladd:

694 case Intrinsic::copysign:

695 case Intrinsic::minimumnum:

696 case Intrinsic::maximumnum:

697 case Intrinsic::canonicalize:

698

699 case Intrinsic::round:

700 case Intrinsic::uadd_sat:

701 case Intrinsic::usub_sat:

702 case Intrinsic::sadd_sat:

703 case Intrinsic::ssub_sat:

704 case Intrinsic::abs:

705 return true;

706 default:

707 return false;

708 }

709}
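// Illustrative note (not part of the original source): these are the intrinsics
// that can execute as packed 16-bit (or packed f32) operations, e.g. llvm.fma
// on <2 x half>; getIntrinsicInstrCost() below halves NElts for such types so
// that a pair of lanes is costed as one instruction.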

710

714 switch (ICA.getID()) {

715 case Intrinsic::fabs:

716

717 return 0;

718 case Intrinsic::amdgcn_workitem_id_x:

719 case Intrinsic::amdgcn_workitem_id_y:

720 case Intrinsic::amdgcn_workitem_id_z:

721

722

723 return 0;

724 case Intrinsic::amdgcn_workgroup_id_x:

725 case Intrinsic::amdgcn_workgroup_id_y:

726 case Intrinsic::amdgcn_workgroup_id_z:

727 case Intrinsic::amdgcn_lds_kernel_id:

728 case Intrinsic::amdgcn_dispatch_ptr:

729 case Intrinsic::amdgcn_dispatch_id:

730 case Intrinsic::amdgcn_implicitarg_ptr:

731 case Intrinsic::amdgcn_queue_ptr:

732

733 return 0;

734 default:

735 break;

736 }

737

740

742

743

744 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);

745

746 unsigned NElts = LT.second.isVector() ?

747 LT.second.getVectorNumElements() : 1;

748

750

751 if ((ST->hasVOP3PInsts() &&

752 (SLT == MVT::f16 || SLT == MVT::i16 ||

753 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||

754 (ST->hasPackedFP32Ops() && SLT == MVT::f32))

755 NElts = (NElts + 1) / 2;

756

757

758 unsigned InstRate = getQuarterRateInstrCost(CostKind);

759

760 switch (ICA.getID()) {

761 case Intrinsic::fma:

762 case Intrinsic::fmuladd:

763 if (SLT == MVT::f64) {

764 InstRate = get64BitInstrCost(CostKind);

765 break;

766 }

767

768 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)

769 InstRate = getFullRateInstrCost();

770 else {

771 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)

772 : getQuarterRateInstrCost(CostKind);

773 }

774 break;

775 case Intrinsic::copysign:

776 return NElts * getFullRateInstrCost();

777 case Intrinsic::minimumnum:

778 case Intrinsic::maximumnum: {

779

780

783

784

787 }

788

789 unsigned BaseRate =

790 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

791 InstRate = BaseRate * NumOps;

792 break;

793 }

794 case Intrinsic::canonicalize: {

795 InstRate =

796 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();

797 break;

798 }

799 case Intrinsic::uadd_sat:

800 case Intrinsic::usub_sat:

801 case Intrinsic::sadd_sat:

802 case Intrinsic::ssub_sat: {

803 if (SLT == MVT::i16 || SLT == MVT::i32)

804 InstRate = getFullRateInstrCost();

805

806 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};

807 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))

808 NElts = 1;

809 break;

810 }

811 case Intrinsic::abs:

812

813 if (SLT == MVT::i16 || SLT == MVT::i32)

814 InstRate = 2 * getFullRateInstrCost();

815 break;

816 default:

817 break;

818 }

819

820 return LT.first * NElts * InstRate;

821}

822

826 assert((I == nullptr || I->getOpcode() == Opcode) &&

827 "Opcode should reflect passed instruction.");

828 const bool SCost =

830 const int CBrCost = SCost ? 5 : 7;

831 switch (Opcode) {

832 case Instruction::Br: {

833

835 if (BI && BI->isUnconditional())

836 return SCost ? 1 : 4;

837

838

839 return CBrCost;

840 }

841 case Instruction::Switch: {

843

844

845 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);

846 }

847 case Instruction::Ret:

848 return SCost ? 1 : 10;

849 }

851}

852

855 std::optional<FastMathFlags> FMF,

859

860 EVT OrigTy = TLI->getValueType(DL, Ty);

861

862

863

866

867 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

868 return LT.first * getFullRateInstrCost();

869}

870

875 EVT OrigTy = TLI->getValueType(DL, Ty);

876

877

878

881

882 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

883 return LT.first * getHalfRateInstrCost(CostKind);

884}

885

888 unsigned Index, const Value *Op0,

889 const Value *Op1) const {

890 switch (Opcode) {

891 case Instruction::ExtractElement:

892 case Instruction::InsertElement: {

893 unsigned EltSize

895 if (EltSize < 32) {

896 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())

897 return 0;

899 Op1);

900 }

901

902

903

904

905

906

907 return Index == ~0u ? 2 : 0;

908 }

909 default:

911 }

912}

913

914

915

916

917

920

921 if (Indices.size() > 1)

922 return true;

923

927 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);

928

929 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

930

931 int OutputIdx = 0;

932 for (auto &TC : TargetConstraints) {

934 continue;

935

936

937 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)

938 continue;

939

940 TLI->ComputeConstraintToUse(TC, SDValue());

941

943 TRI, TC.ConstraintCode, TC.ConstraintVT).second;

944

945

946

947 if (!RC || TRI->isSGPRClass(RC))

948 return true;

949 }

950

951 return false;

952}

953

960

961

963 if (VT == MVT::i1)

964 return true;

965

966

968 return false;

969

970

971

973}

974

975

976

980

981

982

983

984

985

986

990

991

992

993

994

996 return true;

997

1000 switch (IID) {

1001 case Intrinsic::read_register:

1003 case Intrinsic::amdgcn_addrspacecast_nonnull: {

1004 unsigned SrcAS =

1005 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();

1006 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();

1009 ST->hasGloballyAddressableScratch();

1010 }

1011 case Intrinsic::amdgcn_workitem_id_y:

1012 case Intrinsic::amdgcn_workitem_id_z: {

1014 bool HasUniformYZ =

1015 ST->hasWavefrontsEvenlySplittingXDim(*F, true);

1016 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(

1017 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);

1018 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);

1019 }

1020 default:

1022 }

1023 }

1024

1025

1027 if (CI->isInlineAsm())

1029 return true;

1030 }

1031

1032

1034 return true;

1035

1036

1037

1038

1042 ST->hasGloballyAddressableScratch();

1043 }

1044

1045 return false;

1046}

1047

1051

1053 if (CI->isInlineAsm())

1055 return false;

1056 }

1057

1058

1059

1060

1061

1062

1063

1064

1065

1066

1067

1068

1069

1070

1071 bool XDimDoesntResetWithinWaves = false;

1073 const Function *F = I->getFunction();

1074 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);

1075 }

1082 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;

1083 }

1084

1089 ST->getWavefrontSizeLog2() &&

1090 XDimDoesntResetWithinWaves;

1091 }

1092

1094 if (!ExtValue)

1095 return false;

1096

1098 if (!CI)

1099 return false;

1100

1102 switch (Intrinsic->getIntrinsicID()) {

1103 default:

1104 return false;

1105 case Intrinsic::amdgcn_if:

1106 case Intrinsic::amdgcn_else: {

1108 return Indices.size() == 1 && Indices[0] == 1;

1109 }

1110 }

1111 }

1112

1113

1114

1115

1118

1119 return false;

1120}

1121

1124 switch (IID) {

1125 case Intrinsic::amdgcn_is_shared:

1126 case Intrinsic::amdgcn_is_private:

1127 case Intrinsic::amdgcn_flat_atomic_fmax_num:

1128 case Intrinsic::amdgcn_flat_atomic_fmin_num:

1129 case Intrinsic::amdgcn_load_to_lds:

1130 case Intrinsic::amdgcn_make_buffer_rsrc:

1132 return true;

1133 default:

1134 return false;

1135 }

1136}

1137

1140 Value *NewV) const {

1141 auto IntrID = II->getIntrinsicID();

1142 switch (IntrID) {

1143 case Intrinsic::amdgcn_is_shared:

1144 case Intrinsic::amdgcn_is_private: {

1145 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?

1149 ConstantInt *NewVal = (TrueAS == NewAS) ?

1151 return NewVal;

1152 }

1153 case Intrinsic::ptrmask: {

1156 Value *MaskOp = II->getArgOperand(1);

1158

1159 bool DoTruncate = false;

1160

1162 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());

1164

1165

1166

1167 if (DL.getPointerSizeInBits(OldAS) != 64 ||

1168 DL.getPointerSizeInBits(NewAS) != 32)

1169 return nullptr;

1170

1171

1174 return nullptr;

1175

1176 DoTruncate = true;

1177 }

1178

1180 if (DoTruncate) {

1181 MaskTy = B.getInt32Ty();

1182 MaskOp = B.CreateTrunc(MaskOp, MaskTy);

1183 }

1184

1185 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},

1186 {NewV, MaskOp});

1187 }

1188 case Intrinsic::amdgcn_flat_atomic_fmax_num:

1189 case Intrinsic::amdgcn_flat_atomic_fmin_num: {

1190 Type *DestTy = II->getType();

1194 return nullptr;

1195 Module *M = II->getModule();

1197 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});

1198 II->setArgOperand(0, NewV);

1199 II->setCalledFunction(NewDecl);

1200 return II;

1201 }

1202 case Intrinsic::amdgcn_load_to_lds: {

1204 Module *M = II->getModule();

1207 II->setArgOperand(0, NewV);

1208 II->setCalledFunction(NewDecl);

1209 return II;

1210 }

1211 case Intrinsic::amdgcn_make_buffer_rsrc: {

1213 Type *DstTy = II->getType();

1214 Module *M = II->getModule();

1216 M, II->getIntrinsicID(), {DstTy, SrcTy});

1217 II->setArgOperand(0, NewV);

1218 II->setCalledFunction(NewDecl);

1219 return II;

1220 }

1221 default:

1222 return nullptr;

1223 }

1224}
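// Worked example (illustrative): when the InferAddressSpaces pass proves the
// pointer given to llvm.amdgcn.is.shared is already a local (LDS) pointer, the
// case above folds the call to the constant true; for llvm.ptrmask, rewriting a
// 64-bit flat pointer to a 32-bit address space truncates the mask to i32
// before re-emitting the intrinsic.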

1225

1235 SubTp);

1236

1238

1239 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());

1241 (ScalarSize == 16 || ScalarSize == 8)) {

1242

1243

1244

1245

1246

1247

1248

1249

1250

1251

1252

1254 unsigned NumSrcElts = SrcVecTy->getNumElements();

1255 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&

1258 return 0;

1259 }

1260

1261 unsigned EltsPerReg = 32 / ScalarSize;

1262 switch (Kind) {

1264

1265 return 1;

1267

1269 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);

1272 if (Index % EltsPerReg == 0)

1273 return 0;

1275 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);

1279 if (!DstVecTy)

1281 unsigned NumDstElts = DstVecTy->getNumElements();

1283 unsigned EndIndex = Index + NumInsertElts;

1284 unsigned BeginSubIdx = Index % EltsPerReg;

1285 unsigned EndSubIdx = EndIndex % EltsPerReg;

1286 unsigned Cost = 0;

1287

1288 if (BeginSubIdx != 0) {

1289

1290

1291 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);

1292 }

1293

1294

1295

1296 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)

1298

1299 return Cost;

1300 }

1303 if (!DstVecTy)

1305 unsigned NumElts = DstVecTy->getNumElements();

1307

1308

1309 unsigned EltsFromLHS = NumElts - Index;

1310 bool LHSIsAligned = (Index % EltsPerReg) == 0;

1311 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;

1312 if (LHSIsAligned && RHSIsAligned)

1313 return 0;

1314 if (LHSIsAligned && !RHSIsAligned)

1315 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);

1316 if (!LHSIsAligned && RHSIsAligned)

1317 return divideCeil(EltsFromLHS, EltsPerReg);

1318 return divideCeil(NumElts, EltsPerReg);

1319 }

1320 default:

1321 break;

1322 }

1323

1324 if (!Mask.empty()) {

1326

1327

1328

1329

1330

1331

1332

1333 unsigned Cost = 0;

1334 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {

1337 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {

1338 int SrcIdx = Mask[DstIdx + I];

1339 if (SrcIdx == -1)

1340 continue;

1341 int Reg;

1342 if (SrcIdx < (int)NumSrcElts) {

1343 Reg = SrcIdx / EltsPerReg;

1344 if (SrcIdx % EltsPerReg != I)

1346 } else {

1347 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;

1348 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)

1350 }

1353 }

1354 if (Regs.size() >= 2)

1358 }

1359 return Cost;

1360 }

1361 }

1362

1364 SubTp);

1365}

1366

1367

1368

1369

1373

1374 for (auto &Op : I->operands()) {

1375

1376 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))

1377 continue;

1378

1380 Ops.push_back(&Op);

1381 }

1382

1383 return Ops.empty();

1384}

1385

1387 const Function *Callee) const {

1388 const TargetMachine &TM = getTLI()->getTargetMachine();

1393

1394 const FeatureBitset &CallerBits = CallerST->getFeatureBits();

1395 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();

1396

1397 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;

1398 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;

1399 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)

1400 return false;

1401

1402

1403

1407 return false;

1408

1409 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||

1410 Callee->hasFnAttribute(Attribute::InlineHint))

1411 return true;

1412

1413

1415

1416 if (Callee->size() == 1)

1417 return true;

1418 size_t BBSize = Caller->size() + Callee->size() - 1;

1420 }

1421

1422 return true;

1423}

1424

1428 const int NrOfSGPRUntilSpill = 26;

1429 const int NrOfVGPRUntilSpill = 32;

1430

1432

1433 unsigned adjustThreshold = 0;

1434 int SGPRsInUse = 0;

1435 int VGPRsInUse = 0;

1436 for (const Use &A : CB->args()) {

1439 for (auto ArgVT : ValueVTs) {

1443 SGPRsInUse += CCRegNum;

1444 else

1445 VGPRsInUse += CCRegNum;

1446 }

1447 }

1448

1449

1450

1451

1452

1453

1455 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

1458 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

1461

1462

1463

1464 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *

1466 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *

1468 return adjustThreshold;

1469}

1470

1473

1474

1475

1476

1477 unsigned AllocaSize = 0;

1479 for (Value *PtrArg : CB->args()) {

1481 if (!Ty)

1482 continue;

1483

1484 unsigned AddrSpace = Ty->getAddressSpace();

1487 continue;

1488

1491 continue;

1492

1494 }

1495 return AllocaSize;

1496}

1497

1502

1505

1506

1507

1509 if (AllocaSize > 0)

1511 return Threshold;

1512}

1513

1516

1517

1518

1521 return 0;

1522

1523

1524

1525

1526

1527

1528

1529

1530

1531

1532

1533

1534

1535

1536

1537 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");

1539

1541 return BB.getTerminator()->getNumSuccessors() > 1;

1542 });

1543 if (SingleBB) {

1544 Threshold += Threshold / 2;

1545 }

1546

1548

1549

1550 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;

1551

1552 return AllocaThresholdBonus;

1553}

1554

1558 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);

1559}

1560

1563 CommonTTI.getPeelingPreferences(L, SE, PP);

1564}

1565

1568 ? getFullRateInstrCost()

1569 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)

1570 : getQuarterRateInstrCost(CostKind);

1571}

1572

1573std::pair<InstructionCost, MVT>

1574GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {

1576 auto Size = DL.getTypeSizeInBits(Ty);

1577

1578

1579

1580 if (Size <= 256)

1581 return Cost;

1582

1583 Cost.first += (Size + 255) / 256;

1584 return Cost;

1585}
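// Worked example (illustrative): a 512-bit vector type is above the 256-bit
// cutoff, so Cost.first grows by (512 + 255) / 256 = 2, penalizing types that
// must be split across many registers.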

1586

1588 return ST->hasPrefetch() ? 128 : 0;

1589}

1590

1594

1597 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {

1599 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});

1600 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});

1601 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});

1602 std::pair<unsigned, unsigned> FlatWorkGroupSize =

1603 ST->getFlatWorkGroupSizes(F);

1604 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});

1605 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});

1606 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);

1607 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});

1608 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});

1609}
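// Illustrative result (values are examples only): for a kernel with a flat
// workgroup size range of 1..256 and 4..8 waves per EU, LB would receive
//   {"amdgpu-flat-work-group-size[0]", 1}, {"amdgpu-flat-work-group-size[1]", 256},
//   {"amdgpu-waves-per-eu[0]", 4}, {"amdgpu-waves-per-eu[1]", 8},
// in addition to the three amdgpu-max-num-workgroups entries pushed above.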

1610

1613 if (!ST->hasIEEEMode())

1615

1616 const Function *F = I.getFunction();

1617 if (F)

1619

1620 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");

1623

1626}
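// Illustrative note: on subtargets with an IEEE mode bit, the answer is taken
// from the calling function's "amdgpu-ieee" attribute (for example
// "amdgpu-ieee"="true" yields KnownIEEEMode::On), with a fallback when the
// attribute is missing or not a valid boolean.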

1627

1629 Align Alignment,

1635 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&

1636 VecTy->getElementType()->isIntegerTy(8)) {

1637 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,

1639 }

1640 }

1642 OpInfo, I);

1643}

1644

1647 if (VecTy->getElementType()->isIntegerTy(8)) {

1650 }

1651 }

1653}


Provides AMDGPU specific target descriptions.

The AMDGPU TargetMachine interface definition for hw codegen targets.

static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)

static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))

static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)

Definition AMDGPUTargetTransformInfo.cpp:86

static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)

static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)

Definition AMDGPUTargetTransformInfo.cpp:1425

static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))

static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))

static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)

Definition AMDGPUTargetTransformInfo.cpp:690

static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)

static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)

Definition AMDGPUTargetTransformInfo.cpp:1471

static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)

static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering memcpy as a loop"), cl::init(16), cl::Hidden)

static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)

This file describes a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.


uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override

Definition AMDGPUTargetTransformInfo.cpp:281

AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

Definition AMDGPUTargetTransformInfo.cpp:105

void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override

Definition AMDGPUTargetTransformInfo.cpp:276

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override

Definition AMDGPUTargetTransformInfo.cpp:111

bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override

Returns true if a cast between SrcAS and DestAS is a noop.

an instruction to allocate memory on the stack

LLVM_ABI bool isStaticAlloca() const

Return true if this alloca is in the entry block of the function and is a constant size.

Type * getAllocatedType() const

Return the type that is being allocated by the instruction.

This class represents an incoming formal argument to a Function.

ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...

size_t size() const

size - Get the array size.

bool empty() const

empty - Check if the array is empty.

Functions, function parameters, and return types can have attributes to indicate how they should be t...

LLVM_ABI bool getValueAsBool() const

Return the attribute's value as a boolean.

bool isValid() const

Return true if the attribute is any kind of attribute.

LLVM Basic Block Representation.

InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override

InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override

InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override

InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override

unsigned getNumberOfParts(Type *Tp) const override

TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const

InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override

InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override

void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override

std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const

InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override

InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override

Conditional or Unconditional Branch instruction.

Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...

bool isInlineAsm() const

Check if this call is an inline asm statement.

Function * getCalledFunction() const

Returns the function called, or null if this is an indirect function invocation or the function signa...

CallingConv::ID getCallingConv() const

Value * getArgOperand(unsigned i) const

iterator_range< User::op_iterator > args()

Iteration adapter for range-for loops.

unsigned getArgOperandNo(const Use *U) const

Given a use for a arg operand, get the arg operand number that corresponds to it.

This class represents a function call, abstracting a target machine's calling convention.

This is the shared class of boolean and integer constants.

static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)

static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)

int64_t getSExtValue() const

Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...

A parsed version of the target data layout string in and methods for querying it.

constexpr bool isScalar() const

Exactly one element.

Convenience struct for specifying and reasoning about fast-math flags.

Container class for subtarget features.

static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)

bool hasFullRate64Ops() const

GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

Definition AMDGPUTargetTransformInfo.cpp:305

unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override

Definition AMDGPUTargetTransformInfo.cpp:380

InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override

Definition AMDGPUTargetTransformInfo.cpp:1226

InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override

Account for loads of i8 vector types to have reduced cost.

Definition AMDGPUTargetTransformInfo.cpp:1628

InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override

Definition AMDGPUTargetTransformInfo.cpp:530

void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override

Definition AMDGPUTargetTransformInfo.cpp:1595

bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override

Definition AMDGPUTargetTransformInfo.cpp:416

bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const

Analyze if the results of inline asm are divergent.

Definition AMDGPUTargetTransformInfo.cpp:918

bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const

Definition AMDGPUTargetTransformInfo.cpp:954

unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override

Definition AMDGPUTargetTransformInfo.cpp:348

unsigned getNumberOfRegisters(unsigned RCID) const override

Definition AMDGPUTargetTransformInfo.cpp:320

bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override

Definition AMDGPUTargetTransformInfo.cpp:410

unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override

Definition AMDGPUTargetTransformInfo.cpp:370

bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const

Definition AMDGPUTargetTransformInfo.cpp:397

bool shouldPrefetchAddressSpace(unsigned AS) const override

Definition AMDGPUTargetTransformInfo.cpp:1591

bool hasBranchDivergence(const Function *F=nullptr) const override

Definition AMDGPUTargetTransformInfo.cpp:316

Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override

Definition AMDGPUTargetTransformInfo.cpp:1138

unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override

Definition AMDGPUTargetTransformInfo.cpp:1514

unsigned getMaxInterleaveFactor(ElementCount VF) const override

Definition AMDGPUTargetTransformInfo.cpp:495

void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override

Definition AMDGPUTargetTransformInfo.cpp:453

InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override

Definition AMDGPUTargetTransformInfo.cpp:854

InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override

Get intrinsic cost based on arguments.

Definition AMDGPUTargetTransformInfo.cpp:712

unsigned getInliningThresholdMultiplier() const override

unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override

Definition AMDGPUTargetTransformInfo.cpp:359

unsigned getPrefetchDistance() const override

How much before a load we should place the prefetch instruction.

Definition AMDGPUTargetTransformInfo.cpp:1587

InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override

Definition AMDGPUTargetTransformInfo.cpp:823

KnownIEEEMode fpenvIEEEMode(const Instruction &I) const

Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...

Definition AMDGPUTargetTransformInfo.cpp:1612

unsigned adjustInliningThreshold(const CallBase *CB) const override

Definition AMDGPUTargetTransformInfo.cpp:1503

bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override

Whether it is profitable to sink the operands of an Instruction I to the basic block of I.

Definition AMDGPUTargetTransformInfo.cpp:1370

bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override

Definition AMDGPUTargetTransformInfo.cpp:504

bool isAlwaysUniform(const Value *V) const override

Definition AMDGPUTargetTransformInfo.cpp:1048

bool areInlineCompatible(const Function *Caller, const Function *Callee) const override

Definition AMDGPUTargetTransformInfo.cpp:1386

InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override

Try to calculate op costs for min/max reduction operations.

Definition AMDGPUTargetTransformInfo.cpp:872

bool isSourceOfDivergence(const Value *V) const override

Definition AMDGPUTargetTransformInfo.cpp:977

int getInliningLastCallToStaticBonus() const override

Definition AMDGPUTargetTransformInfo.cpp:1498

InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override

Definition AMDGPUTargetTransformInfo.cpp:886

bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override

Definition AMDGPUTargetTransformInfo.cpp:1122

unsigned getNumberOfParts(Type *Tp) const override

When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.

Definition AMDGPUTargetTransformInfo.cpp:1645

void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override

Definition AMDGPUTargetTransformInfo.cpp:1561

unsigned getMinVectorRegisterBitWidth() const override

Definition AMDGPUTargetTransformInfo.cpp:344

TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override

Definition AMDGPUTargetTransformInfo.cpp:332

void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override

Definition AMDGPUTargetTransformInfo.cpp:1555

Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override

Definition AMDGPUTargetTransformInfo.cpp:426

uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override

Definition AMDGPUTargetTransformInfo.cpp:422

an instruction for type-safe pointer arithmetic to access elements of arrays and structs

This provides a uniform API for creating instructions and inserting them into a basic block: either a...

static InstructionCost getInvalid(CostType Val=0)

CostType getValue() const

This function is intended to be used as sparingly as possible, since the class provides the full rang...

LLVM_ABI bool hasApproxFunc() const LLVM_READONLY

Determine whether the approximate-math-functions flag is set.

LLVM_ABI bool hasAllowContract() const LLVM_READONLY

Determine whether the allow-contract flag is set.

LLVM_ABI const DataLayout & getDataLayout() const

Get the data layout of the module this instruction belongs to.

Type * getReturnType() const

const IntrinsicInst * getInst() const

Intrinsic::ID getID() const

A wrapper class for inspecting calls to intrinsic functions.

Intrinsic::ID getIntrinsicID() const

Return the intrinsic ID of this intrinsic.

This is an important class for using LLVM in a threaded context.

An instruction for reading from memory.

Represents a single loop in the control flow graph.

static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)

Return the value type corresponding to the specified type.

A Module instance is used to store all the information related to an LLVM module.

unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override

Certain targets require unusual breakdowns of certain types.

The main scalar evolution driver.

std::pair< iterator, bool > insert(PtrType Ptr)

Inserts Ptr if and only if there is no element in the container equal to Ptr.

SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.

This class consists of common code factored out of the SmallVector class to reduce code duplication b...

void push_back(const T &Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

StringRef - Represent a constant reference to a string, i.e.

std::vector< AsmOperandInfo > AsmOperandInfoVector

Primary interface to the complete machine description for the target machine.

virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const

Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...

virtual int getInliningLastCallToStaticBonus() const

virtual const DataLayout & getDataLayout() const

virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const

TargetCostKind

The kind of cost model.

@ TCK_CodeSize

Instruction code size.

@ TCK_SizeAndLatency

The weighted sum of size and latency.

static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)

A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...

@ TCC_Free

Expected to fold away in lowering.

ShuffleKind

The various kinds of shuffle patterns for vector queries.

@ SK_InsertSubvector

InsertSubvector. Index indicates start offset.

@ SK_PermuteSingleSrc

Shuffle elements of single source vector with any shuffle mask.

@ SK_Splice

Concatenates elements from the first input vector with elements of the second input vector.

@ SK_Broadcast

Broadcast element 0 to all other elements.

@ SK_Reverse

Reverse the order of the vector.

@ SK_ExtractSubvector

ExtractSubvector Index indicates start offset.

static constexpr TypeSize getFixed(ScalarTy ExactSize)

static constexpr TypeSize getScalable(ScalarTy MinimumSize)

The instances of the Type class are immutable: once they are created, they are never changed.

static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)

static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)

LLVM_ABI unsigned getPointerAddressSpace() const

Get the address space of this pointer or pointer vector type.

static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)

static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)

bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const

Return true if it makes sense to take the size of this type.

LLVMContext & getContext() const

Return the LLVMContext in which this type was uniqued.

LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY

If this is a vector type, return the getPrimitiveSizeInBits value for the element type.

static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)

A Use represents the edge between a Value definition and its users.

Value * getOperand(unsigned i) const

LLVM Value Representation.

Type * getType() const

All values are typed, get the type of this value.

user_iterator user_begin()

bool hasOneUse() const

Return true if there is exactly one use of this value.

LLVM_ABI LLVMContext & getContext() const

All values hold a context through their type.

Base class of all SIMD vector types.

constexpr ScalarTy getFixedValue() const

#define llvm_unreachable(msg)

Marks that the current location is not supposed to be reachable.

@ CONSTANT_ADDRESS_32BIT

Address space for 32-bit constant memory.

@ BUFFER_STRIDED_POINTER

Address space for 192-bit fat buffer pointers with an additional index.

@ REGION_ADDRESS

Address space for region memory. (GDS)

@ LOCAL_ADDRESS

Address space for local memory.

@ CONSTANT_ADDRESS

Address space for constant memory (VTX2).

@ FLAT_ADDRESS

Address space for flat memory.

@ GLOBAL_ADDRESS

Address space for global memory (RAT0, VTX0).

@ BUFFER_FAT_POINTER

Address space for 160-bit buffer fat pointers.

@ PRIVATE_ADDRESS

Address space for private memory.

@ BUFFER_RESOURCE

Address space for 128-bit buffer resources.

LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)

bool isFlatGlobalAddrSpace(unsigned AS)

bool isArgPassedInSGPR(const Argument *A)

bool isIntrinsicAlwaysUniform(unsigned IntrID)

bool isIntrinsicSourceOfDivergence(unsigned IntrID)

bool isExtendedGlobalAddrSpace(unsigned AS)

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

@ C

The default llvm calling convention, compatible with C.

ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...

@ ADD

Simple integer binary arithmetic operators.

@ FADD

Simple binary floating point operators.

@ SHL

Shift and rotation operations.

@ AND

Bitwise operators - logical and, logical or, logical xor.

LLVM_ABI int getInstrCost()

This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.

LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})

Look up the Function declaration of the intrinsic id in the Module M.

BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)

BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)

Matches an And with LHS and RHS in either order.

bool match(Val *V, const Pattern &P)

class_match< ConstantInt > m_ConstantInt()

Match an arbitrary ConstantInt and ignore it.

IntrinsicID_match m_Intrinsic()

Match intrinsic calls like this: m_IntrinsicIntrinsic::fabs(m_Value(X))

specific_fpval m_FPOne()

Match a float 1.0 or vector with all elements equal to 1.0.

class_match< Value > m_Value()

Match an arbitrary value and ignore it.

BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)

FNeg_match< OpTy > m_FNeg(const OpTy &X)

Match 'fneg X' as 'fsub -0.0, X'.

m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)

initializer< Ty > init(const Ty &Val)

std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)

Extract a Value from Metadata, allowing null.

This is an optimization pass for GlobalISel generic memory operations.

void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())

ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...

decltype(auto) dyn_cast(const From &Val)

dyn_cast - Return the argument parameter cast to the specified type.

LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)

Find string metadata for a loop.

auto dyn_cast_or_null(const Y &Val)

bool any_of(R &&range, UnaryPredicate P)

Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.

LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)

Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...

LLVM_ABI raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

bool none_of(R &&Range, UnaryPredicate P)

Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.

bool isa(const From &Val)

isa - Return true if the parameter to the template is an instance of one of the template type argu...

AtomicOrdering

Atomic ordering for LLVM's memory model.

constexpr T divideCeil(U Numerator, V Denominator)

Returns the integer ceil(Numerator / Denominator).

DWARFExpression::Operation Op

decltype(auto) cast(const From &Val)

cast - Return the argument parameter cast to the specified type.

bool is_contained(R &&Range, const E &Element)

Returns true if Element is found in Range.

LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)

This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....

This struct is a compact representation of a valid (non-zero power of two) alignment.

static constexpr DenormalMode getPreserveSign()

uint64_t getScalarSizeInBits() const

unsigned countMinLeadingOnes() const

Returns the minimum number of leading one bits.

Information about a load/store intrinsic defined by the target.

bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const

Parameters that control the generic loop unrolling transformation.

unsigned Threshold

The cost threshold for the unrolled loop.

bool UnrollVectorizedLoop

Don't disable runtime unroll for the loops which were vectorized.

unsigned MaxIterationsCountToAnalyze

Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...

unsigned PartialThreshold

The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...

bool Runtime

Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...

bool Partial

Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...