LLVM: lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp Source File
//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;

  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  Function &F;
  const UniformityInfo &UA;
  const DataLayout &DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget &ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;
  SmallVector<ReplacementInfo> ToReplace;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const;

  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(Function &F, const UniformityInfo &UA,
                            DomTreeUpdater &DTU, const GCNSubtarget &ST,
                            ScanOptions ScanImpl)
      : F(F), UA(UA), DL(F.getDataLayout()), DTU(DTU), ST(ST),
        IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS),
        ScanImpl(ScanImpl) {}

  bool run();

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  return AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  bool IsChanged = AMDGPUAtomicOptimizerImpl(F, UA, DTU, ST, ScanImpl).run();

  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run() {
  // The None strategy disables the pass entirely.
  if (ScanImpl == ScanOptions::None)
    return false;

  visit(F);
  if (ToReplace.empty())
    return false;

  for (auto &[I, Op, ValIdx, ValDivergent] : ToReplace)
    optimizeAtomic(*I, Op, ValIdx, ValDivergent);
  ToReplace.clear();
  return true;
}
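
// Illustrative sketch of the overall transformation (simplified, not taken
// from any particular test): for a wavefront executing
//
//   %old = atomicrmw add ptr addrspace(1) %p, i32 %v monotonic
//
// with a uniform %p and uniform %v, the rewrite performed by optimizeAtomic()
// below is roughly
//
//   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 true)
//   %count  = call i64 @llvm.ctpop.i64(i64 %ballot)
//   ; exactly one lane performs: atomicrmw add ptr %p, i32 (%v * %count)
//   ; every lane then recovers its own "old" value as
//   ;   readfirstlane(single atomic result) + %v * mbcnt(active lanes below)
//
// so the wavefront issues one atomic instead of up to 64.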

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA.isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}
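
// Example (illustrative): an atomicrmw qualifies only when its pointer operand
// is uniform across the wavefront, e.g.
//
//   %r = atomicrmw add ptr addrspace(3) @lds_counter, i32 %per_lane_val monotonic
//
// is collected (uniform LDS pointer, possibly divergent value), whereas
//
//   %r = atomicrmw add ptr addrspace(1) %per_lane_ptr, i32 1 monotonic
//
// is skipped because each lane may be updating a different memory location.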

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA.isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST.hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA.isDivergentUse(I.getOperandUse(Idx)))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  ToReplace.push_back({&I, Op, ValIdx, ValDivergent});
}

static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}
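
// For example (illustrative), a signed max is emitted as a plain
// compare-and-select rather than an atomic instruction:
//
//   %cmp = icmp sgt i32 %lhs, %rhs
//   %max = select i1 %cmp, i32 %lhs, i32 %rhs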

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy,
                          {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST.hasPermLaneX16());
  Value *Permlanex16Call =
      B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                        {PoisonValue::get(AtomicTy), V, B.getInt32(0),
                         B.getInt32(0), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST.isWave32()) {
    return V;
  }

  if (ST.hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
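
// Rough shape of the reduction (illustrative) for a wave64 target without
// permlane64: four row_xmask DPP steps combine lanes at distances 1, 2, 4 and
// 8 within each row of 16 lanes, permlanex16 folds the two rows of each
// 32-lane half together, and a final readlane of lanes 0 and 32 combines the
// two halves, after which every lane holds the full wavefront reduction.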

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST.hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST.hasPermLaneX16());
    Value *PermX =
        B.CreateIntrinsic(AtomicTy, Intrinsic::amdgcn_permlanex16,
                          {PoisonValue::get(AtomicTy), V, B.getInt32(-1),
                           B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST.isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          AtomicTy, Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}
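
// Worked example (illustrative, one row of 16 lanes, integer add): starting
// from per-lane values v0..v15, the row_shr 1/2/4/8 steps produce
//
//   after shr 1: pairwise sums           (lane n holds v[n-1] + v[n])
//   after shr 2: prefix sums over 4 lanes
//   after shr 4: prefix sums over 8 lanes
//   after shr 8: lane n holds v0 + ... + vn for every n in the row
//
// i.e. an inclusive prefix sum within the row; the broadcast/permlane steps
// then propagate row totals to the higher rows of the wavefront.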

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
      M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST.hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST.isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}
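
// Example (illustrative): with inclusive sums s0, s1, s2, ... in lanes
// 0, 1, 2, ..., the wavefront shift-right-by-one leaves
//
//   lane 0: Identity, lane 1: s0, lane 2: s1, ...
//
// which is exactly the exclusive scan each lane needs as its offset into the
// combined atomic result.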

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This is needed when DPP operations are
// not available, and it works for both wave32 and wave64.
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST.getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for the accumulator, the exclusive-scan output, and the
  // still-to-be-processed lanes.
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform writelane if intermediate scan results are required later in the
  // kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Clear the bit of the lane that has just been processed so the loop moves
  // on to the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}
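
// Sketch of the generated control flow (illustrative):
//
//   entry:        %ballot = ballot(true); br label %ComputeLoop
//   ComputeLoop:  pick the lowest set bit of the remaining-lanes mask (cttz),
//                 readlane that lane's value, writelane the running total into
//                 that lane's slot of the exclusive-scan result, accumulate,
//                 clear the bit, loop until the mask is zero
//   ComputeEnd:   the accumulator holds the full wavefront reduction
//
// The loop runs once per active lane, so it is slower than the DPP path but
// only relies on readlane/writelane support.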

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // atomicrmw fmin/fmax behave like llvm.minnum/maxnum, so NaN is the
    // closest thing they have to an identity.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

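// Identity values used above, for reference (illustrative summary): add, sub,
// or, xor and umax use 0; and and umin use ~0; smax uses the signed minimum;
// smin uses the signed maximum; fadd uses -0.0; fsub uses +0.0; and fmin/fmax
// fall back to NaN, since minnum/maxnum ignore a NaN input when the other
// operand is a number.
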
void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BBs.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator->getIterator());
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  const bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL.getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST.getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST.isWave32()) {
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo,
                              {ExtractLo, B.getInt32(0)});
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with an add operation and let a single
  // lane subtract the reduced value afterwards.
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the values
  // using a scan.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value,
      // so that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST.hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST.getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  //   entry --> single_lane --> exit
  //        \--------------------->^
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. For the iterative strategy we
  // also need to move the branch created by SplitBlockAndInsertIfThen from
  // I's block to the ComputeEnd block, and to remember which block now
  // precedes the single-lane check.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from OriginalBB to ComputeEnd, make sure we
    // move the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single_lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront.
    Value *ReadlaneVal = PHI;
    if (TyBitWidth < 32)
      ReadlaneVal = B.CreateZExt(PHI, B.getInt32Ty());

    Value *BroadcastI = B.CreateIntrinsic(
        ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
    if (TyBitWidth < 32)
      BroadcastI = B.CreateTrunc(BroadcastI, Ty);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane to get our lane's share of the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub), but the value we calculated
      // is V * +0.0, which might have the wrong sign or might be NaN (if V is
      // inf or NaN).
      //
      // For all floating point ops, if the in-memory value was a NaN then the
      // binop we just built might return the wrong sign or a different NaN.
      //
      // Because we don't want to have to track this through the whole
      // transformation, in these cases we simply use whatever the first active
      // lane's value was.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}
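
// Resulting IR shape for the DPP path (illustrative, details elided):
//
//   entry:
//     %scan   = ...strict_wwm scan of the per-lane values...
//     %cond   = icmp eq i32 %mbcnt, 0
//     br i1 %cond, label %single_lane, label %exit
//   single_lane:                        ; exactly one lane reaches this block
//     %old = atomicrmw add ptr %p, i32 %wave_total monotonic
//     br label %exit
//   exit:
//     %phi    = phi i32 [ poison, %entry ], [ %old, %single_lane ]
//     %bcast  = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %phi)
//     %result = add i32 %bcast, %lane_offset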

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}
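
// Usage note (assumption, based on how the target wires this pass up
// elsewhere): the ScanOptions value handed to createAMDGPUAtomicOptimizerPass
// is normally selected by the AMDGPU target from the
// -amdgpu-atomic-optimizer-strategy option (DPP, Iterative or None), with
// None disabling the pass in run().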