LLVM: lib/Target/AMDGPU/SIInsertWaitcnts.cpp Source File


44using namespace llvm;

45

46#define DEBUG_TYPE "si-insert-waitcnts"

47

49 "Force emit s_waitcnt expcnt(0) instrs");

51 "Force emit s_waitcnt lgkmcnt(0) instrs");

53 "Force emit s_waitcnt vmcnt(0) instrs");

54

57 cl::desc("Force all waitcnt instrs to be emitted as "

58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),

60

62 "amdgpu-waitcnt-load-forcezero",

63 cl::desc("Force all waitcnt load counters to wait until 0"),

65

66namespace {

67

68

69

70

71enum InstCounterType {

72 LOAD_CNT = 0,

73 DS_CNT,

74 EXP_CNT,

75 STORE_CNT,

76 NUM_NORMAL_INST_CNTS,

77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS,

78 BVH_CNT,

79 KM_CNT,

80 X_CNT,

81 NUM_EXTENDED_INST_CNTS,

82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS

83};
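
// Counter kinds tracked by this pass. Pre-GFX12 targets use only the first
// four (reported to hardware as vmcnt/lgkmcnt/expcnt/vscnt); GFX12+ splits the
// memory counters further into SAMPLE_CNT, BVH_CNT, KM_CNT and X_CNT, as
// selected by MaxCounter (NUM_NORMAL_INST_CNTS vs NUM_EXTENDED_INST_CNTS).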

84}

85

86namespace llvm {

90}

91

92namespace {

93

94

95

96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {

97 return enum_seq(LOAD_CNT, MaxCounter);

98}

99

100using RegInterval = std::pair<int, int>;

101

102struct HardwareLimits {

103 unsigned LoadcntMax;

104 unsigned ExpcntMax;

105 unsigned DscntMax;

106 unsigned StorecntMax;

107 unsigned SamplecntMax;

108 unsigned BvhcntMax;

109 unsigned KmcntMax;

110 unsigned XcntMax;

111};
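
// Maximum value each counter can be waited on for the current subtarget,
// queried through SIInsertWaitcnts::getWaitCountMax(); waits computed by the
// pass are clamped against these limits.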

112

113#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \

114 DECL(VMEM_ACCESS) \

115 DECL(VMEM_READ_ACCESS) \

116 DECL(VMEM_SAMPLER_READ_ACCESS) \

117 DECL(VMEM_BVH_READ_ACCESS) \

118 DECL(VMEM_WRITE_ACCESS) \

119 DECL(SCRATCH_WRITE_ACCESS) \

120 DECL(VMEM_GROUP) \

121 DECL(LDS_ACCESS) \

122 DECL(GDS_ACCESS) \

123 DECL(SQ_MESSAGE) \

124 DECL(SCC_WRITE) \

125 DECL(SMEM_ACCESS) \

126 DECL(SMEM_GROUP) \

127 DECL(EXP_GPR_LOCK) \

128 DECL(GDS_GPR_LOCK) \

129 DECL(EXP_POS_ACCESS) \

130 DECL(EXP_PARAM_ACCESS) \

131 DECL(VMW_GPR_LOCK) \

132 DECL(EXP_LDS_ACCESS)

133

134

135#define AMDGPU_EVENT_ENUM(Name) Name,

136enum WaitEventType {

138 NUM_WAIT_EVENTS

139};

140#undef AMDGPU_EVENT_ENUM

141

142#define AMDGPU_EVENT_NAME(Name) #Name,

143static constexpr StringLiteral WaitEventTypeName[] = {

145};

146#undef AMDGPU_EVENT_NAME

147

148

149

150

151

152

153

154

155

156enum RegisterMapping {

157 SQ_MAX_PGM_VGPRS = 2048,

158 AGPR_OFFSET = 512,

159 SQ_MAX_PGM_SGPRS = 128,

160

161

162

163

164

165 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS,

166 NUM_LDS_VGPRS = 9,

167 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS,

168 NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,

169

170 SCC = NUM_ALL_ALLOCATABLE

171};
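
// The score arrays index registers through a single flat numbering: VGPRs
// first (with AGPRs placed at AGPR_OFFSET), a small block of pseudo-registers
// modelling LDS DMA stores (FIRST_LDS_VGPR .. FIRST_LDS_VGPR + NUM_LDS_VGPRS
// - 1), then SGPRs, and finally one dedicated slot for SCC.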

172

173

174

175

176

177

178enum VmemType {

179

180 VMEM_NOSAMPLER,

181

182 VMEM_SAMPLER,

183

184 VMEM_BVH,

185 NUM_VMEM_TYPES

186};
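
// VMEM loads are classified by how their results return (no-sampler, sampler,
// BVH). Results of different classes may complete out of order with respect to
// one another, so a VGPR that still has pending writes of a different class
// than the current instruction cannot rely on in-order completion and must be
// waited on (see hasOtherPendingVmemTypes below).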

187

188

189

190

191static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {

192 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,

193 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,

194 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};

195

196static bool updateVMCntOnly(const MachineInstr &Inst) {

199}

200

201#ifndef NDEBUG

202static bool isNormalMode(InstCounterType MaxCounter) {

203 return MaxCounter == NUM_NORMAL_INST_CNTS;

204}

205#endif

206

207VmemType getVmemType(const MachineInstr &Inst) {

208 assert(updateVMCntOnly(Inst));

210 return VMEM_NOSAMPLER;

214

215 if (BaseInfo->BVH)

216 return VMEM_BVH;

217

218

219

220

222 return VMEM_SAMPLER;

223

224 return VMEM_NOSAMPLER;

225}

226

227static unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {

228 switch (T) {

229 case LOAD_CNT:

230 return Wait.LoadCnt;

231 case EXP_CNT:

232 return Wait.ExpCnt;

233 case DS_CNT:

234 return Wait.DsCnt;

235 case STORE_CNT:

236 return Wait.StoreCnt;

237 case SAMPLE_CNT:

238 return Wait.SampleCnt;

239 case BVH_CNT:

240 return Wait.BvhCnt;

241 case KM_CNT:

242 return Wait.KmCnt;

243 case X_CNT:

244 return Wait.XCnt;

245 default:

246 llvm_unreachable("bad InstCounterType");

247 }

248}

249

250static void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {

251 unsigned &WC = getCounterRef(Wait, T);

252 WC = std::min(WC, Count);

253}

254

255static void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {

256 getCounterRef(Wait, T) = ~0u;

257}

258

259static unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {

260 return getCounterRef(Wait, T);

261}

262

263

264InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {

265 for (auto T : inst_counter_types()) {

266 if (masks[T] & (1 << E))

267 return T;

268 }

269 llvm_unreachable("event type has no associated counter");

270}

271

272class WaitcntBrackets;

273

274

275

276

277

278

279

280class WaitcntGenerator {

281protected:

282 const GCNSubtarget *ST = nullptr;

283 const SIInstrInfo *TII = nullptr;

284 AMDGPU::IsaVersion IV;

285 InstCounterType MaxCounter;

286 bool OptNone;

287

288public:

289 WaitcntGenerator() = default;

290 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)

291 : ST(&MF.getSubtarget()), TII(ST->getInstrInfo()),

295

296

297

298 bool isOptNone() const { return OptNone; }

299

300

301

302

303

304

305

306

307

308

309

310

311 virtual bool

312 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,

313 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

315

316

317 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

318

319

320

321 virtual bool createNewWaitcnt(MachineBasicBlock &Block,

323 AMDGPU::Waitcnt Wait) = 0;

324

325

326

327 virtual const unsigned *getWaitEventMask() const = 0;

328

329

330

331 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

332

333 virtual ~WaitcntGenerator() = default;

334

335

336 static constexpr unsigned

337 eventMask(std::initializer_list Events) {

338 unsigned Mask = 0;

339 for (auto &E : Events)

341

343 }

344};
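
// eventMask() folds a list of WaitEventType values into a bitmask, e.g.
// eventMask({SMEM_ACCESS, LDS_ACCESS}) ==
//     (1 << SMEM_ACCESS) | (1 << LDS_ACCESS).
// getWaitEventMask() returns one such mask per counter, defining which events
// each counter tracks on the current target.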

345

346class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {

347public:

348 using WaitcntGenerator::WaitcntGenerator;

349

350 bool

351 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,

352 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

354

355 bool createNewWaitcnt(MachineBasicBlock &Block,

357 AMDGPU::Waitcnt Wait) override;

358

359 const unsigned *getWaitEventMask() const override {

361

362 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {

363 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,

364 VMEM_BVH_READ_ACCESS}),

365 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),

366 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,

367 EXP_POS_ACCESS, EXP_LDS_ACCESS}),

368 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),

369 0,

370 0,

371 0,

372 0};

373

374 return WaitEventMaskForInstPreGFX12;

375 }

376

377 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;

378};

379

380class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {

381public:

382 using WaitcntGenerator::WaitcntGenerator;

383

384 bool

385 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,

386 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,

388

389 bool createNewWaitcnt(MachineBasicBlock &Block,

391 AMDGPU::Waitcnt Wait) override;

392

393 const unsigned *getWaitEventMask() const override {

395

396 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {

397 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),

398 eventMask({LDS_ACCESS, GDS_ACCESS}),

399 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,

400 EXP_POS_ACCESS, EXP_LDS_ACCESS}),

401 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),

402 eventMask({VMEM_SAMPLER_READ_ACCESS}),

403 eventMask({VMEM_BVH_READ_ACCESS}),

404 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),

405 eventMask({VMEM_GROUP, SMEM_GROUP})};

406

407 return WaitEventMaskForInstGFX12Plus;

408 }

409

410 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;

411};
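
// Two generator flavours: WaitcntGeneratorPreGFX12 emits the legacy combined
// S_WAITCNT (vmcnt/expcnt/lgkmcnt) plus a separate S_WAITCNT_VSCNT, while
// WaitcntGeneratorGFX12Plus emits one S_WAIT_* instruction per counter and can
// use the fused S_WAIT_LOADCNT_DSCNT / S_WAIT_STORECNT_DSCNT encodings.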

412

413class SIInsertWaitcnts {

414public:

415 const GCNSubtarget *ST;

416 const SIInstrInfo *TII = nullptr;

417 const SIRegisterInfo *TRI = nullptr;

418 const MachineRegisterInfo *MRI = nullptr;

419 InstCounterType SmemAccessCounter;

420 InstCounterType MaxCounter;

421 const unsigned *WaitEventMaskForInst;

422

423private:

424 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;

425 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;

426 MachineLoopInfo *MLI;

427 MachinePostDominatorTree *PDT;

429

430 struct BlockInfo {

431 std::unique_ptr Incoming;

432 bool Dirty = true;

433 };

434

435 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

436

437 bool ForceEmitWaitcnt[NUM_INST_CNTS];

438

439

440

441

442 WaitcntGeneratorPreGFX12 WCGPreGFX12;

443 WaitcntGeneratorGFX12Plus WCGGFX12Plus;

444

445 WaitcntGenerator *WCG = nullptr;

446

447

448

449 DenseSet<MachineInstr *> ReleaseVGPRInsts;

450

451 HardwareLimits Limits;

452

453public:

454 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,

456 : MLI(MLI), PDT(PDT), AA(AA) {

457 (void)ForceExpCounter;

458 (void)ForceLgkmCounter;

459 (void)ForceVMCounter;

460 }

461

462 unsigned getWaitCountMax(InstCounterType T) const {

463 switch (T) {

464 case LOAD_CNT:

465 return Limits.LoadcntMax;

466 case DS_CNT:

467 return Limits.DscntMax;

468 case EXP_CNT:

469 return Limits.ExpcntMax;

470 case STORE_CNT:

471 return Limits.StorecntMax;

472 case SAMPLE_CNT:

473 return Limits.SamplecntMax;

474 case BVH_CNT:

475 return Limits.BvhcntMax;

476 case KM_CNT:

477 return Limits.KmcntMax;

478 case X_CNT:

479 return Limits.XcntMax;

480 default:

481 break;

482 }

483 return 0;

484 }

485

486 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);

487 bool isPreheaderToFlush(MachineBasicBlock &MBB,

488 const WaitcntBrackets &ScoreBrackets);

489 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;

490 bool run(MachineFunction &MF);

491

492 void setForceEmitWaitcnt() {

493

494

495#ifndef NDEBUG

498 ForceEmitWaitcnt[EXP_CNT] = true;

499 } else {

500 ForceEmitWaitcnt[EXP_CNT] = false;

501 }

502

505 ForceEmitWaitcnt[DS_CNT] = true;

506 ForceEmitWaitcnt[KM_CNT] = true;

507 } else {

508 ForceEmitWaitcnt[DS_CNT] = false;

509 ForceEmitWaitcnt[KM_CNT] = false;

510 }

511

514 ForceEmitWaitcnt[LOAD_CNT] = true;

515 ForceEmitWaitcnt[SAMPLE_CNT] = true;

516 ForceEmitWaitcnt[BVH_CNT] = true;

517 } else {

518 ForceEmitWaitcnt[LOAD_CNT] = false;

519 ForceEmitWaitcnt[SAMPLE_CNT] = false;

520 ForceEmitWaitcnt[BVH_CNT] = false;

521 }

522#endif

523 }

524

525

526

527 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {

529

530 case AMDGPU::GLOBAL_INV:

531 return VMEM_READ_ACCESS;

532 case AMDGPU::GLOBAL_WB:

533 case AMDGPU::GLOBAL_WBINV:

534 return VMEM_WRITE_ACCESS;

535 default:

536 break;

537 }

538

539

540 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {

541 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

542

544

545

547 return VMEM_ACCESS;

550 if (TII->mayAccessScratch(Inst))

551 return SCRATCH_WRITE_ACCESS;

552 return VMEM_WRITE_ACCESS;

553 }

555 return VMEM_READ_ACCESS;

556 return VmemReadMapping[getVmemType(Inst)];

557 }

558

559 bool isVmemAccess(const MachineInstr &MI) const;

560 bool generateWaitcntInstBefore(MachineInstr &MI,

561 WaitcntBrackets &ScoreBrackets,

562 MachineInstr *OldWaitcntInstr,

563 bool FlushVmCnt);

564 bool generateWaitcnt(AMDGPU::Waitcnt Wait,

566 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,

567 MachineInstr *OldWaitcntInstr);

568 void updateEventWaitcntAfter(MachineInstr &Inst,

569 WaitcntBrackets *ScoreBrackets);

571 MachineBasicBlock *Block) const;

572 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,

573 WaitcntBrackets &ScoreBrackets);

574 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,

575 WaitcntBrackets &ScoreBrackets);

576};

577

578

579

580

581

582

583

584

585
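
// WaitcntBrackets models, per counter, a "score bracket" [ScoreLB, ScoreUB]:
// ScoreUB is incremented for every event the counter tracks, and each register
// written (or, for EXP_CNT, read) by such an event records the new upper bound
// as its score. The number of events still outstanding for a register is then
// ScoreUB minus its score, which determineWait() converts into a wait count.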

586class WaitcntBrackets {

587public:

588 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}

589

590 bool isSmemCounter(InstCounterType T) const {

591 return T == Context->SmemAccessCounter || T == X_CNT;

592 }

593

594 unsigned getSgprScoresIdx(InstCounterType T) const {

595 assert(isSmemCounter(T) && "Invalid SMEM counter");

596 return T == X_CNT ? 1 : 0;

597 }

598

599 unsigned getScoreLB(InstCounterType T) const {

600 assert(T < NUM_INST_CNTS);

601 return ScoreLBs[T];

602 }

603

604 unsigned getScoreUB(InstCounterType T) const {

605 assert(T < NUM_INST_CNTS);

606 return ScoreUBs[T];

607 }

608

609 unsigned getScoreRange(InstCounterType T) const {

610 return getScoreUB(T) - getScoreLB(T);

611 }

612

613 unsigned getRegScore(int GprNo, InstCounterType T) const {

614 if (GprNo < NUM_ALL_VGPRS)

615 return VgprScores[T][GprNo];

616

617 if (GprNo < NUM_ALL_ALLOCATABLE)

618 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];

619

620 assert(GprNo == SCC);

621 return SCCScore;

622 }

623

624 bool merge(const WaitcntBrackets &Other);

625

626 RegInterval getRegInterval(const MachineInstr *MI,

627 const MachineOperand &Op) const;

628

629 bool counterOutOfOrder(InstCounterType T) const;

630 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);

631 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;

632 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);

633 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);

634 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);

635

636 void determineWait(InstCounterType T, RegInterval Interval,

637 AMDGPU::Waitcnt &Wait) const;

638 void determineWait(InstCounterType T, int RegNo,

639 AMDGPU::Waitcnt &Wait) const {

640 determineWait(T, {RegNo, RegNo + 1}, Wait);

641 }

642 void tryClearSCCWriteEvent(MachineInstr *Inst);

643

644 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);

645 void applyWaitcnt(InstCounterType T, unsigned Count);

646 void updateByEvent(WaitEventType E, MachineInstr &MI);

647

648 unsigned hasPendingEvent() const { return PendingEvents; }

649 unsigned hasPendingEvent(WaitEventType E) const {

650 return PendingEvents & (1 << E);

651 }

652 unsigned hasPendingEvent(InstCounterType T) const {

653 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];

654 assert((HasPending != 0) == (getScoreRange(T) != 0));

655 return HasPending;

656 }

657

658 bool hasMixedPendingEvents(InstCounterType T) const {

659 unsigned Events = hasPendingEvent(T);

660

661 return Events & (Events - 1);

662 }

663

664 bool hasPendingFlat() const {

665 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&

666 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||

667 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&

668 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));

669 }

670

671 void setPendingFlat() {

672 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];

673 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];

674 }

675

676 bool hasPendingGDS() const {

677 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];

678 }

679

680 unsigned getPendingGDSWait() const {

681 return std::min(getScoreUB(DS_CNT) - LastGDS,

682 Context->getWaitCountMax(DS_CNT) - 1);

683 }

684

685 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }

686

687

688

689 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {

690 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

691 assert(RegNo < NUM_ALL_VGPRS);

692 if (VgprVmemTypes[RegNo] & ~(1 << V))

693 return true;

694 }

695 return false;

696 }

697

698 void clearVgprVmemTypes(RegInterval Interval) {

699 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

700 assert(RegNo < NUM_ALL_VGPRS);

701 VgprVmemTypes[RegNo] = 0;

702 }

703 }

704

705 void setStateOnFunctionEntryOrReturn() {

706 setScoreUB(STORE_CNT,

707 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));

708 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];

709 }

710

711 ArrayRef<const MachineInstr *> getLDSDMAStores() const {

712 return LDSDMAStores;

713 }

714

715 bool hasPointSampleAccel(const MachineInstr &MI) const;

716 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,

718

719 void print(raw_ostream &) const;

721

722private:

723 struct MergeInfo {

724 unsigned OldLB;

725 unsigned OtherLB;

726 unsigned MyShift;

727 unsigned OtherShift;

728 };

729 static bool mergeScore(const MergeInfo &M, unsigned &Score,

730 unsigned OtherScore);

731

732 void setScoreLB(InstCounterType T, unsigned Val) {

733 assert(T < NUM_INST_CNTS);

734 ScoreLBs[T] = Val;

735 }

736

737 void setScoreUB(InstCounterType T, unsigned Val) {

738 assert(T < NUM_INST_CNTS);

739 ScoreUBs[T] = Val;

740

741 if (T != EXP_CNT)

742 return;

743

744 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))

745 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);

746 }

747

748 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {

749 setScoreByInterval({GprNo, GprNo + 1}, T, Val);

750 }

751

752 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,

753 unsigned Score);

754

755 void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,

756 InstCounterType CntTy, unsigned Val);

757

758 const SIInsertWaitcnts *Context;

759

760 unsigned ScoreLBs[NUM_INST_CNTS] = {0};

761 unsigned ScoreUBs[NUM_INST_CNTS] = {0};

762 unsigned PendingEvents = 0;

763

764 unsigned LastFlat[NUM_INST_CNTS] = {0};

765

766 unsigned LastGDS = 0;

767

768

769 int VgprUB = -1;

770 int SgprUB = -1;

771 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};

772

773

774

775

776 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};

777

778 unsigned SCCScore = 0;

779

780 const MachineInstr *PendingSCCWrite = nullptr;

781

782

783 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};

784

785

786 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;

787};

788

790public:

791 static char ID;

792 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

793

794 bool runOnMachineFunction(MachineFunction &MF) override;

795

796 StringRef getPassName() const override {

797 return "SI insert wait instructions";

798 }

799

800 void getAnalysisUsage(AnalysisUsage &AU) const override {

802 AU.addRequired();

803 AU.addRequired();

807 }

808};

809

810}

811
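
// Map a machine operand onto a half-open interval [first, second) of the flat
// register numbering used by the score arrays (VGPRs/AGPRs, LDS pseudo-slots,
// SGPRs, SCC). Operands that are not tracked return {-1, -1}.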

812RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,

814 if (Op.getReg() == AMDGPU::SCC)

815 return {SCC, SCC + 1};

816

817 const SIRegisterInfo *TRI = Context->TRI;

818 const MachineRegisterInfo *MRI = Context->MRI;

819

820 if (TRI->isInAllocatableClass(Op.getReg()))

821 return {-1, -1};

822

823

824

825 assert(Op.getSubReg() || Op.isUndef());

826

828

830 unsigned RegIdx = TRI->getHWRegIndex(MCReg);

831

832 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());

833 unsigned Size = TRI->getRegSizeInBits(*RC);

834

835

836 if (TRI->isVectorRegister(*MRI, Op.getReg())) {

840 if (TRI->isAGPR(*MRI, Op.getReg()))

841 Result.first += AGPR_OFFSET;

845

846 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {

847

848

851 else

853 }

854 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {

855

856

857 Result.first = RegIdx + NUM_ALL_VGPRS;

859 } else {

860 return {-1, -1};

861 }

862

864}

865

866void WaitcntBrackets::setScoreByInterval(RegInterval Interval,

867 InstCounterType CntTy,

868 unsigned Score) {

869 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

870 if (RegNo < NUM_ALL_VGPRS) {

871 VgprUB = std::max(VgprUB, RegNo);

872 VgprScores[CntTy][RegNo] = Score;

873 } else if (RegNo < NUM_ALL_ALLOCATABLE) {

874 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);

875 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;

876 } else {

877 assert(RegNo == SCC);

878 SCCScore = Score;

879 }

880 }

881}

882

883void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,

884 const MachineOperand &Op,

885 InstCounterType CntTy, unsigned Score) {

886 RegInterval Interval = getRegInterval(MI, Op);

887 setScoreByInterval(Interval, CntTy, Score);

888}

889

890

891

892

893

894

895bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {

897 return false;

898

900 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =

903}

904

905

906

907

908

909

910bool WaitcntBrackets::hasPointSamplePendingVmemTypes(

911 const MachineInstr &MI, RegInterval Interval) const {

912 if (!hasPointSampleAccel(MI))

913 return false;

914

915 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);

916}

917

918void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {

919 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);

921

922 unsigned UB = getScoreUB(T);

923 unsigned CurrScore = UB + 1;

924 if (CurrScore == 0)

925 report_fatal_error("InsertWaitcnt score wraparound");

926

927

928

929 PendingEvents |= 1 << E;

930 setScoreUB(T, CurrScore);

931

932 const SIRegisterInfo *TRI = Context->TRI;

933 const MachineRegisterInfo *MRI = Context->MRI;

934 const SIInstrInfo *TII = Context->TII;

935

936 if (T == EXP_CNT) {

937

938

940

941

942 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))

943 setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);

944

946 if (const auto *Data0 =

947 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))

948 setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);

949 if (const auto *Data1 =

950 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))

951 setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);

953 Inst.getOpcode() != AMDGPU::DS_APPEND &&

954 Inst.getOpcode() != AMDGPU::DS_CONSUME &&

955 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {

956 for (const MachineOperand &Op : Inst.all_uses()) {

957 if (TRI->isVectorRegister(*MRI, Op.getReg()))

958 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);

959 }

960 }

961 } else if (TII->isFLAT(Inst)) {

963 setScoreByOperand(&Inst,

964 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),

965 EXP_CNT, CurrScore);

967 setScoreByOperand(&Inst,

968 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),

969 EXP_CNT, CurrScore);

970 }

971 } else if (TII->isMIMG(Inst)) {

973 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);

975 setScoreByOperand(&Inst,

976 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),

977 EXP_CNT, CurrScore);

978 }

979 } else if (TII->isMTBUF(Inst)) {

981 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);

982 } else if (TII->isMUBUF(Inst)) {

984 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);

986 setScoreByOperand(&Inst,

987 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),

988 EXP_CNT, CurrScore);

989 }

990 } else if (TII->isLDSDIR(Inst)) {

991

992 setScoreByOperand(&Inst,

993 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),

994 EXP_CNT, CurrScore);

995 } else {

996 if (TII->isEXP(Inst)) {

997

998

999

1000

1001 for (MachineOperand &DefMO : Inst.all_defs()) {

1002 if (TRI->isVGPR(*MRI, DefMO.getReg())) {

1003 setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);

1004 }

1005 }

1006 }

1007 for (const MachineOperand &Op : Inst.all_uses()) {

1008 if (TRI->isVectorRegister(*MRI, Op.getReg()))

1009 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);

1010 }

1011 }

1012 } else if (T == X_CNT) {

1013 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;

1014 if (PendingEvents & (1 << OtherEvent)) {

1015

1016

1017

1018

1019 setScoreLB(T, getScoreUB(T) - 1);

1020 PendingEvents &= ~(1 << OtherEvent);

1021 }

1022 for (const MachineOperand &Op : Inst.all_uses())

1023 setScoreByOperand(&Inst, Op, T, CurrScore);

1024 } else {

1025

1026

1027

1028

1029

1030

1031

1032

1033

1034 for (const MachineOperand &Op : Inst.defs()) {

1035 RegInterval Interval = getRegInterval(&Inst, Op);

1036 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {

1037 if (Interval.first >= NUM_ALL_VGPRS)

1038 continue;

1039 if (updateVMCntOnly(Inst)) {

1040

1041

1042

1044 VmemType V = getVmemType(Inst);

1045 unsigned char TypesMask = 1 << V;

1046

1047

1048 if (hasPointSampleAccel(Inst))

1049 TypesMask |= 1 << VMEM_NOSAMPLER;

1050 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)

1051 VgprVmemTypes[RegNo] |= TypesMask;

1052 }

1053 }

1054 setScoreByInterval(Interval, T, CurrScore);

1055 }

1057 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {

1058

1059

1060 unsigned Slot = 0;

1061 for (const auto *MemOp : Inst.memoperands()) {

1062 if (!MemOp->isStore() ||

1064 continue;

1065

1066

1067 auto AAI = MemOp->getAAInfo();

1068

1069

1070

1071

1072

1073

1074

1075 if (!AAI || !AAI.Scope)

1076 break;

1077 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {

1078 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {

1079 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {

1080 Slot = I + 1;

1081 break;

1082 }

1083 }

1084 }

1085 if (Slot)

1086 break;

1087

1088

1089

1090 LDSDMAStores.push_back(&Inst);

1091 Slot = LDSDMAStores.size();

1092 break;

1093 }

1094 if (Slot < NUM_LDS_VGPRS)

1095 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);

1096 if (Slot)

1097 setRegScore(FIRST_LDS_VGPR, T, CurrScore);

1098 }

1099

1101 setRegScore(SCC, T, CurrScore);

1102 PendingSCCWrite = &Inst;

1103 }

1104 }

1105}
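
// updateByEvent() is the producer side of the bracket model: it bumps the
// counter's upper bound for the new event and stamps that score onto every
// register the instruction defines (plus source VGPRs for EXP_CNT events, the
// LDS DMA pseudo-slots, and the SCC slot for SCC_WRITE), so later consumers
// can compute how many events must retire before their operands are safe.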

1106

1107void WaitcntBrackets::print(raw_ostream &OS) const {

1108 const GCNSubtarget *ST = Context->ST;

1109

1110 OS << '\n';

1111 for (auto T : inst_counter_types(Context->MaxCounter)) {

1112 unsigned SR = getScoreRange(T);

1113

1114 switch (T) {

1115 case LOAD_CNT:

1116 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("

1117 << SR << "): ";

1118 break;

1119 case DS_CNT:

1120 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("

1121 << SR << "): ";

1122 break;

1123 case EXP_CNT:

1124 OS << " EXP_CNT(" << SR << "): ";

1125 break;

1126 case STORE_CNT:

1127 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("

1128 << SR << "): ";

1129 break;

1130 case SAMPLE_CNT:

1131 OS << " SAMPLE_CNT(" << SR << "): ";

1132 break;

1133 case BVH_CNT:

1134 OS << " BVH_CNT(" << SR << "): ";

1135 break;

1136 case KM_CNT:

1137 OS << " KM_CNT(" << SR << "): ";

1138 break;

1139 case X_CNT:

1140 OS << " X_CNT(" << SR << "): ";

1141 break;

1142 default:

1143 OS << " UNKNOWN(" << SR << "): ";

1144 break;

1145 }

1146

1147 if (SR != 0) {

1148

1149 unsigned LB = getScoreLB(T);

1150

1151 for (int J = 0; J <= VgprUB; J++) {

1152 unsigned RegScore = getRegScore(J, T);

1153 if (RegScore <= LB)

1154 continue;

1155 unsigned RelScore = RegScore - LB - 1;

1156 if (J < FIRST_LDS_VGPR) {

1157 OS << RelScore << ":v" << J << " ";

1158 } else {

1159 OS << RelScore << ":ds ";

1160 }

1161 }

1162

1163 if (isSmemCounter(T)) {

1164 for (int J = 0; J <= SgprUB; J++) {

1165 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);

1166 if (RegScore <= LB)

1167 continue;

1168 unsigned RelScore = RegScore - LB - 1;

1169 OS << RelScore << ":s" << J << " ";

1170 }

1171 }

1172 if (T == KM_CNT && SCCScore > 0)

1173 OS << SCCScore << ":scc ";

1174 }

1175 OS << '\n';

1176 }

1177

1178 OS << "Pending Events: ";

1179 if (hasPendingEvent()) {

1180 ListSeparator LS;

1181 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {

1182 if (hasPendingEvent((WaitEventType)I)) {

1183 OS << LS << WaitEventTypeName[I];

1184 }

1185 }

1186 } else {

1187 OS << "none";

1188 }

1189 OS << '\n';

1190

1191 OS << '\n';

1192}

1193

1194

1195

1196void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {

1197 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);

1198 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);

1199 simplifyWaitcnt(DS_CNT, Wait.DsCnt);

1200 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);

1201 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);

1202 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);

1203 simplifyWaitcnt(KM_CNT, Wait.KmCnt);

1205}

1206

1207void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,

1208 unsigned &Count) const {

1209

1210

1211

1212 if (Count >= getScoreRange(T))

1213 Count = ~0u;

1214}

1215

1216void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,

1217 AMDGPU::Waitcnt &Wait) const {

1218 const unsigned LB = getScoreLB(T);

1219 const unsigned UB = getScoreUB(T);

1220 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

1221 unsigned ScoreToWait = getRegScore(RegNo, T);

1222

1223

1224

1225 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {

1226 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&

1227 Context->ST->hasFlatLgkmVMemCountInOrder()) {

1228

1229

1230

1231 addWait(Wait, T, 0);

1232 } else if (counterOutOfOrder(T)) {

1233

1234

1235

1236 addWait(Wait, T, 0);

1237 } else {

1238

1239

1240 unsigned NeededWait =

1241 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);

1242 addWait(Wait, T, NeededWait);

1243 }

1244 }

1245 }

1246}
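
// Worked example (a sketch with assumed numbers): with ScoreUB(LOAD_CNT) == 10,
// a register score of 7 and a hardware maximum of 63, the required wait is
//   min(10 - 7, 63 - 1) == 3,
// i.e. "loadcnt(3)" lets the three loads issued afterwards stay outstanding
// while guaranteeing the tracked load has completed. Out-of-order counters and
// pending FLAT accesses instead fall back to a full wait of 0.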

1247

1248void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {

1249

1250

1251 if (PendingSCCWrite &&

1252 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&

1254 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;

1255

1256 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==

1257 SCC_WRITE_PendingEvent) {

1258 setScoreLB(KM_CNT, getScoreUB(KM_CNT));

1259 }

1260

1261 PendingEvents &= ~SCC_WRITE_PendingEvent;

1262 PendingSCCWrite = nullptr;

1263 }

1264}

1265

1266void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {

1267 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);

1268 applyWaitcnt(EXP_CNT, Wait.ExpCnt);

1269 applyWaitcnt(DS_CNT, Wait.DsCnt);

1270 applyWaitcnt(STORE_CNT, Wait.StoreCnt);

1271 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);

1272 applyWaitcnt(BVH_CNT, Wait.BvhCnt);

1273 applyWaitcnt(KM_CNT, Wait.KmCnt);

1274 applyWaitcnt(X_CNT, Wait.XCnt);

1275}

1276

1277void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {

1278 const unsigned UB = getScoreUB(T);

1279 if (Count >= UB)

1280 return;

1281 if (Count != 0) {

1282 if (counterOutOfOrder(T))

1283 return;

1284 setScoreLB(T, std::max(getScoreLB(T), UB - Count));

1285 } else {

1286 setScoreLB(T, UB);

1287 PendingEvents &= ~Context->WaitEventMaskForInst[T];

1288 }

1289}

1290

1291bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {

1292

1293

1294

1295 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);

1296}

1297

1298bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {

1299

1300

1301

1302 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&

1303 !hasPendingEvent(STORE_CNT);

1304}

1305

1306void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,

1307 AMDGPU::Waitcnt &UpdateWait) {

1308

1309

1310

1311

1312

1313 if (hasRedundantXCntWithKmCnt(CheckWait)) {

1314 if (!hasMixedPendingEvents(X_CNT)) {

1315 applyWaitcnt(X_CNT, 0);

1316 } else {

1317 PendingEvents &= ~(1 << SMEM_GROUP);

1318 }

1319 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {

1320 if (!hasMixedPendingEvents(X_CNT)) {

1321 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));

1322 } else if (CheckWait.LoadCnt == 0) {

1323 PendingEvents &= ~(1 << VMEM_GROUP);

1324 }

1325 }

1326 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);

1327}

1328

1329

1330

1331bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {

1332

1333 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||

1334 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))

1335 return true;

1336 return hasMixedPendingEvents(T);

1337}
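
// A counter is "out of order" when its pending events may retire in an
// unpredictable order (mixed event kinds, or pending SMEM/SMEM_GROUP
// accesses): a partial count is then meaningless, so determineWait() requests
// a wait of 0 for such counters instead of a precise value.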

1338

1340 false, false)

1345

1346char SIInsertWaitcntsLegacy::ID = 0;

1347

1349

1351 return new SIInsertWaitcntsLegacy();

1352}

1353

1355 unsigned NewEnc) {

1356 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);

1357 assert(OpIdx >= 0);

1358

1359 MachineOperand &MO = MI.getOperand(OpIdx);

1360

1361 if (NewEnc == MO.getImm())

1362 return false;

1363

1364 MO.setImm(NewEnc);

1365 return true;

1366}

1367

1368

1369

1371 switch (Opcode) {

1372 case AMDGPU::S_WAIT_LOADCNT:

1373 return LOAD_CNT;

1374 case AMDGPU::S_WAIT_EXPCNT:

1375 return EXP_CNT;

1376 case AMDGPU::S_WAIT_STORECNT:

1377 return STORE_CNT;

1378 case AMDGPU::S_WAIT_SAMPLECNT:

1379 return SAMPLE_CNT;

1380 case AMDGPU::S_WAIT_BVHCNT:

1381 return BVH_CNT;

1382 case AMDGPU::S_WAIT_DSCNT:

1383 return DS_CNT;

1384 case AMDGPU::S_WAIT_KMCNT:

1385 return KM_CNT;

1386 case AMDGPU::S_WAIT_XCNT:

1387 return X_CNT;

1388 default:

1389 return {};

1390 }

1391}

1392

1393bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {

1395 if (Opcode == Waitcnt->getOpcode())

1396 return false;

1397

1399 return true;

1400}

1401

1402

1403

1404

1405

1406

1407bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(

1408 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,

1411 assert(isNormalMode(MaxCounter));

1412

1414 MachineInstr *WaitcntInstr = nullptr;

1415 MachineInstr *WaitcntVsCntInstr = nullptr;

1416

1418 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";

1420 dbgs() << "end of block\n";

1421 else

1422 dbgs() << *It;

1423 });

1424

1425 for (auto &II :

1428 if (II.isMetaInstruction()) {

1430 continue;

1431 }

1432

1434 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

1435

1436

1437

1438 if (Opcode == AMDGPU::S_WAITCNT) {

1439 unsigned IEnc = II.getOperand(0).getImm();

1441 if (TrySimplify)

1442 ScoreBrackets.simplifyWaitcnt(OldWait);

1443 Wait = Wait.combined(OldWait);

1444

1445

1446 if (WaitcntInstr || (Wait.hasWaitExceptStoreCnt() && TrySimplify)) {

1447 II.eraseFromParent();

1449 } else

1450 WaitcntInstr = &II;

1451 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {

1452 assert(ST->hasVMemToLDSLoad());

1453 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II

1454 << "Before: " << Wait.LoadCnt << '\n';);

1455 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);

1457

1458

1459

1460

1461

1462

1463

1464 II.eraseFromParent();

1465 } else {

1466 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);

1467 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

1468

1469 unsigned OldVSCnt =

1470 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

1471 if (TrySimplify)

1472 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);

1473 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

1474

1475 if (WaitcntVsCntInstr || (Wait.hasWaitStoreCnt() && TrySimplify)) {

1476 II.eraseFromParent();

1478 } else

1479 WaitcntVsCntInstr = &II;

1480 }

1481 }

1482

1483 if (WaitcntInstr) {

1486 Modified |= promoteSoftWaitCnt(WaitcntInstr);

1487

1488 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);

1489 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);

1490 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);

1491 Wait.LoadCnt = ~0u;

1492 Wait.ExpCnt = ~0u;

1493 Wait.DsCnt = ~0u;

1494

1497 << "applied pre-existing waitcnt\n"

1498 << "New Instr at block end: " << *WaitcntInstr << '\n'

1499 : dbgs() << "applied pre-existing waitcnt\n"

1500 << "Old Instr: " << *It

1501 << "New Instr: " << *WaitcntInstr << '\n');

1502 }

1503

1504 if (WaitcntVsCntInstr) {

1506 AMDGPU::OpName::simm16, Wait.StoreCnt);

1507 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

1508

1509 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);

1510 Wait.StoreCnt = ~0u;

1511

1513 ? dbgs() << "applied pre-existing waitcnt\n"

1514 << "New Instr at block end: " << *WaitcntVsCntInstr

1515 << '\n'

1516 : dbgs() << "applied pre-existing waitcnt\n"

1517 << "Old Instr: " << *It

1518 << "New Instr: " << *WaitcntVsCntInstr << '\n');

1519 }

1520

1522}

1523

1524

1525

1526bool WaitcntGeneratorPreGFX12::createNewWaitcnt(

1528 AMDGPU::Waitcnt Wait) {

1530 assert(isNormalMode(MaxCounter));

1531

1534

1535

1536

1537 if (Wait.hasWaitExceptStoreCnt()) {

1539 [[maybe_unused]] auto SWaitInst =

1542

1544 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;

1545 dbgs() << "New Instr: " << *SWaitInst << '\n');

1546 }

1547

1548 if (Wait.hasWaitStoreCnt()) {

1550

1551 [[maybe_unused]] auto SWaitInst =

1556

1558 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;

1559 dbgs() << "New Instr: " << *SWaitInst << '\n');

1560 }

1561

1563}

1564

1565AMDGPU::Waitcnt

1566WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {

1567 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);

1568}

1569

1570AMDGPU::Waitcnt

1571WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {

1572 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,

1573 ~0u );

1574}

1575

1576

1577

1578

1579

1580bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(

1581 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,

1584 assert(!isNormalMode(MaxCounter));

1585

1587 MachineInstr *CombinedLoadDsCntInstr = nullptr;

1588 MachineInstr *CombinedStoreDsCntInstr = nullptr;

1589 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

1590

1592 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";

1594 dbgs() << "end of block\n";

1595 else

1596 dbgs() << *It;

1597 });

1598

1599 for (auto &II :

1602 if (II.isMetaInstruction()) {

1604 continue;

1605 }

1606

1607 MachineInstr **UpdatableInstr;

1608

1609

1610

1611

1613 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

1614

1615

1616

1617 if (Opcode == AMDGPU::S_WAITCNT)

1618 continue;

1619

1620 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {

1621 unsigned OldEnc =

1622 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

1624 if (TrySimplify)

1625 ScoreBrackets.simplifyWaitcnt(OldWait);

1626 Wait = Wait.combined(OldWait);

1627 UpdatableInstr = &CombinedLoadDsCntInstr;

1628 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {

1629 unsigned OldEnc =

1630 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

1632 if (TrySimplify)

1633 ScoreBrackets.simplifyWaitcnt(OldWait);

1634 Wait = Wait.combined(OldWait);

1635 UpdatableInstr = &CombinedStoreDsCntInstr;

1636 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {

1637

1638

1639 II.eraseFromParent();

1640 continue;

1641 } else {

1643 assert(CT.has_value());

1644 unsigned OldCnt =

1645 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();

1646 if (TrySimplify)

1647 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);

1648 addWait(Wait, CT.value(), OldCnt);

1649 UpdatableInstr = &WaitInstrs[CT.value()];

1650 }

1651

1652

1653 if (!*UpdatableInstr) {

1654 *UpdatableInstr = &II;

1655 } else {

1656 II.eraseFromParent();

1658 }

1659 }

1660

1661

1662 AMDGPU::Waitcnt PreCombine = Wait;

1663 if (CombinedLoadDsCntInstr) {

1664

1665

1666

1667

1668

1669

1670

1671 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {

1674 AMDGPU::OpName::simm16, NewEnc);

1675 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);

1676 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);

1677 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);

1678 Wait.LoadCnt = ~0u;

1679 Wait.DsCnt = ~0u;

1680

1682 ? dbgs() << "applied pre-existing waitcnt\n"

1683 << "New Instr at block end: "

1684 << *CombinedLoadDsCntInstr << '\n'

1685 : dbgs() << "applied pre-existing waitcnt\n"

1686 << "Old Instr: " << *It << "New Instr: "

1687 << *CombinedLoadDsCntInstr << '\n');

1688 } else {

1691 }

1692 }

1693

1694 if (CombinedStoreDsCntInstr) {

1695

1696 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {

1699 AMDGPU::OpName::simm16, NewEnc);

1700 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);

1701 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);

1702 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);

1703 Wait.StoreCnt = ~0u;

1704 Wait.DsCnt = ~0u;

1705

1707 ? dbgs() << "applied pre-existing waitcnt\n"

1708 << "New Instr at block end: "

1709 << *CombinedStoreDsCntInstr << '\n'

1710 : dbgs() << "applied pre-existing waitcnt\n"

1711 << "Old Instr: " << *It << "New Instr: "

1712 << *CombinedStoreDsCntInstr << '\n');

1713 } else {

1716 }

1717 }

1718

1719

1720

1721

1722

1723

1724

1725 if (Wait.DsCnt != ~0u) {

1726

1727

1729

1730

1731

1732

1733

1734 if (Wait.LoadCnt != ~0u) {

1735 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);

1736 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);

1737 } else if (Wait.StoreCnt != ~0u) {

1738 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);

1739 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);

1740 }

1741

1742 for (MachineInstr **WI : WaitsToErase) {

1743 if (!*WI)

1744 continue;

1745

1746 (*WI)->eraseFromParent();

1747 *WI = nullptr;

1749 }

1750 }

1751

1752 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {

1753 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||

1754 (CT == LOAD_CNT &&

1755 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {

1756

1757

1758 ScoreBrackets.simplifyXcnt(PreCombine, Wait);

1759 }

1760 if (!WaitInstrs[CT])

1761 continue;

1762

1763 unsigned NewCnt = getWait(Wait, CT);

1764 if (NewCnt != ~0u) {

1766 AMDGPU::OpName::simm16, NewCnt);

1767 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

1768

1769 ScoreBrackets.applyWaitcnt(CT, NewCnt);

1770 setNoWait(Wait, CT);

1771

1773 ? dbgs() << "applied pre-existing waitcnt\n"

1774 << "New Instr at block end: " << *WaitInstrs[CT]

1775 << '\n'

1776 : dbgs() << "applied pre-existing waitcnt\n"

1777 << "Old Instr: " << *It

1778 << "New Instr: " << *WaitInstrs[CT] << '\n');

1779 } else {

1782 }

1783 }

1784

1786}

1787

1788

1789bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(

1791 AMDGPU::Waitcnt Wait) {

1793 assert(!isNormalMode(MaxCounter));

1794

1797

1798

1799 if (Wait.DsCnt != ~0u) {

1800 MachineInstr *SWaitInst = nullptr;

1801

1802 if (Wait.LoadCnt != ~0u) {

1804

1807

1808 Wait.LoadCnt = ~0u;

1809 Wait.DsCnt = ~0u;

1810 } else if (Wait.StoreCnt != ~0u) {

1812

1813 SWaitInst =

1816

1817 Wait.StoreCnt = ~0u;

1818 Wait.DsCnt = ~0u;

1819 }

1820

1821 if (SWaitInst) {

1823

1825 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;

1826 dbgs() << "New Instr: " << *SWaitInst << '\n');

1827 }

1828 }

1829

1830

1831

1832

1833 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {

1834 unsigned Count = getWait(Wait, CT);

1835 if (Count == ~0u)

1836 continue;

1837

1838 [[maybe_unused]] auto SWaitInst =

1841

1843

1845 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;

1846 dbgs() << "New Instr: " << *SWaitInst << '\n');

1847 }

1848

1850}

1851

1852

1854

1855

1856

1857

1858 return true;

1859}

1860

1861

1862

1864

1865

1866

1867

1868

1869

1870

1871

1872

1873

1874

1875

1876

1877bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,

1878 WaitcntBrackets &ScoreBrackets,

1879 MachineInstr *OldWaitcntInstr,

1880 bool FlushVmCnt) {

1881 setForceEmitWaitcnt();

1882

1883 assert(MI.isMetaInstruction());

1884

1885 AMDGPU::Waitcnt Wait;

1886 const unsigned Opc = MI.getOpcode();

1887

1888

1889

1890

1891

1892 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||

1893 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||

1894 Opc == AMDGPU::BUFFER_GL1_INV) {

1895 Wait.LoadCnt = 0;

1896 }

1897

1898

1899

1900

1901 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||

1902 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||

1903 Opc == AMDGPU::S_SETPC_B64_return ||

1905 Wait = Wait.combined(WCG->getAllZeroWaitcnt(false));

1906 }

1907

1908

1909

1910

1911

1912

1913

1914

1915 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {

1916 if (!WCG->isOptNone() &&

1917 (MI.getMF()->getInfo()->isDynamicVGPREnabled() ||

1918 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&

1919 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&

1920 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))

1921 ReleaseVGPRInsts.insert(&MI);

1922 }

1923

1924 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&

1925 ST->hasLegacyGeometry() &&

1928 Wait.LoadCnt = 0;

1929 }

1930

1931

1932

1933

1934

1935 else {

1936 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {

1937

1938

1939 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||

1940 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||

1941 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||

1942 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {

1943 Wait.ExpCnt = 0;

1944 }

1945 }

1946

1947

1948

1949 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())

1950 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

1951

1953

1954

1955

1956 Wait = AMDGPU::Waitcnt();

1957

1958 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);

1959 if (CallAddrOp.isReg()) {

1960 RegInterval CallAddrOpInterval =

1961 ScoreBrackets.getRegInterval(&MI, CallAddrOp);

1962

1963 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,

1965

1966 if (const auto *RtnAddrOp =

1967 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {

1968 RegInterval RtnAddrOpInterval =

1969 ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);

1970

1971 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,

1973 }

1974 }

1975 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {

1976 ScoreBrackets.tryClearSCCWriteEvent(&MI);

1977 } else {

1978

1979

1980

1981

1982

1983

1984

1985

1986

1987

1988

1989

1990

1991

1992 for (const MachineMemOperand *Memop : MI.memoperands()) {

1993 const Value *Ptr = Memop->getValue();

1994 if (Memop->isStore()) {

1995 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {

1996 addWait(Wait, SmemAccessCounter, 0);

1997 if (PDT->dominates(MI.getParent(), It->second))

1998 SLoadAddresses.erase(It);

1999 }

2000 }

2001 unsigned AS = Memop->getAddrSpace();

2003 continue;

2004

2005 if (TII->mayWriteLDSThroughDMA(MI))

2006 continue;

2007

2008

2009 unsigned RegNo = FIRST_LDS_VGPR;

2010 if (Ptr && Memop->getAAInfo()) {

2011 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();

2012 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {

2013 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {

2014 if ((I + 1) >= NUM_LDS_VGPRS) {

2015

2016

2017 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);

2018 break;

2019 }

2020

2021 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);

2022 }

2023 }

2024 } else {

2025 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);

2026 }

2027

2028 if (Memop->isStore())

2029 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);

2030 }

2031

2032

2033 for (const MachineOperand &Op : MI.operands()) {

2034 if (Op.isReg())

2035 continue;

2036

2037

2038 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))

2039 continue;

2040

2041 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);

2042

2043 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());

2044 if (IsVGPR) {

2045

2046

2047

2048

2049

2050 if (Op.isImplicit() && MI.mayLoadOrStore())

2051 continue;

2052

2053

2054

2055

2056

2057

2058

2059 if (Op.isUse() || !updateVMCntOnly(MI) ||

2060 ScoreBrackets.hasOtherPendingVmemTypes(Interval,

2061 getVmemType(MI)) ||

2062 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||

2063 ST->hasVmemWriteVgprInOrder()) {

2064 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);

2065 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);

2066 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);

2067 ScoreBrackets.clearVgprVmemTypes(Interval);

2068 }

2069

2070 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {

2071 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);

2072 }

2073 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);

2074 } else if (Op.getReg() == AMDGPU::SCC) {

2075 ScoreBrackets.determineWait(KM_CNT, Interval, Wait);

2076 } else {

2077 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);

2078 }

2079

2080 if (ST->hasWaitXCnt() && Op.isDef())

2081 ScoreBrackets.determineWait(X_CNT, Interval, Wait);

2082 }

2083 }

2084 }

2085

2086

2087

2088

2089

2090

2091

2092

2093

2094

2095

2096

2097

2098 if (Opc == AMDGPU::S_BARRIER && ST->hasAutoWaitcntBeforeBarrier() &&

2099 ST->supportsBackOffBarrier()) {

2100 Wait = Wait.combined(WCG->getAllZeroWaitcnt(true));

2101 }

2102

2103

2104

2105

2107 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {

2108 Wait.DsCnt = 0;

2109 }

2110

2111

2112 ScoreBrackets.simplifyWaitcnt(Wait);

2113

2114

2115

2116

2117 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {

2118 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);

2119 Wait.XCnt = ~0u;

2120 }

2121

2122

2123

2125 Wait = WCG->getAllZeroWaitcnt(false);

2126

2127 if (ForceEmitWaitcnt[LOAD_CNT])

2128 Wait.LoadCnt = 0;

2129 if (ForceEmitWaitcnt[EXP_CNT])

2130 Wait.ExpCnt = 0;

2131 if (ForceEmitWaitcnt[DS_CNT])

2132 Wait.DsCnt = 0;

2133 if (ForceEmitWaitcnt[SAMPLE_CNT])

2134 Wait.SampleCnt = 0;

2135 if (ForceEmitWaitcnt[BVH_CNT])

2136 Wait.BvhCnt = 0;

2137 if (ForceEmitWaitcnt[KM_CNT])

2138 Wait.KmCnt = 0;

2139 if (ForceEmitWaitcnt[X_CNT])

2140 Wait.XCnt = 0;

2141

2142 if (FlushVmCnt) {

2143 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))

2144 Wait.LoadCnt = 0;

2145 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))

2146 Wait.SampleCnt = 0;

2147 if (ScoreBrackets.hasPendingEvent(BVH_CNT))

2148 Wait.BvhCnt = 0;

2149 }

2150

2152 Wait.LoadCnt = 0;

2153

2154 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,

2155 OldWaitcntInstr);

2156}
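
// generateWaitcntInstBefore() computes the counts required before MI from the
// score brackets (returns, barriers, EXEC writes, memory operands, register
// uses/defs), then hands the result to generateWaitcnt(), which first lets the
// target generator fold it into any pre-existing wait instructions before
// emitting new ones.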

2157

2158bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,

2160 MachineBasicBlock &Block,

2161 WaitcntBrackets &ScoreBrackets,

2162 MachineInstr *OldWaitcntInstr) {

2164

2165 if (OldWaitcntInstr)

2166

2167

2169 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

2170

2171

2172

2173 ScoreBrackets.applyWaitcnt(Wait);

2174

2175

2176 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&

2178 MachineOperand *WaitExp =

2179 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);

2180 if (Wait.ExpCnt < WaitExp->getImm()) {

2183 }

2184 Wait.ExpCnt = ~0u;

2185

2187 << "Update Instr: " << *It);

2188 }

2189

2190 if (WCG->createNewWaitcnt(Block, It, Wait))

2192

2194}

2195

2196bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {

2197 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||

2199}

2200

2201

2202

2204 MachineBasicBlock *Block) const {

2205 auto BlockEnd = Block->getParent()->end();

2206 auto BlockIter = Block->getIterator();

2207

2208 while (true) {

2209 if (It.isEnd()) {

2210 if (++BlockIter != BlockEnd) {

2211 It = BlockIter->instr_begin();

2212 continue;

2213 }

2214

2215 return false;

2216 }

2217

2218 if (!It->isMetaInstruction())

2219 break;

2220

2221 It++;

2222 }

2223

2224 assert(!It.isEnd());

2225

2226 return It->getOpcode() == AMDGPU::S_ENDPGM;

2227}

2228

2229

2230bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,

2231 MachineBasicBlock &Block,

2232 WaitcntBrackets &ScoreBrackets) {

2233 AMDGPU::Waitcnt Wait;

2234 bool NeedsEndPGMCheck = false;

2235

2237 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&

2239

2241 Wait.DsCnt = 0;

2242 NeedsEndPGMCheck = true;

2243 }

2244

2245 ScoreBrackets.simplifyWaitcnt(Wait);

2246

2247 auto SuccessorIt = std::next(Inst.getIterator());

2248 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,

2249 nullptr);

2250

2251 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {

2254 }

2255

2257}

2258

2259void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,

2260 WaitcntBrackets *ScoreBrackets) {

2261

2262

2263

2264

2265

2266

2267

2268 bool IsVMEMAccess = false;

2269 bool IsSMEMAccess = false;

2270 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {

2272 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {

2273 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);

2274 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);

2275 ScoreBrackets->setPendingGDS();

2276 } else {

2277 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);

2278 }

2279 } else if (TII->isFLAT(Inst)) {

2281 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

2282 return;

2283 }

2284

2286

2287 int FlatASCount = 0;

2288

2289 if (TII->mayAccessVMEMThroughFlat(Inst)) {

2290 ++FlatASCount;

2291 IsVMEMAccess = true;

2292 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

2293 }

2294

2295 if (TII->mayAccessLDSThroughFlat(Inst)) {

2296 ++FlatASCount;

2297 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);

2298 }

2299

2300

2301

2302

2303

2304

2306 ScoreBrackets->setPendingFlat();

2309 IsVMEMAccess = true;

2310 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

2311

2312 if (ST->vmemWriteNeedsExpWaitcnt() &&

2314 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);

2315 }

2316 } else if (TII->isSMRD(Inst)) {

2317 IsSMEMAccess = true;

2318 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);

2319 } else if (Inst.isCall()) {

2321

2322 ScoreBrackets->applyWaitcnt(

2323 WCG->getAllZeroWaitcnt(false));

2324 ScoreBrackets->setStateOnFunctionEntryOrReturn();

2325 } else {

2326

2327 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());

2328 }

2330 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);

2331 } else if (TII->isVINTERP(Inst)) {

2332 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();

2333 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);

2335 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();

2337 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);

2339 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);

2340 else

2341 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);

2343 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);

2344 } else {

2346 case AMDGPU::S_SENDMSG:

2347 case AMDGPU::S_SENDMSG_RTN_B32:

2348 case AMDGPU::S_SENDMSG_RTN_B64:

2349 case AMDGPU::S_SENDMSGHALT:

2350 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);

2351 break;

2352 case AMDGPU::S_MEMTIME:

2353 case AMDGPU::S_MEMREALTIME:

2354 case AMDGPU::S_GET_BARRIER_STATE_M0:

2355 case AMDGPU::S_GET_BARRIER_STATE_IMM:

2356 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);

2357 break;

2358 }

2359 }

2360

2361 if (ST->hasWaitXCnt())

2362 return;

2363

2364 if (IsVMEMAccess)

2365 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);

2366

2367 if (IsSMEMAccess)

2368 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);

2369}

2370

2371bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,

2372 unsigned OtherScore) {

2373 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;

2374 unsigned OtherShifted =

2375 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;

2376 Score = std::max(MyShifted, OtherShifted);

2377 return OtherShifted > MyShifted;

2378}
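
// At control-flow joins the incoming brackets are merged: both predecessors'
// scores are rebased onto a common bracket via MyShift/OtherShift before
// taking the maximum. The boolean result reports whether the merge changed
// state, which marks the successor block dirty so it gets reprocessed.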

2379

2380

2381

2382

2383

2384

2385bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {

2386 bool StrictDom = false;

2387

2388 VgprUB = std::max(VgprUB, Other.VgprUB);

2389 SgprUB = std::max(SgprUB, Other.SgprUB);

2390

2391 for (auto T : inst_counter_types(Context->MaxCounter)) {

2392

2393 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;

2394 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];

2395 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];

2396 if (OtherEvents & ~OldEvents)

2397 StrictDom = true;

2398 PendingEvents |= OtherEvents;

2399

2400

2401 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];

2402 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];

2403 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);

2404 if (NewUB < ScoreLBs[T])

2406

2407 MergeInfo M;

2408 M.OldLB = ScoreLBs[T];

2409 M.OtherLB = Other.ScoreLBs[T];

2410 M.MyShift = NewUB - ScoreUBs[T];

2411 M.OtherShift = NewUB - Other.ScoreUBs[T];

2412

2413 ScoreUBs[T] = NewUB;

2414

2415 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

2416

2417 if (T == DS_CNT)

2418 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

2419

2420 if (T == KM_CNT) {

2421 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);

2422 if (Other.hasPendingEvent(SCC_WRITE)) {

2423 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);

2424 if (!OldEventsHasSCCWrite) {

2425 PendingSCCWrite = Other.PendingSCCWrite;

2426 } else if (PendingSCCWrite != Other.PendingSCCWrite) {

2427 PendingSCCWrite = nullptr;

2428 }

2429 }

2430 }

2431

2432 for (int J = 0; J <= VgprUB; J++)

2433 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

2434

2435 if (isSmemCounter(T)) {

2436 unsigned Idx = getSgprScoresIdx(T);

2437 for (int J = 0; J <= SgprUB; J++)

2438 StrictDom |=

2439 mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);

2440 }

2441 }

2442

2443 for (int J = 0; J <= VgprUB; J++) {

2444 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];

2445 StrictDom |= NewVmemTypes != VgprVmemTypes[J];

2446 VgprVmemTypes[J] = NewVmemTypes;

2447 }

2448

2449 return StrictDom;

2450}
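// Recognize every wait instruction this pass may need to merge or rewrite:
// the legacy S_WAITCNT encodings (including the "soft" variants mapped back
// through getNonSoftWaitcntOpcode), S_WAITCNT_VSCNT with a null SGPR operand,
// the combined gfx12 LOADCNT/DSCNT and STORECNT/DSCNT forms, and the
// single-counter S_WAIT_*CNT instructions.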

2451

2452static bool isWaitInstr(MachineInstr &Inst) {

2453 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());

2454 return Opcode == AMDGPU::S_WAITCNT ||

2455 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&

2456 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||

2457 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||

2458 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||

2459 Opcode == AMDGPU::S_WAITCNT_lds_direct ||

2460 counterTypeForInstr(Opcode).has_value();

2461}

2462

2463
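// Insert the required waits within a single basic block: walk the
// instructions in order, let generateWaitcntInstBefore() place or merge a
// wait in front of each one, then record the events the instruction itself
// produces in the score brackets. The vccz workarounds and the preheader
// vmcnt flush are handled here as well.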

2464bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,

2465 MachineBasicBlock &Block,

2466 WaitcntBrackets &ScoreBrackets) {

2467 bool Modified = false;

2468

2469 LLVM_DEBUG({

2470 dbgs() << "*** Begin Block: ";

2471 Block.printName(dbgs());

2472 ScoreBrackets.dump();

2473 });

2474

2475

2476

2477
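// Track whether the vccz bit can be trusted in this block. It is treated as
// unknown on entry on subtargets with the SMEM vccz read bug, and on
// subtargets where partial writes to vcc_lo/vcc_hi do not refresh vccz; a
// conditional branch that reads vccz then forces it to be restored (see
// RestoreVCCZ below).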

2478 bool VCCZCorrect = true;

2479 if (ST->hasReadVCCZBug()) {

2480

2481

2482 VCCZCorrect = false;

2483 } else if (!ST->partialVCCWritesUpdateVCCZ()) {

2484

2485

2486 VCCZCorrect = false;

2487 }

2488

2489

2490 MachineInstr *OldWaitcntInstr = nullptr;

2491

2492 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),

2493 E = Block.instr_end();

2494 Iter != E;) {

2495 MachineInstr &Inst = *Iter;

2496 if (Inst.isMetaInstruction()) {

2497 ++Iter;

2498 continue;

2499 }

2500

2501

2502

2503 if (isWaitInstr(Inst)) {

2504 if (!OldWaitcntInstr)

2505 OldWaitcntInstr = &Inst;

2506 ++Iter;

2507 continue;

2508 }

2509

2510 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&

2511 isPreheaderToFlush(Block, ScoreBrackets);

2512

2513

2514 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,

2515 FlushVmCnt);

2516 OldWaitcntInstr = nullptr;

2517

2518

2519 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);

2520

2521

2522 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {

2523 if (Inst.definesRegister(AMDGPU::VCC_LO, nullptr) ||

2524 Inst.definesRegister(AMDGPU::VCC_HI, nullptr)) {

2525

2526 if (!ST->partialVCCWritesUpdateVCCZ())

2527 VCCZCorrect = false;

2528 } else if (Inst.definesRegister(AMDGPU::VCC, nullptr)) {

2529

2530

2531

2532

2533

2534

2535

2536 if (ST->hasReadVCCZBug() &&

2537 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {

2538

2539

2540 VCCZCorrect = false;

2541 } else {

2542

2543 VCCZCorrect = true;

2544 }

2545 }

2546 }

2547

2548 if (TII->isSMRD(Inst)) {

2549 for (const MachineMemOperand *Memop : Inst.memoperands()) {

2550

2551

2552 if (!Memop->isInvariant()) {

2553 const Value *Ptr = Memop->getValue();

2554 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));

2555 }

2556 }

2557 if (ST->hasReadVCCZBug()) {

2558

2559 VCCZCorrect = false;

2560 }

2561 }

2562

2563 updateEventWaitcntAfter(Inst, &ScoreBrackets);

2564

2565 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

2566

2567 LLVM_DEBUG({

2568 Inst.print(dbgs());

2569 ScoreBrackets.dump();

2570 });

2571

2572

2573

2574 if (RestoreVCCZ) {

2575

2576

2577

2578 BuildMI(Block, Inst, Inst.getDebugLoc(),

2579 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),

2580 TRI->getVCC())

2581 .addReg(TRI->getVCC());

2582 VCCZCorrect = true;

2583 Modified = true;

2584 }

2585

2586 ++Iter;

2587 }

2588

2589

2590

2591 AMDGPU::Waitcnt Wait;

2592 if (Block.getFirstTerminator() == Block.end() &&

2593 isPreheaderToFlush(Block, ScoreBrackets)) {

2594 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))

2595 Wait.LoadCnt = 0;

2596 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))

2597 Wait.SampleCnt = 0;

2598 if (ScoreBrackets.hasPendingEvent(BVH_CNT))

2599 Wait.BvhCnt = 0;

2600 }

2601

2602

2603 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,

2604 OldWaitcntInstr);

2605

2606 LLVM_DEBUG({

2607 dbgs() << "*** End Block: ";

2608 Block.printName(dbgs());

2609 ScoreBrackets.dump();

2610 });

2611

2612 return Modified;

2613}

2614

2615

2616
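// Decide, and cache in PreheadersToFlush, whether this block is the
// preheader of a loop for which all outstanding VMEM loads should be waited
// for before entering the loop body.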

2617bool SIInsertWaitcnts::isPreheaderToFlush(

2618 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {

2619 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);

2620 if (!IsInserted)

2621 return Iterator->second;

2622

2623 MachineBasicBlock *Succ = MBB.getSingleSuccessor();

2624 if (!Succ)

2625 return false;

2626

2627 MachineLoop *Loop = MLI->getLoopFor(Succ);

2628 if (!Loop)

2629 return false;

2630

2631 if (Loop->getLoopPreheader() == &MBB &&

2632 shouldFlushVmCnt(Loop, ScoreBrackets)) {

2633 Iterator->second = true;

2634 return true;

2635 }

2636

2637 return false;

2638}

2639

2640 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {

2641 if (SIInstrInfo::isFLAT(MI))

2642 return TII->mayAccessVMEMThroughFlat(MI);

2643 return SIInstrInfo::isVMEM(MI);

2644}

2645

2646

2647

2648

2649

2650

2651

2652

2653
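// Heuristic used by isPreheaderToFlush(): flushing vmcnt in the preheader
// pays off when the loop body reads VGPRs whose values were loaded by VMEM
// instructions outside the loop (so a wait would otherwise be re-issued on
// every iteration) and the loop itself does not overwrite those VGPRs with
// its own VMEM loads.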

2654bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,

2655 const WaitcntBrackets &Brackets) {

2656 bool HasVMemLoad = false;

2657 bool HasVMemStore = false;

2658 bool UsesVgprLoadedOutside = false;

2659 DenseSet<Register> VgprUse;

2660 DenseSet<Register> VgprDef;

2661

2662 for (MachineBasicBlock *MBB : ML->blocks()) {

2663 for (MachineInstr &MI : *MBB) {

2664 if (isVMEMOrFlatVMEM(MI)) {

2665 HasVMemLoad |= MI.mayLoad();

2666 HasVMemStore |= MI.mayStore();

2667 }

2668

2669 for (const MachineOperand &Op : MI.all_uses()) {

2670 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))

2671 continue;

2672 RegInterval Interval = Brackets.getRegInterval(&MI, Op);

2673

2674 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

2675

2676

2677 if (VgprDef.contains(RegNo))

2678 return false;

2679 VgprUse.insert(RegNo);

2680

2681

2682 if (Brackets.getRegScore(RegNo, LOAD_CNT) >

2683 Brackets.getScoreLB(LOAD_CNT) ||

2684 Brackets.getRegScore(RegNo, SAMPLE_CNT) >

2685 Brackets.getScoreLB(SAMPLE_CNT) ||

2686 Brackets.getRegScore(RegNo, BVH_CNT) >

2687 Brackets.getScoreLB(BVH_CNT)) {

2688 UsesVgprLoadedOutside = true;

2689 break;

2690 }

2691 }

2692 }

2693

2694

2695 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {

2696 for (const MachineOperand &Op : MI.all_defs()) {

2697 RegInterval Interval = Brackets.getRegInterval(&MI, Op);

2698 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {

2699

2700

2701 if (VgprUse.contains(RegNo))

2702 return false;

2703 VgprDef.insert(RegNo);

2704 }

2705 }

2706 }

2707 }

2708 }

2709 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)

2710 return true;

2711 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();

2712}

2713

2714bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {

2715 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();

2716 auto *PDT =

2717 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();

2718 AliasAnalysis *AA = nullptr;

2719 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())

2720 AA = &AAR->getAAResults();

2721

2722 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);

2723}

2724

2725PreservedAnalyses

2726SIInsertWaitcntsPass::run(MachineFunction &MF,

2727 MachineFunctionAnalysisManager &MFAM) {

2728 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);

2729 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);

2730 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)

2731 .getManager()

2732 .getCachedResult<AAManager>(MF.getFunction());

2733

2734 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))

2735 return PreservedAnalyses::all();

2736

2737

2738 return getMachineFunctionPassPreservedAnalyses()

2739 .preserveSet<CFGAnalyses>();

2740}
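// Main driver shared by the legacy and new pass manager wrappers: set up the
// per-subtarget limits and waitcnt generator, seed the entry state, then
// iterate over the blocks in reverse post order until the score brackets
// reach a fixed point.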

2741

2742bool SIInsertWaitcnts::run(MachineFunction &MF) {

2743 ST = &MF.getSubtarget<GCNSubtarget>();

2744 TII = ST->getInstrInfo();

2745 TRI = &TII->getRegisterInfo();

2746 MRI = &MF.getRegInfo();

2747 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

2748

2750

2751 if (ST->hasExtendedWaitCounts()) {

2752 MaxCounter = NUM_EXTENDED_INST_CNTS;

2753 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);

2754 WCG = &WCGGFX12Plus;

2755 } else {

2756 MaxCounter = NUM_NORMAL_INST_CNTS;

2757 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);

2758 WCG = &WCGPreGFX12;

2759 }

2760

2761 for (auto T : inst_counter_types())

2762 ForceEmitWaitcnt[T] = false;

2763

2764 WaitEventMaskForInst = WCG->getWaitEventMask();

2765

2766 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);

2767

2768 if (ST->hasExtendedWaitCounts()) {

2771 } else {

2774 }

2781

2782 [[maybe_unused]] unsigned NumVGPRsMax =

2783 ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());

2784 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();

2785 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);

2786 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

2787

2788 BlockInfos.clear();

2789 bool Modified = false;

2790

2791 MachineBasicBlock &EntryBB = MF.front();

2792 MachineBasicBlock::iterator I = EntryBB.begin();

2793

2794 if (!MFI->isEntryFunction()) {

2795

2796

2797

2798

2799

2800

2801 for (MachineBasicBlock::iterator E = EntryBB.end();

2802 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)

2803 ;
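// For callees (non-entry functions) the incoming argument registers may
// still depend on memory operations issued by the caller, which this pass
// cannot track, so conservatively wait for the relevant counters to reach
// zero right after the PHIs/meta instructions skipped above.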

2804

2805 if (ST->hasExtendedWaitCounts()) {

2806 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))

2807 .addImm(0);

2808 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {

2809 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)

2810 continue;

2811

2812 if (!ST->hasImageInsts() &&

2813 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))

2814 continue;

2815

2816 BuildMI(EntryBB, I, DebugLoc(),

2817 TII->get(instrsForExtendedCounterTypes[CT]))

2818 .addImm(0);

2819 }

2820 } else {

2821 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);

2822 }

2823

2824 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);

2825 NonKernelInitialState->setStateOnFunctionEntryOrReturn();

2826 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

2827

2828 Modified = true;

2829 }

2830

2831

2832

2833 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))

2834 BlockInfos.try_emplace(MBB);

2835
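// Fixed-point dataflow over the blocks in reverse post order: a block is
// re-processed while it is marked Dirty, and processing a block may dirty
// its successors (or set Repeat when the change crosses a backedge) until no
// incoming state changes any more.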

2836 std::unique_ptr<WaitcntBrackets> Brackets;

2837 bool Repeat;

2838 do {

2839 Repeat = false;

2840

2841 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;

2842 ++BII) {

2843 MachineBasicBlock *MBB = BII->first;

2844 BlockInfo &BI = BII->second;

2845 if (!BI.Dirty)

2846 continue;

2847

2848 if (BI.Incoming) {

2849 if (!Brackets)

2850 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);

2851 else

2852 *Brackets = *BI.Incoming;

2853 } else {

2854 if (!Brackets) {

2855 Brackets = std::make_unique<WaitcntBrackets>(this);

2856 } else {

2857

2858

2859

2860 Brackets->~WaitcntBrackets();

2861 new (Brackets.get()) WaitcntBrackets(this);

2862 }

2863 }

2864

2865 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);

2866 BI.Dirty = false;

2867

2868 if (Brackets->hasPendingEvent()) {

2869 BlockInfo *MoveBracketsToSucc = nullptr;

2870 for (MachineBasicBlock *Succ : MBB->successors()) {

2871 auto *SuccBII = BlockInfos.find(Succ);

2872 BlockInfo &SuccBI = SuccBII->second;

2873 if (!SuccBI.Incoming) {

2874 SuccBI.Dirty = true;

2875 if (SuccBII <= BII) {

2877 Repeat = true;

2878 }

2879 if (!MoveBracketsToSucc) {

2880 MoveBracketsToSucc = &SuccBI;

2881 } else {

2882 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);

2883 }

2884 } else if (SuccBI.Incoming->merge(*Brackets)) {

2885 SuccBI.Dirty = true;

2886 if (SuccBII <= BII) {

2888 Repeat = true;

2889 }

2890 }

2891 }

2892 if (MoveBracketsToSucc)

2893 MoveBracketsToSucc->Incoming = std::move(Brackets);

2894 }

2895 }

2896 } while (Repeat);

2897

2898 if (ST->hasScalarStores()) {

2899 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

2900 bool HaveScalarStores = false;

2901

2902 for (MachineBasicBlock &MBB : MF) {

2903 for (MachineInstr &MI : MBB) {

2904 if (!HaveScalarStores && TII->isScalarStore(MI))

2905 HaveScalarStores = true;

2906

2907 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||

2908 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)

2909 EndPgmBlocks.push_back(&MBB);

2910 }

2911 }

2912

2913 if (HaveScalarStores) {

2914

2915

2916

2917

2918

2919

2920

2921
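// If any scalar store was seen, make sure an S_DCACHE_WB reaches the end of
// the program: scan every block that ends the kernel and insert a writeback
// before S_ENDPGM / SI_RETURN_TO_EPILOG unless one already follows the last
// scalar store.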

2922 for (MachineBasicBlock *MBB : EndPgmBlocks) {

2923 bool SeenDCacheWB = false;

2924

2925 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();

2926 I != E; ++I) {

2927 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)

2928 SeenDCacheWB = true;

2929 else if (TII->isScalarStore(*I))

2930 SeenDCacheWB = false;

2931

2932

2933 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||

2934 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&

2935 !SeenDCacheWB) {

2936 Modified = true;

2937 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));

2938 }

2939 }

2940 }

2941 }

2942 }

2943

2944

2945

2946

2947
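// Release the wave's VGPRs before the previously recorded S_ENDPGM
// instructions: in dynamic-VGPR mode this uses S_ALLOC_VGPR 0, otherwise the
// DEALLOC_VGPRS message (preceded by an S_NOP on subtargets that require
// one), skipping the deallocation when the kernel looks waveslot-limited
// rather than VGPR-limited.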

2948

2949 if (MFI->isDynamicVGPREnabled()) {

2950 for (MachineInstr *MI : ReleaseVGPRInsts) {

2951 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

2952 TII->get(AMDGPU::S_ALLOC_VGPR))

2953 .addImm(0);

2954 Modified = true;

2955 }

2956 } else {

2957 if (!ReleaseVGPRInsts.empty() &&

2958 (MF.getFrameInfo().hasCalls() ||

2959 ST->getOccupancyWithNumVGPRs(

2960 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),

2961 false) <

2962 AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {

2963 for (MachineInstr *MI : ReleaseVGPRInsts) {

2964 if (ST->requiresNopBeforeDeallocVGPRs()) {

2965 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

2966 TII->get(AMDGPU::S_NOP))

2967 .addImm(0);

2968 }

2969 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

2970 TII->get(AMDGPU::S_SENDMSG))

2971 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);

2972 Modified = true;

2973 }

2974 }

2975 }

2976 ReleaseVGPRInsts.clear();

2977 PreheadersToFlush.clear();

2978 SLoadAddresses.clear();

2979

2980 return Modified;

2981}
