LLVM: lib/Target/AMDGPU/SIInsertWaitcnts.cpp Source File
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
49 "Force emit s_waitcnt expcnt(0) instrs");
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
65
66namespace {

// Instruction counter kinds tracked by this pass. The first four counters
// exist on all targets; the remaining "extended" counters only exist on
// targets with separate wait-count instructions (GFX12+).
71enum InstCounterType {
72 LOAD_CNT = 0,
73 DS_CNT,
74 EXP_CNT,
75 STORE_CNT,
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
78 BVH_CNT,
79 KM_CNT,
80 X_CNT,
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84}
85
namespace llvm {
// Allow iterating over InstCounterType with enum_seq().
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
} // namespace llvm
91
92namespace {

// Iterate over all counter types from LOAD_CNT up to (but not including)
// MaxCounter, in declaration order.
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
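// For example, inst_counter_types(NUM_NORMAL_INST_CNTS) visits only LOAD_CNT,
// DS_CNT, EXP_CNT and STORE_CNT, the set of counters used on pre-GFX12
// targets.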
99
100using RegInterval = std::pair<int, int>;
101
102struct HardwareLimits {
103 unsigned LoadcntMax;
104 unsigned ExpcntMax;
105 unsigned DscntMax;
106 unsigned StorecntMax;
107 unsigned SamplecntMax;
108 unsigned BvhcntMax;
109 unsigned KmcntMax;
110 unsigned XcntMax;
111};
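// Per-subtarget upper bounds for each counter; a required wait is clamped to
// these values since larger counts cannot be encoded or observed.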
112
113#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
114 DECL(VMEM_ACCESS) \
115 DECL(VMEM_READ_ACCESS) \
116 DECL(VMEM_SAMPLER_READ_ACCESS) \
117 DECL(VMEM_BVH_READ_ACCESS) \
118 DECL(VMEM_WRITE_ACCESS) \
119 DECL(SCRATCH_WRITE_ACCESS) \
120 DECL(VMEM_GROUP) \
121 DECL(LDS_ACCESS) \
122 DECL(GDS_ACCESS) \
123 DECL(SQ_MESSAGE) \
124 DECL(SCC_WRITE) \
125 DECL(SMEM_ACCESS) \
126 DECL(SMEM_GROUP) \
127 DECL(EXP_GPR_LOCK) \
128 DECL(GDS_GPR_LOCK) \
129 DECL(EXP_POS_ACCESS) \
130 DECL(EXP_PARAM_ACCESS) \
131 DECL(VMW_GPR_LOCK) \
132 DECL(EXP_LDS_ACCESS)
133
134
135#define AMDGPU_EVENT_ENUM(Name) Name,
136enum WaitEventType {
138 NUM_WAIT_EVENTS
139};
140#undef AMDGPU_EVENT_ENUM
141
142#define AMDGPU_EVENT_NAME(Name) #Name,
143static constexpr StringLiteral WaitEventTypeName[] = {
145};
146#undef AMDGPU_EVENT_NAME
147
// The scoreboard indices used by WaitcntBrackets are laid out as follows:
//   0 .. SQ_MAX_PGM_VGPRS-1                               real VGPRs
//     (AGPRs, where present, are folded in at AGPR_OFFSET)
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                    LDS-DMA tracking slots
//   NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1      real SGPRs
//   SCC                                                    one extra slot
156enum RegisterMapping {
157 SQ_MAX_PGM_VGPRS = 2048,
158 AGPR_OFFSET = 512,
159 SQ_MAX_PGM_SGPRS = 128,
  // Artificial register slots to track LDS writes into specific LDS
  // locations if a location is known. When slots are exhausted or the
  // location is unknown, use the first slot. The first slot is also always
  // updated in addition to a known location's slot so that waits are still
  // generated when a dependent instruction's location is unknown.
165 FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS,
166 NUM_LDS_VGPRS = 9,
167 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS,
168 NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
169
170 SCC = NUM_ALL_ALLOCATABLE
171};
172
// Enumerate the different kinds of result-returning VMEM operations. They all
// count against the same hardware counter (VMcnt, or the split
// LOADcnt/SAMPLEcnt/BVHcnt counters on GFX12+), but tracking them separately
// lets the pass wait only on the relevant subset.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions.
  VMEM_BVH,
  NUM_VMEM_TYPES
};
187
// The wait instruction used to wait on a given extended counter, indexed by
// InstCounterType.
191static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
192 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
193 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
194 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
195
196static bool updateVMCntOnly(const MachineInstr &Inst) {
199}
200
201#ifndef NDEBUG
202static bool isNormalMode(InstCounterType MaxCounter) {
203 return MaxCounter == NUM_NORMAL_INST_CNTS;
204}
205#endif
206
207VmemType getVmemType(const MachineInstr &Inst) {
208 assert(updateVMCntOnly(Inst));
210 return VMEM_NOSAMPLER;
214
215 if (BaseInfo->BVH)
216 return VMEM_BVH;
217
218
219
220
222 return VMEM_SAMPLER;
223
224 return VMEM_NOSAMPLER;
225}
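// For example, an IMAGE_SAMPLE maps to VMEM_SAMPLER, an
// IMAGE_BVH_INTERSECT_RAY to VMEM_BVH, and plain buffer/global loads to
// VMEM_NOSAMPLER.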
226
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  switch (T) {
229 case LOAD_CNT:
230 return Wait.LoadCnt;
  case EXP_CNT:
    return Wait.ExpCnt;
233 case DS_CNT:
234 return Wait.DsCnt;
235 case STORE_CNT:
236 return Wait.StoreCnt;
237 case SAMPLE_CNT:
238 return Wait.SampleCnt;
239 case BVH_CNT:
240 return Wait.BvhCnt;
241 case KM_CNT:
242 return Wait.KmCnt;
243 case X_CNT:
244 return Wait.XCnt;
245 default:
    llvm_unreachable("bad InstCounterType");
  }
248}
249
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned &WC = getCounterRef(Wait, T);
252 WC = std::min(WC, Count);
253}
254
void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
257}
258
unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
261}
262
263
264InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
265 for (auto T : inst_counter_types()) {
    if (masks[T] & (1 << E))
      return T;
268 }
  llvm_unreachable("event type has no associated counter");
}
271
272class WaitcntBrackets;
273
274
275
276
277
278
279
280class WaitcntGenerator {
281protected:
282 const GCNSubtarget *ST = nullptr;
283 const SIInstrInfo *TII = nullptr;
284 AMDGPU::IsaVersion IV;
285 InstCounterType MaxCounter;
286 bool OptNone;
287
288public:
289 WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
        OptNone(MF.getFunction().hasOptNone() ||
                MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
296
297
298 bool isOptNone() const { return OptNone; }
299
300
301
302
303
304
305
306
307
308
309
310
311 virtual bool
312 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
313 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
315
316
317 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
318
319
320
321 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
323 AMDGPU::Waitcnt Wait) = 0;
324
325
326
327 virtual const unsigned *getWaitEventMask() const = 0;
328
329
330
331 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
332
333 virtual ~WaitcntGenerator() = default;
334
335
336 static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
338 unsigned Mask = 0;
339 for (auto &E : Events)
      Mask |= 1 << E;

    return Mask;
  }
344};
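// Two concrete generators follow: the pre-GFX12 one works with the combined
// S_WAITCNT / S_WAITCNT_VSCNT encodings, while the GFX12+ one emits the
// separate per-counter S_WAIT_* instructions.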
345
346class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
347public:
348 using WaitcntGenerator::WaitcntGenerator;
349
350 bool
351 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
352 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
354
355 bool createNewWaitcnt(MachineBasicBlock &Block,
357 AMDGPU::Waitcnt Wait) override;
358
359 const unsigned *getWaitEventMask() const override {
361
362 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
363 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
364 VMEM_BVH_READ_ACCESS}),
365 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
366 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
367 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
368 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
369 0,
370 0,
371 0,
372 0};
373
374 return WaitEventMaskForInstPreGFX12;
375 }
376
377 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
378};
379
380class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
381public:
382 using WaitcntGenerator::WaitcntGenerator;
383
384 bool
385 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
386 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
388
389 bool createNewWaitcnt(MachineBasicBlock &Block,
391 AMDGPU::Waitcnt Wait) override;
392
393 const unsigned *getWaitEventMask() const override {
395
396 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
397 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
398 eventMask({LDS_ACCESS, GDS_ACCESS}),
399 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
400 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
401 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
402 eventMask({VMEM_SAMPLER_READ_ACCESS}),
403 eventMask({VMEM_BVH_READ_ACCESS}),
404 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
405 eventMask({VMEM_GROUP, SMEM_GROUP})};
406
407 return WaitEventMaskForInstGFX12Plus;
408 }
409
410 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
411};
412
413class SIInsertWaitcnts {
414public:
415 const GCNSubtarget *ST;
416 const SIInstrInfo *TII = nullptr;
417 const SIRegisterInfo *TRI = nullptr;
418 const MachineRegisterInfo *MRI = nullptr;
419 InstCounterType SmemAccessCounter;
420 InstCounterType MaxCounter;
421 const unsigned *WaitEventMaskForInst;
422
423private:
424 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
425 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
426 MachineLoopInfo *MLI;
427 MachinePostDominatorTree *PDT;
  AliasAnalysis *AA;
430 struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
432 bool Dirty = true;
433 };
434
435 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
436
437 bool ForceEmitWaitcnt[NUM_INST_CNTS];
438
439
440
441
442 WaitcntGeneratorPreGFX12 WCGPreGFX12;
443 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
444
445 WaitcntGenerator *WCG = nullptr;
446
447
448
449 DenseSet<MachineInstr *> ReleaseVGPRInsts;
450
451 HardwareLimits Limits;
452
453public:
  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
                   AliasAnalysis *AA)
      : MLI(MLI), PDT(PDT), AA(AA) {
457 (void)ForceExpCounter;
458 (void)ForceLgkmCounter;
459 (void)ForceVMCounter;
460 }
461
462 unsigned getWaitCountMax(InstCounterType T) const {
463 switch (T) {
464 case LOAD_CNT:
465 return Limits.LoadcntMax;
466 case DS_CNT:
467 return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
470 case STORE_CNT:
471 return Limits.StorecntMax;
472 case SAMPLE_CNT:
473 return Limits.SamplecntMax;
474 case BVH_CNT:
475 return Limits.BvhcntMax;
476 case KM_CNT:
477 return Limits.KmcntMax;
478 case X_CNT:
479 return Limits.XcntMax;
480 default:
481 break;
482 }
483 return 0;
484 }
485
486 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
487 bool isPreheaderToFlush(MachineBasicBlock &MBB,
488 const WaitcntBrackets &ScoreBrackets);
489 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
490 bool run(MachineFunction &MF);
491
492 void setForceEmitWaitcnt() {
493
494
495#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
499 } else {
500 ForceEmitWaitcnt[EXP_CNT] = false;
501 }
502
    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[DS_CNT] = true;
506 ForceEmitWaitcnt[KM_CNT] = true;
507 } else {
508 ForceEmitWaitcnt[DS_CNT] = false;
509 ForceEmitWaitcnt[KM_CNT] = false;
510 }
511
    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[LOAD_CNT] = true;
515 ForceEmitWaitcnt[SAMPLE_CNT] = true;
516 ForceEmitWaitcnt[BVH_CNT] = true;
517 } else {
518 ForceEmitWaitcnt[LOAD_CNT] = false;
519 ForceEmitWaitcnt[SAMPLE_CNT] = false;
520 ForceEmitWaitcnt[BVH_CNT] = false;
521 }
522#endif
523 }
524
525
526
527 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
530 case AMDGPU::GLOBAL_INV:
531 return VMEM_READ_ACCESS;
532 case AMDGPU::GLOBAL_WB:
533 case AMDGPU::GLOBAL_WBINV:
534 return VMEM_WRITE_ACCESS;
535 default:
536 break;
537 }
538
539
540 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
541 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
542
544
545
547 return VMEM_ACCESS;
550 if (TII->mayAccessScratch(Inst))
551 return SCRATCH_WRITE_ACCESS;
552 return VMEM_WRITE_ACCESS;
553 }
555 return VMEM_READ_ACCESS;
556 return VmemReadMapping[getVmemType(Inst)];
557 }
558
559 bool isVmemAccess(const MachineInstr &MI) const;
560 bool generateWaitcntInstBefore(MachineInstr &MI,
561 WaitcntBrackets &ScoreBrackets,
562 MachineInstr *OldWaitcntInstr,
563 bool FlushVmCnt);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
567 MachineInstr *OldWaitcntInstr);
568 void updateEventWaitcntAfter(MachineInstr &Inst,
569 WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
572 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
573 WaitcntBrackets &ScoreBrackets);
574 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
575 WaitcntBrackets &ScoreBrackets);
576};
577
578
579
580
581
582
583
584
585
586class WaitcntBrackets {
587public:
588 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}
589
590 bool isSmemCounter(InstCounterType T) const {
591 return T == Context->SmemAccessCounter || T == X_CNT;
592 }
593
594 unsigned getSgprScoresIdx(InstCounterType T) const {
595 assert(isSmemCounter(T) && "Invalid SMEM counter");
596 return T == X_CNT ? 1 : 0;
597 }
598
599 unsigned getScoreLB(InstCounterType T) const {
600 assert(T < NUM_INST_CNTS);
601 return ScoreLBs[T];
602 }
603
604 unsigned getScoreUB(InstCounterType T) const {
605 assert(T < NUM_INST_CNTS);
606 return ScoreUBs[T];
607 }
608
609 unsigned getScoreRange(InstCounterType T) const {
610 return getScoreUB(T) - getScoreLB(T);
611 }
612
613 unsigned getRegScore(int GprNo, InstCounterType T) const {
614 if (GprNo < NUM_ALL_VGPRS)
615 return VgprScores[T][GprNo];
616
617 if (GprNo < NUM_ALL_ALLOCATABLE)
618 return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
619
620 assert(GprNo == SCC);
621 return SCCScore;
622 }
623
624 bool merge(const WaitcntBrackets &Other);
625
626 RegInterval getRegInterval(const MachineInstr *MI,
627 const MachineOperand &Op) const;
628
629 bool counterOutOfOrder(InstCounterType T) const;
630 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
631 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
632 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
633 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
634 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
635
636 void determineWait(InstCounterType T, RegInterval Interval,
637 AMDGPU::Waitcnt &Wait) const;
638 void determineWait(InstCounterType T, int RegNo,
639 AMDGPU::Waitcnt &Wait) const {
640 determineWait(T, {RegNo, RegNo + 1}, Wait);
641 }
642 void tryClearSCCWriteEvent(MachineInstr *Inst);
643
644 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
645 void applyWaitcnt(InstCounterType T, unsigned Count);
646 void updateByEvent(WaitEventType E, MachineInstr &MI);
647
648 unsigned hasPendingEvent() const { return PendingEvents; }
649 unsigned hasPendingEvent(WaitEventType E) const {
650 return PendingEvents & (1 << E);
651 }
652 unsigned hasPendingEvent(InstCounterType T) const {
653 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
654 assert((HasPending != 0) == (getScoreRange(T) != 0));
655 return HasPending;
656 }
657
658 bool hasMixedPendingEvents(InstCounterType T) const {
659 unsigned Events = hasPendingEvent(T);
660
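    // Events & (Events - 1) is nonzero exactly when more than one event bit
    // is set, i.e. when events of different types are pending for T.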
661 return Events & (Events - 1);
662 }
663
664 bool hasPendingFlat() const {
665 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
666 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
667 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
668 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
669 }
670
671 void setPendingFlat() {
672 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
673 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
674 }
675
676 bool hasPendingGDS() const {
677 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
678 }
679
680 unsigned getPendingGDSWait() const {
681 return std::min(getScoreUB(DS_CNT) - LastGDS,
682 Context->getWaitCountMax(DS_CNT) - 1);
683 }
684
685 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
686
687
688
689 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
690 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
691 assert(RegNo < NUM_ALL_VGPRS);
692 if (VgprVmemTypes[RegNo] & ~(1 << V))
693 return true;
694 }
695 return false;
696 }
697
698 void clearVgprVmemTypes(RegInterval Interval) {
699 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
700 assert(RegNo < NUM_ALL_VGPRS);
701 VgprVmemTypes[RegNo] = 0;
702 }
703 }
704
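  // On function entry or return we cannot know how many stores the caller or
  // callee still has in flight, so conservatively assume the maximum number
  // of STORE_CNT events is pending.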
705 void setStateOnFunctionEntryOrReturn() {
706 setScoreUB(STORE_CNT,
707 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
708 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
709 }
710
711 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
712 return LDSDMAStores;
713 }
714
715 bool hasPointSampleAccel(const MachineInstr &MI) const;
716 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
718
719 void print(raw_ostream &) const;
721
722private:
723 struct MergeInfo {
724 unsigned OldLB;
725 unsigned OtherLB;
726 unsigned MyShift;
727 unsigned OtherShift;
728 };
729 static bool mergeScore(const MergeInfo &M, unsigned &Score,
730 unsigned OtherScore);
731
732 void setScoreLB(InstCounterType T, unsigned Val) {
733 assert(T < NUM_INST_CNTS);
734 ScoreLBs[T] = Val;
735 }
736
737 void setScoreUB(InstCounterType T, unsigned Val) {
738 assert(T < NUM_INST_CNTS);
739 ScoreUBs[T] = Val;
740
741 if (T != EXP_CNT)
742 return;
743
744 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
745 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
746 }
747
748 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
749 setScoreByInterval({GprNo, GprNo + 1}, T, Val);
750 }
751
752 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
753 unsigned Score);
754
755 void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,
756 InstCounterType CntTy, unsigned Val);
757
758 const SIInsertWaitcnts *Context;
759
760 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
761 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
762 unsigned PendingEvents = 0;
763
764 unsigned LastFlat[NUM_INST_CNTS] = {0};
765
766 unsigned LastGDS = 0;
767
768
769 int VgprUB = -1;
770 int SgprUB = -1;
771 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
772
773
774
775
776 unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
777
778 unsigned SCCScore = 0;
779
780 const MachineInstr *PendingSCCWrite = nullptr;
781
782
783 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
784
785
786 SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
787};
788
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
791 static char ID;
792 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
793
794 bool runOnMachineFunction(MachineFunction &MF) override;
795
796 StringRef getPassName() const override {
797 return "SI insert wait instructions";
798 }
799
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
808};
809
810}
811
812RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                             const MachineOperand &Op) const {
  if (Op.getReg() == AMDGPU::SCC)
    return {SCC, SCC + 1};
817 const SIRegisterInfo *TRI = Context->TRI;
818 const MachineRegisterInfo *MRI = Context->MRI;
819
  if (!TRI->isInAllocatableClass(Op.getReg()))
821 return {-1, -1};
822
823
824
  assert(!Op.getSubReg() || !Op.isUndef());
826
  RegInterval Result;

  MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST);
  unsigned RegIdx = TRI->getHWRegIndex(MCReg);
831
832 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
833 unsigned Size = TRI->getRegSizeInBits(*RC);
834
835
836 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
840 if (TRI->isAGPR(*MRI, Op.getReg()))
841 Result.first += AGPR_OFFSET;
845
846 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
847
848
851 else
853 }
854 } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
855
856
857 Result.first = RegIdx + NUM_ALL_VGPRS;
859 } else {
860 return {-1, -1};
861 }
862
  return Result;
}
865
866void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
867 InstCounterType CntTy,
868 unsigned Score) {
869 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
870 if (RegNo < NUM_ALL_VGPRS) {
871 VgprUB = std::max(VgprUB, RegNo);
872 VgprScores[CntTy][RegNo] = Score;
873 } else if (RegNo < NUM_ALL_ALLOCATABLE) {
874 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
875 SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
876 } else {
877 assert(RegNo == SCC);
878 SCCScore = Score;
879 }
880 }
881}
882
883void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
884 const MachineOperand &Op,
885 InstCounterType CntTy, unsigned Score) {
886 RegInterval Interval = getRegInterval(MI, Op);
887 setScoreByInterval(Interval, CntTy, Score);
888}
889
890
891
892
893
894
895bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
897 return false;
898
900 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
903}
904
905
906
907
908
909
910bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
911 const MachineInstr &MI, RegInterval Interval) const {
912 if (!hasPointSampleAccel(MI))
913 return false;
914
915 return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
916}
917
918void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
919 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
921
922 unsigned UB = getScoreUB(T);
923 unsigned CurrScore = UB + 1;
924 if (CurrScore == 0)
    report_fatal_error("InsertWaitcnt score wraparound");
927
928
929 PendingEvents |= 1 << E;
930 setScoreUB(T, CurrScore);
931
932 const SIRegisterInfo *TRI = Context->TRI;
933 const MachineRegisterInfo *MRI = Context->MRI;
934 const SIInstrInfo *TII = Context->TII;
935
936 if (T == EXP_CNT) {
937
938
940
941
942 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
943 setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);
944
946 if (const auto *Data0 =
947 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
948 setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);
949 if (const auto *Data1 =
950 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
951 setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);
953 Inst.getOpcode() != AMDGPU::DS_APPEND &&
954 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
955 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
956 for (const MachineOperand &Op : Inst.all_uses()) {
957 if (TRI->isVectorRegister(*MRI, Op.getReg()))
958 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
959 }
960 }
961 } else if (TII->isFLAT(Inst)) {
963 setScoreByOperand(&Inst,
964 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
965 EXP_CNT, CurrScore);
967 setScoreByOperand(&Inst,
968 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
969 EXP_CNT, CurrScore);
970 }
971 } else if (TII->isMIMG(Inst)) {
973 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
975 setScoreByOperand(&Inst,
976 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
977 EXP_CNT, CurrScore);
978 }
979 } else if (TII->isMTBUF(Inst)) {
981 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
982 } else if (TII->isMUBUF(Inst)) {
984 setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
986 setScoreByOperand(&Inst,
987 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
988 EXP_CNT, CurrScore);
989 }
990 } else if (TII->isLDSDIR(Inst)) {
991
992 setScoreByOperand(&Inst,
993 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
994 EXP_CNT, CurrScore);
995 } else {
996 if (TII->isEXP(Inst)) {
997
998
999
1000
1001 for (MachineOperand &DefMO : Inst.all_defs()) {
1002 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1003 setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);
1004 }
1005 }
1006 }
1007 for (const MachineOperand &Op : Inst.all_uses()) {
1008 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1009 setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
1010 }
1011 }
1012 } else if (T == X_CNT) {
1013 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1014 if (PendingEvents & (1 << OtherEvent)) {
1015
1016
1017
1018
1019 setScoreLB(T, getScoreUB(T) - 1);
1020 PendingEvents &= ~(1 << OtherEvent);
1021 }
1022 for (const MachineOperand &Op : Inst.all_uses())
1023 setScoreByOperand(&Inst, Op, T, CurrScore);
1024 } else {
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034 for (const MachineOperand &Op : Inst.defs()) {
1035 RegInterval Interval = getRegInterval(&Inst, Op);
1036 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1037 if (Interval.first >= NUM_ALL_VGPRS)
1038 continue;
1039 if (updateVMCntOnly(Inst)) {
1040
1041
1042
1044 VmemType V = getVmemType(Inst);
1045 unsigned char TypesMask = 1 << V;
1046
1047
1048 if (hasPointSampleAccel(Inst))
1049 TypesMask |= 1 << VMEM_NOSAMPLER;
1050 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
1051 VgprVmemTypes[RegNo] |= TypesMask;
1052 }
1053 }
1054 setScoreByInterval(Interval, T, CurrScore);
1055 }
    if (Inst.mayStore() &&
        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1058
1059
1060 unsigned Slot = 0;
1061 for (const auto *MemOp : Inst.memoperands()) {
1062 if (!MemOp->isStore() ||
1064 continue;
1065
1066
1067 auto AAI = MemOp->getAAInfo();
1068
1069
1070
1071
1072
1073
1074
1075 if (!AAI || !AAI.Scope)
1076 break;
1077 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1078 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1079 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
            Slot = I + 1;
            break;
1082 }
1083 }
1084 }
1085 if (Slot)
1086 break;
1087
1088
1089
1090 LDSDMAStores.push_back(&Inst);
1091 Slot = LDSDMAStores.size();
1092 break;
1093 }
1094 if (Slot < NUM_LDS_VGPRS)
1095 setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
1096 if (Slot)
1097 setRegScore(FIRST_LDS_VGPR, T, CurrScore);
1098 }
1099
1101 setRegScore(SCC, T, CurrScore);
1102 PendingSCCWrite = &Inst;
1103 }
1104 }
1105}
1106
1107void WaitcntBrackets::print(raw_ostream &OS) const {
1108 const GCNSubtarget *ST = Context->ST;
1109
1110 OS << '\n';
1111 for (auto T : inst_counter_types(Context->MaxCounter)) {
1112 unsigned SR = getScoreRange(T);
1113
1114 switch (T) {
1115 case LOAD_CNT:
1116 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1117 << SR << "): ";
1118 break;
1119 case DS_CNT:
1120 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1121 << SR << "): ";
1122 break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "): ";
1125 break;
1126 case STORE_CNT:
1127 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1128 << SR << "): ";
1129 break;
1130 case SAMPLE_CNT:
1131 OS << " SAMPLE_CNT(" << SR << "): ";
1132 break;
1133 case BVH_CNT:
1134 OS << " BVH_CNT(" << SR << "): ";
1135 break;
1136 case KM_CNT:
1137 OS << " KM_CNT(" << SR << "): ";
1138 break;
1139 case X_CNT:
1140 OS << " X_CNT(" << SR << "): ";
1141 break;
1142 default:
1143 OS << " UNKNOWN(" << SR << "): ";
1144 break;
1145 }
1146
1147 if (SR != 0) {
1148
1149 unsigned LB = getScoreLB(T);
1150
1151 for (int J = 0; J <= VgprUB; J++) {
1152 unsigned RegScore = getRegScore(J, T);
1153 if (RegScore <= LB)
1154 continue;
1155 unsigned RelScore = RegScore - LB - 1;
1156 if (J < FIRST_LDS_VGPR) {
1157 OS << RelScore << ":v" << J << " ";
1158 } else {
1159 OS << RelScore << ":ds ";
1160 }
1161 }
1162
1163 if (isSmemCounter(T)) {
1164 for (int J = 0; J <= SgprUB; J++) {
1165 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1166 if (RegScore <= LB)
1167 continue;
1168 unsigned RelScore = RegScore - LB - 1;
1169 OS << RelScore << ":s" << J << " ";
1170 }
1171 }
1172 if (T == KM_CNT && SCCScore > 0)
1173 OS << SCCScore << ":scc ";
1174 }
1175 OS << '\n';
1176 }
1177
1178 OS << "Pending Events: ";
1179 if (hasPendingEvent()) {
1180 ListSeparator LS;
1181 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1182 if (hasPendingEvent((WaitEventType)I)) {
1183 OS << LS << WaitEventTypeName[I];
1184 }
1185 }
1186 } else {
1187 OS << "none";
1188 }
1189 OS << '\n';
1190
1191 OS << '\n';
1192}
1193
1194
1195
1196void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
1197 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1198 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1199 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1200 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1201 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1202 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1203 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
  simplifyWaitcnt(X_CNT, Wait.XCnt);
}
1206
1207void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1208 unsigned &Count) const {
1209
1210
1211
  if (Count >= getScoreRange(T))
    Count = ~0u;
}
1215
1216void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1217 AMDGPU::Waitcnt &Wait) const {
1218 const unsigned LB = getScoreLB(T);
1219 const unsigned UB = getScoreUB(T);
1220 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1221 unsigned ScoreToWait = getRegScore(RegNo, T);
1222
1223
1224
1225 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1226 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
          !Context->ST->hasFlatLgkmVMemCountInOrder()) {
        // If there is a pending FLAT operation, and this is a VMem or LGKM
        // waitcnt and the target can report early completion, then we need
        // to force a waitcnt 0.
        addWait(Wait, T, 0);
      } else if (counterOutOfOrder(T)) {
        // Counter can get decremented out-of-order when there are multiple
        // types of event in the bracket, so conservatively wait for 0.
        addWait(Wait, T, 0);
      } else {
1238
1239
1240 unsigned NeededWait =
1241 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1242 addWait(Wait, T, NeededWait);
1243 }
1244 }
1245 }
1246}
1247
1248void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1249
1250
1251 if (PendingSCCWrite &&
1252 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1254 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1255
1256 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1257 SCC_WRITE_PendingEvent) {
1258 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1259 }
1260
1261 PendingEvents &= ~SCC_WRITE_PendingEvent;
1262 PendingSCCWrite = nullptr;
1263 }
1264}
1265
1266void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1267 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1268 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1269 applyWaitcnt(DS_CNT, Wait.DsCnt);
1270 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1271 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1272 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1273 applyWaitcnt(KM_CNT, Wait.KmCnt);
1274 applyWaitcnt(X_CNT, Wait.XCnt);
1275}
1276
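// Simulate the effect of an "s_wait <T> Count": a count of zero retires every
// pending event of type T, while a nonzero count only raises the lower bound
// (and only when counter T completes in order).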
1277void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1278 const unsigned UB = getScoreUB(T);
1279 if (Count >= UB)
1280 return;
1281 if (Count != 0) {
1282 if (counterOutOfOrder(T))
1283 return;
1284 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1285 } else {
1286 setScoreLB(T, UB);
1287 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1288 }
1289}
1290
1291bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
  // The SMEM operations in a pending SMEM_GROUP count against both KM_CNT and
  // X_CNT, so waiting for KMcnt to reach zero already covers them and a
  // separate XCNT wait would be redundant.
1295 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1296}
1297
1298bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
  // A pending VMEM group also counts against LOADcnt, so when no stores are
  // pending the XCNT wait can be folded into the LOADcnt wait.
1302 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1303 !hasPendingEvent(STORE_CNT);
1304}
1305
1306void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1307 AMDGPU::Waitcnt &UpdateWait) {
1308
1309
1310
1311
1312
1313 if (hasRedundantXCntWithKmCnt(CheckWait)) {
1314 if (!hasMixedPendingEvents(X_CNT)) {
1315 applyWaitcnt(X_CNT, 0);
1316 } else {
1317 PendingEvents &= ~(1 << SMEM_GROUP);
1318 }
1319 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1320 if (!hasMixedPendingEvents(X_CNT)) {
1321 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1322 } else if (CheckWait.LoadCnt == 0) {
1323 PendingEvents &= ~(1 << VMEM_GROUP);
1324 }
1325 }
1326 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1327}
1328
// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
1331bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1332
1333 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1334 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1335 return true;
1336 return hasMixedPendingEvents(T);
1337}
1338
INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
                    false, false)

char SIInsertWaitcntsLegacy::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcntsLegacy();
}
1353
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(OpIdx >= 0);

  MachineOperand &MO = MI.getOperand(OpIdx);

  if (NewEnc == MO.getImm())
    return false;

  MO.setImm(NewEnc);
  return true;
}
1367
// Determine if \p Opcode is a GFX12+ wait instruction, and if so return the
// counter it waits on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
  switch (Opcode) {
1372 case AMDGPU::S_WAIT_LOADCNT:
1373 return LOAD_CNT;
1374 case AMDGPU::S_WAIT_EXPCNT:
1375 return EXP_CNT;
1376 case AMDGPU::S_WAIT_STORECNT:
1377 return STORE_CNT;
1378 case AMDGPU::S_WAIT_SAMPLECNT:
1379 return SAMPLE_CNT;
1380 case AMDGPU::S_WAIT_BVHCNT:
1381 return BVH_CNT;
1382 case AMDGPU::S_WAIT_DSCNT:
1383 return DS_CNT;
1384 case AMDGPU::S_WAIT_KMCNT:
1385 return KM_CNT;
1386 case AMDGPU::S_WAIT_XCNT:
1387 return X_CNT;
1388 default:
1389 return {};
1390 }
1391}
1392
1393bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
  if (Opcode == Waitcnt->getOpcode())
1396 return false;
1397
  Waitcnt->setDesc(TII->get(Opcode));
  return true;
1400}
1401
1402
1403
1404
1405
1406
1407bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1408 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1411 assert(isNormalMode(MaxCounter));
1412
1414 MachineInstr *WaitcntInstr = nullptr;
1415 MachineInstr *WaitcntVsCntInstr = nullptr;
1416
1418 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1420 dbgs() << "end of block\n";
1421 else
1422 dbgs() << *It;
1423 });
1424
1425 for (auto &II :
1428 if (II.isMetaInstruction()) {
1430 continue;
1431 }
1432
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1435
1436
1437
1438 if (Opcode == AMDGPU::S_WAITCNT) {
1439 unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
1442 ScoreBrackets.simplifyWaitcnt(OldWait);
1443 Wait = Wait.combined(OldWait);
1444
1445
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1447 II.eraseFromParent();
1449 } else
1450 WaitcntInstr = &II;
1451 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1452 assert(ST->hasVMemToLDSLoad());
1453 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1454 << "Before: " << Wait.LoadCnt << '\n';);
1455 ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
1457
1458
1459
1460
1461
1462
1463
1464 II.eraseFromParent();
1465 } else {
1466 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1467 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1468
1469 unsigned OldVSCnt =
1470 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1471 if (TrySimplify)
1472 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1473 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1474
      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1476 II.eraseFromParent();
1478 } else
1479 WaitcntVsCntInstr = &II;
1480 }
1481 }
1482
  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);
1487
1488 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1489 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1490 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;
1497 << "applied pre-existing waitcnt\n"
1498 << "New Instr at block end: " << *WaitcntInstr << '\n'
1499 : dbgs() << "applied pre-existing waitcnt\n"
1500 << "Old Instr: " << *It
1501 << "New Instr: " << *WaitcntInstr << '\n');
1502 }
1503
1504 if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
1507 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1508
1509 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1511
1513 ? dbgs() << "applied pre-existing waitcnt\n"
1514 << "New Instr at block end: " << *WaitcntVsCntInstr
1515 << '\n'
1516 : dbgs() << "applied pre-existing waitcnt\n"
1517 << "Old Instr: " << *It
1518 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1519 }
1520
1522}
1523
1524
1525
1526bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1528 AMDGPU::Waitcnt Wait) {
1530 assert(isNormalMode(MaxCounter));
1531
1534
1535
1536
1537 if (Wait.hasWaitExceptStoreCnt()) {
1539 [[maybe_unused]] auto SWaitInst =
1542
1544 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1545 dbgs() << "New Instr: " << *SWaitInst << '\n');
1546 }
1547
1548 if (Wait.hasWaitStoreCnt()) {
1550
1551 [[maybe_unused]] auto SWaitInst =
1556
1558 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1559 dbgs() << "New Instr: " << *SWaitInst << '\n');
1560 }
1561
1563}
1564
1565AMDGPU::Waitcnt
1566WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1567 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1568}
1569
1570AMDGPU::Waitcnt
1571WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1572 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1573 ~0u );
1574}
1575
1576
1577
1578
1579
1580bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1581 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1584 assert(!isNormalMode(MaxCounter));
1585
1587 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1588 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1589 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1590
1592 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1594 dbgs() << "end of block\n";
1595 else
1596 dbgs() << *It;
1597 });
1598
1599 for (auto &II :
1602 if (II.isMetaInstruction()) {
1604 continue;
1605 }
1606
1607 MachineInstr **UpdatableInstr;
1608
1609
1610
1611
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1614
1615
1616
1617 if (Opcode == AMDGPU::S_WAITCNT)
1618 continue;
1619
1620 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1621 unsigned OldEnc =
1622 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      if (TrySimplify)
1625 ScoreBrackets.simplifyWaitcnt(OldWait);
1626 Wait = Wait.combined(OldWait);
1627 UpdatableInstr = &CombinedLoadDsCntInstr;
1628 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1629 unsigned OldEnc =
1630 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      if (TrySimplify)
1633 ScoreBrackets.simplifyWaitcnt(OldWait);
1634 Wait = Wait.combined(OldWait);
1635 UpdatableInstr = &CombinedStoreDsCntInstr;
1636 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1637
1638
1639 II.eraseFromParent();
1640 continue;
1641 } else {
1643 assert(CT.has_value());
1644 unsigned OldCnt =
1645 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1646 if (TrySimplify)
1647 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1648 addWait(Wait, CT.value(), OldCnt);
1649 UpdatableInstr = &WaitInstrs[CT.value()];
1650 }
1651
1652
1653 if (!*UpdatableInstr) {
1654 *UpdatableInstr = &II;
1655 } else {
1656 II.eraseFromParent();
1658 }
1659 }
1660
1661
1662 AMDGPU::Waitcnt PreCombine = Wait;
1663 if (CombinedLoadDsCntInstr) {
1664
1665
1666
1667
1668
1669
1670
1671 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
1675 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1676 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1677 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1680
1682 ? dbgs() << "applied pre-existing waitcnt\n"
1683 << "New Instr at block end: "
1684 << *CombinedLoadDsCntInstr << '\n'
1685 : dbgs() << "applied pre-existing waitcnt\n"
1686 << "Old Instr: " << *It << "New Instr: "
1687 << *CombinedLoadDsCntInstr << '\n');
1688 } else {
1691 }
1692 }
1693
1694 if (CombinedStoreDsCntInstr) {
1695
1696 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
1700 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1701 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1702 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1705
1707 ? dbgs() << "applied pre-existing waitcnt\n"
1708 << "New Instr at block end: "
1709 << *CombinedStoreDsCntInstr << '\n'
1710 : dbgs() << "applied pre-existing waitcnt\n"
1711 << "Old Instr: " << *It << "New Instr: "
1712 << *CombinedStoreDsCntInstr << '\n');
1713 } else {
1716 }
1717 }
1718
1719
1720
1721
1722
1723
1724
1725 if (Wait.DsCnt != ~0u) {
1726
1727
    SmallVector<MachineInstr **, 2> WaitsToErase;
1730
1731
1732
1733
1734 if (Wait.LoadCnt != ~0u) {
1735 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1736 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1737 } else if (Wait.StoreCnt != ~0u) {
1738 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1739 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1740 }
1741
1742 for (MachineInstr **WI : WaitsToErase) {
1743 if (!*WI)
1744 continue;
1745
1746 (*WI)->eraseFromParent();
1747 *WI = nullptr;
1749 }
1750 }
1751
1752 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1753 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1754 (CT == LOAD_CNT &&
1755 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1756
1757
1758 ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1759 }
1760 if (!WaitInstrs[CT])
1761 continue;
1762
1763 unsigned NewCnt = getWait(Wait, CT);
1764 if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
1767 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1768
1769 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1770 setNoWait(Wait, CT);
1771
1773 ? dbgs() << "applied pre-existing waitcnt\n"
1774 << "New Instr at block end: " << *WaitInstrs[CT]
1775 << '\n'
1776 : dbgs() << "applied pre-existing waitcnt\n"
1777 << "Old Instr: " << *It
1778 << "New Instr: " << *WaitInstrs[CT] << '\n');
1779 } else {
1782 }
1783 }
1784
1786}
1787
1788
1789bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1791 AMDGPU::Waitcnt Wait) {
1793 assert(!isNormalMode(MaxCounter));
1794
1797
1798
1799 if (Wait.DsCnt != ~0u) {
1800 MachineInstr *SWaitInst = nullptr;
1801
1802 if (Wait.LoadCnt != ~0u) {
1804
1807
1810 } else if (Wait.StoreCnt != ~0u) {
1812
1813 SWaitInst =
1816
1819 }
1820
1821 if (SWaitInst) {
1823
1825 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1826 dbgs() << "New Instr: " << *SWaitInst << '\n');
1827 }
1828 }
1829
1830
1831
1832
1833 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1834 unsigned Count = getWait(Wait, CT);
1835 if (Count == ~0u)
1836 continue;
1837
1838 [[maybe_unused]] auto SWaitInst =
1841
1843
1845 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1846 dbgs() << "New Instr: " << *SWaitInst << '\n');
1847 }
1848
1850}
1851
1852
1854
1855
1856
1857
1858 return true;
1859}
1860
1861
1862
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1878 WaitcntBrackets &ScoreBrackets,
1879 MachineInstr *OldWaitcntInstr,
1880 bool FlushVmCnt) {
1881 setForceEmitWaitcnt();
1882
  assert(!MI.isMetaInstruction());
1884
1885 AMDGPU::Waitcnt Wait;
1886 const unsigned Opc = MI.getOpcode();
1887
1888
1889
1890
1891
1892 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
1893 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
1894 Opc == AMDGPU::BUFFER_GL1_INV) {
1895 Wait.LoadCnt = 0;
1896 }
1897
1898
1899
1900
1901 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
1902 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1903 Opc == AMDGPU::S_SETPC_B64_return ||
1905 Wait = Wait.combined(WCG->getAllZeroWaitcnt(false));
1906 }
1907
1908
1909
1910
1911
1912
1913
1914
1915 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
1916 if (!WCG->isOptNone() &&
        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1918 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1919 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1920 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1921 ReleaseVGPRInsts.insert(&MI);
1922 }
1923
1924 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
1925 ST->hasLegacyGeometry() &&
1928 Wait.LoadCnt = 0;
1929 }
1930
1931
1932
1933
1934
1935 else {
1936 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1937
1938
1939 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1940 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1941 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1942 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1943 Wait.ExpCnt = 0;
1944 }
1945 }
1946
1947
1948
1949 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
1950 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1951
1953
1954
1955
1956 Wait = AMDGPU::Waitcnt();
1957
1958 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1959 if (CallAddrOp.isReg()) {
1960 RegInterval CallAddrOpInterval =
1961 ScoreBrackets.getRegInterval(&MI, CallAddrOp);
1962
1963 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1965
1966 if (const auto *RtnAddrOp =
1967 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1968 RegInterval RtnAddrOpInterval =
1969 ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);
1970
1971 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1973 }
1974 }
1975 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
1976 ScoreBrackets.tryClearSCCWriteEvent(&MI);
1977 } else {
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992 for (const MachineMemOperand *Memop : MI.memoperands()) {
1993 const Value *Ptr = Memop->getValue();
1994 if (Memop->isStore()) {
1995 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
1996 addWait(Wait, SmemAccessCounter, 0);
1997 if (PDT->dominates(MI.getParent(), It->second))
1998 SLoadAddresses.erase(It);
1999 }
2000 }
2001 unsigned AS = Memop->getAddrSpace();
2003 continue;
2004
2005 if (TII->mayWriteLDSThroughDMA(MI))
2006 continue;
2007
2008
2009 unsigned RegNo = FIRST_LDS_VGPR;
2010 if (Ptr && Memop->getAAInfo()) {
2011 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2012 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2013 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2014 if ((I + 1) >= NUM_LDS_VGPRS) {
2015
2016
2017 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
2018 break;
2019 }
2020
2021 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
2022 }
2023 }
2024 } else {
2025 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
2026 }
2027
2028 if (Memop->isStore())
2029 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
2030 }
2031
2032
2033 for (const MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
2035 continue;
2036
2037
2038 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2039 continue;
2040
2041 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);
2042
2043 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2044 if (IsVGPR) {
2045
2046
2047
2048
2049
2050 if (Op.isImplicit() && MI.mayLoadOrStore())
2051 continue;
2052
2053
2054
2055
2056
2057
2058
2059 if (Op.isUse() || !updateVMCntOnly(MI) ||
2060 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
2061 getVmemType(MI)) ||
2062 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
            !ST->hasVmemWriteVgprInOrder()) {
2064 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
2065 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
2066 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
2067 ScoreBrackets.clearVgprVmemTypes(Interval);
2068 }
2069
2070 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2071 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
2072 }
2073 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
2074 } else if (Op.getReg() == AMDGPU::SCC) {
2075 ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
2076 } else {
2077 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
2078 }
2079
2080 if (ST->hasWaitXCnt() && Op.isDef())
2081 ScoreBrackets.determineWait(X_CNT, Interval, Wait);
2082 }
2083 }
2084 }
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
      !ST->supportsBackOffBarrier()) {
2100 Wait = Wait.combined(WCG->getAllZeroWaitcnt(true));
2101 }
2102
2103
2104
2105
2107 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2108 Wait.DsCnt = 0;
2109 }
2110
2111
2112 ScoreBrackets.simplifyWaitcnt(Wait);
2113
2114
2115
2116
2117 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2118 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2120 }
2121
2122
2123
  if (ForceEmitZeroFlag)
    Wait = WCG->getAllZeroWaitcnt(false);
2126
2127 if (ForceEmitWaitcnt[LOAD_CNT])
2128 Wait.LoadCnt = 0;
2129 if (ForceEmitWaitcnt[EXP_CNT])
2130 Wait.ExpCnt = 0;
2131 if (ForceEmitWaitcnt[DS_CNT])
2132 Wait.DsCnt = 0;
2133 if (ForceEmitWaitcnt[SAMPLE_CNT])
2134 Wait.SampleCnt = 0;
2135 if (ForceEmitWaitcnt[BVH_CNT])
2136 Wait.BvhCnt = 0;
2137 if (ForceEmitWaitcnt[KM_CNT])
2138 Wait.KmCnt = 0;
2139 if (ForceEmitWaitcnt[X_CNT])
2140 Wait.XCnt = 0;
2141
2142 if (FlushVmCnt) {
2143 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2144 Wait.LoadCnt = 0;
2145 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2146 Wait.SampleCnt = 0;
2147 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2148 Wait.BvhCnt = 0;
2149 }
2150
  if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
    Wait.LoadCnt = 0;
2153
2154 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2155 OldWaitcntInstr);
2156}
2157
2158bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
2161 WaitcntBrackets &ScoreBrackets,
2162 MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

2165 if (OldWaitcntInstr)
2166
2167
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2170
2171
2172
2173 ScoreBrackets.applyWaitcnt(Wait);
2174
2175
2176 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2178 MachineOperand *WaitExp =
2179 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2180 if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

2187 << "Update Instr: " << *It);
2188 }
2189
2190 if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
2195
2196bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2197 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2199}
2200
2201
2202
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
                                    MachineBasicBlock *Block) const {
2205 auto BlockEnd = Block->getParent()->end();
2206 auto BlockIter = Block->getIterator();
2207
2208 while (true) {
2209 if (It.isEnd()) {
2210 if (++BlockIter != BlockEnd) {
2211 It = BlockIter->instr_begin();
2212 continue;
2213 }
2214
2215 return false;
2216 }
2217
2218 if (!It->isMetaInstruction())
2219 break;
2220
2221 It++;
2222 }
2223
2224 assert(!It.isEnd());
2225
2226 return It->getOpcode() == AMDGPU::S_ENDPGM;
2227}
2228
2229
2230bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2231 MachineBasicBlock &Block,
2232 WaitcntBrackets &ScoreBrackets) {
2233 AMDGPU::Waitcnt Wait;
2234 bool NeedsEndPGMCheck = false;
2235
2237 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2239
2241 Wait.DsCnt = 0;
2242 NeedsEndPGMCheck = true;
2243 }
2244
2245 ScoreBrackets.simplifyWaitcnt(Wait);
2246
2247 auto SuccessorIt = std::next(Inst.getIterator());
2248 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2249 nullptr);
2250
2251 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2254 }
2255
2257}
2258
2259void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2260 WaitcntBrackets *ScoreBrackets) {
2261
2262
2263
2264
2265
2266
2267
2268 bool IsVMEMAccess = false;
2269 bool IsSMEMAccess = false;
2270 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2273 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2274 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2275 ScoreBrackets->setPendingGDS();
2276 } else {
2277 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2278 }
2279 } else if (TII->isFLAT(Inst)) {
2281 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2282 return;
2283 }
2284
2286
2287 int FlatASCount = 0;
2288
2289 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2290 ++FlatASCount;
2291 IsVMEMAccess = true;
2292 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2293 }
2294
2295 if (TII->mayAccessLDSThroughFlat(Inst)) {
2296 ++FlatASCount;
2297 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2298 }
2299
2300
2301
2302
2303
2304
2306 ScoreBrackets->setPendingFlat();
2309 IsVMEMAccess = true;
2310 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2311
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2314 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2315 }
2316 } else if (TII->isSMRD(Inst)) {
2317 IsSMEMAccess = true;
2318 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2319 } else if (Inst.isCall()) {
2321
2322 ScoreBrackets->applyWaitcnt(
2323 WCG->getAllZeroWaitcnt(false));
2324 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2325 } else {
2326
2327 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2328 }
  } else if (TII->isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2331 } else if (TII->isVINTERP(Inst)) {
2332 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2333 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2335 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2337 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2339 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2340 else
2341 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2343 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2344 } else {
2346 case AMDGPU::S_SENDMSG:
2347 case AMDGPU::S_SENDMSG_RTN_B32:
2348 case AMDGPU::S_SENDMSG_RTN_B64:
2349 case AMDGPU::S_SENDMSGHALT:
2350 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2351 break;
2352 case AMDGPU::S_MEMTIME:
2353 case AMDGPU::S_MEMREALTIME:
2354 case AMDGPU::S_GET_BARRIER_STATE_M0:
2355 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2356 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2357 break;
2358 }
2359 }
2360
  if (!ST->hasWaitXCnt())
2362 return;
2363
2364 if (IsVMEMAccess)
2365 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2366
2367 if (IsSMEMAccess)
2368 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2369}
2370
2371bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2372 unsigned OtherScore) {
2373 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2374 unsigned OtherShifted =
2375 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2376 Score = std::max(MyShifted, OtherShifted);
2377 return OtherShifted > MyShifted;
2378}
2379
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires a tighter
/// wait (i.e. the merged brackets strictly dominate the original ones).
2385bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2386 bool StrictDom = false;
2387
2388 VgprUB = std::max(VgprUB, Other.VgprUB);
2389 SgprUB = std::max(SgprUB, Other.SgprUB);
2390
2391 for (auto T : inst_counter_types(Context->MaxCounter)) {
2392
2393 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2394 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2395 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2396 if (OtherEvents & ~OldEvents)
2397 StrictDom = true;
2398 PendingEvents |= OtherEvents;
2399
2400
2401 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2402 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2403 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2404 if (NewUB < ScoreLBs[T])
2406
2407 MergeInfo M;
2409 M.OtherLB = Other.ScoreLBs[T];
2410 M.MyShift = NewUB - ScoreUBs[T];
2411 M.OtherShift = NewUB - Other.ScoreUBs[T];
2412
2413 ScoreUBs[T] = NewUB;
2414
2415 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2416
2417 if (T == DS_CNT)
2418 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2419
2420 if (T == KM_CNT) {
2421 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2422 if (Other.hasPendingEvent(SCC_WRITE)) {
2423 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2424 if (!OldEventsHasSCCWrite) {
2425 PendingSCCWrite = Other.PendingSCCWrite;
2426 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2427 PendingSCCWrite = nullptr;
2428 }
2429 }
2430 }
2431
2432 for (int J = 0; J <= VgprUB; J++)
2433 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2434
2435 if (isSmemCounter(T)) {
2436 unsigned Idx = getSgprScoresIdx(T);
2437 for (int J = 0; J <= SgprUB; J++)
2438 StrictDom |=
2439 mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
2440 }
2441 }
2442
2443 for (int J = 0; J <= VgprUB; J++) {
2444 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2445 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2446 VgprVmemTypes[J] = NewVmemTypes;
2447 }
2448
2449 return StrictDom;
2450}
2451
2454 return Opcode == AMDGPU::S_WAITCNT ||
2455 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2457 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2458 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2459 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2461}
2462
2463
2464bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2465 MachineBasicBlock &Block,
2466 WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

2470 dbgs() << "*** Begin Block: ";
2472 ScoreBrackets.dump();
2473 });
2474
2475
2476
2477
2478 bool VCCZCorrect = true;
2479 if (ST->hasReadVCCZBug()) {
2480
2481
2482 VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2484
2485
2486 VCCZCorrect = false;
2487 }
2488
2489
2490 MachineInstr *OldWaitcntInstr = nullptr;
2491
  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
2495 MachineInstr &Inst = *Iter;
    if (Inst.isMetaInstruction()) {
      ++Iter;
2498 continue;
2499 }
2500
2501
2502
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
2505 OldWaitcntInstr = &Inst;
2506 ++Iter;
2507 continue;
2508 }
2509
2510 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2511 isPreheaderToFlush(Block, ScoreBrackets);
2512
2513
2514 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2515 FlushVmCnt);
2516 OldWaitcntInstr = nullptr;
2517
2518
2520
2521
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2523 if (Inst.definesRegister(AMDGPU::VCC_LO, nullptr) ||
2524 Inst.definesRegister(AMDGPU::VCC_HI, nullptr)) {
2525
        if (!ST->partialVCCWritesUpdateVCCZ())
2527 VCCZCorrect = false;
2528 } else if (Inst.definesRegister(AMDGPU::VCC, nullptr)) {
2529
2530
2531
2532
2533
2534
2535
2536 if (ST->hasReadVCCZBug() &&
2537 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2538
2539
2540 VCCZCorrect = false;
2541 } else {
2542
2543 VCCZCorrect = true;
2544 }
2545 }
2546 }
2547
2548 if (TII->isSMRD(Inst)) {
2549 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2550
2551
2552 if (!Memop->isInvariant()) {
2553 const Value *Ptr = Memop->getValue();
2554 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2555 }
2556 }
2557 if (ST->hasReadVCCZBug()) {
2558
2559 VCCZCorrect = false;
2560 }
2561 }
2562
2563 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2564
2565 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2566
2569 ScoreBrackets.dump();
2570 });
2571
2572
2573
2574 if (RestoreVCCZ) {
2575
2576
      // Restore the vccz bit by copying vcc onto itself.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }
2585
2586 ++Iter;
2587 }
2588
2589 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block
2590 // if this block is a preheader of a loop that we decided needs the flush.
2591 AMDGPU::Waitcnt Wait;
2592 if (Block.getFirstTerminator() == Block.end() &&
2593 isPreheaderToFlush(Block, ScoreBrackets)) {
2594 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2595 Wait.LoadCnt = 0;
2596 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2597 Wait.SampleCnt = 0;
2598 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2599 Wait.BvhCnt = 0;
2600 }
2601
2602 // Combine or remove any redundant waitcnts at the end of the block.
2603 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2604 OldWaitcntInstr);
2605
2606 LLVM_DEBUG({
2607 dbgs() << "*** End Block: ";
2608 Block.printName(dbgs());
2609 ScoreBrackets.dump();
2610 });
2611
2612 return Modified;
2613}
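// As a rough illustration of the output of this function: on pre-gfx12
// targets a full flush inserted here is a single instruction such as
//   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
// while on gfx12+ the counters are split, so flushing the VMEM load counters
// uses separate instructions like s_wait_loadcnt 0, s_wait_samplecnt 0 and
// s_wait_bvhcnt 0.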
2614
2615// Return true if the given machine basic block is a preheader of a loop in
2616// which we want to flush the vmcnt counter, and false otherwise.
2617bool SIInsertWaitcnts::isPreheaderToFlush(
2618 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2619 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2620 if (!IsInserted)
2621 return Iterator->second;
2622
2623 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2624 if (!Succ)
2625 return false;
2626
2627 MachineLoop *Loop = MLI->getLoopFor(Succ);
2628 if (!Loop)
2629 return false;
2630
2631 if (Loop->getLoopPreheader() == &MBB &&
2632 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2633 Iterator->second = true;
2634 return true;
2635 }
2636
2637 return false;
2638}
2639
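// FLAT instructions can address either LDS or global/scratch memory, so they
// only count as VMEM here when they may actually reach VMEM; other VMEM
// (buffer, image, global, scratch) instructions qualify unconditionally.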
2640bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2641 if (SIInstrInfo::isFLAT(MI))
2642 return TII->mayAccessVMEMThroughFlat(MI);
2643 return SIInstrInfo::isVMEM(MI);
2644}
2645
2646// Return true if it is better to flush the vmcnt counter in the preheader of
2647// the given loop. We currently decide to flush in two situations:
2648// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2649//    vgpr containing a value that is loaded outside of the loop. (Only on
2650//    targets with no vscnt counter).
2651// 2. The loop contains vmem load(s), but the loaded values are not used in
2652//    the loop, and at least one use of a vgpr containing a value that is
2653//    loaded outside of the loop.
2654bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2655 const WaitcntBrackets &Brackets) {
2656 bool HasVMemLoad = false;
2657 bool HasVMemStore = false;
2658 bool UsesVgprLoadedOutside = false;
2659 DenseSet<Register> VgprUse;
2660 DenseSet<Register> VgprDef;
2661
2662 for (MachineBasicBlock *MBB : ML->blocks()) {
2663 for (MachineInstr &MI : *MBB) {
2664 if (isVMEMOrFlatVMEM(MI)) {
2665 HasVMemLoad |= MI.mayLoad();
2666 HasVMemStore |= MI.mayStore();
2667 }
2668
2669 for (const MachineOperand &Op : MI.all_uses()) {
2670 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2671 continue;
2672 RegInterval Interval = Brackets.getRegInterval(&MI, Op);
2673
2674 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2675 // If we find a register that is loaded inside the loop, 1. and 2.
2676 // are invalidated and we can exit.
2677 if (VgprDef.contains(RegNo))
2678 return false;
2679 VgprUse.insert(RegNo);
2680 // If at least one of Op's registers is in the score brackets, the
2681 // value is likely loaded outside of the loop.
2682 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2683 Brackets.getScoreLB(LOAD_CNT) ||
2684 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2685 Brackets.getScoreLB(SAMPLE_CNT) ||
2686 Brackets.getRegScore(RegNo, BVH_CNT) >
2687 Brackets.getScoreLB(BVH_CNT)) {
2688 UsesVgprLoadedOutside = true;
2689 break;
2690 }
2691 }
2692 }
2693
2694 // VMem load vgpr def
2695 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2696 for (const MachineOperand &Op : MI.all_defs()) {
2697 RegInterval Interval = Brackets.getRegInterval(&MI, Op);
2698 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2699 // If we find a register that is loaded inside the loop, 1. and 2.
2700 // are invalidated and we can exit.
2701 if (VgprUse.contains(RegNo))
2702 return false;
2703 VgprDef.insert(RegNo);
2704 }
2705 }
2706 }
2707 }
2708 }
2709 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2710 return true;
2711 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2712}
2713
2714bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2715 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2716 auto *PDT =
2717 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2718 AliasAnalysis *AA = nullptr;
2719 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2720 AA = &AAR->getAAResults();
2721
2722 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2723}
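// The legacy wrapper above and the new pass manager wrapper that follows are
// thin shims: both gather MachineLoopInfo, the post-dominator tree and (when
// available) alias analysis, then delegate the work to SIInsertWaitcnts::run().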
2724
2725PreservedAnalyses
2726SIInsertWaitcntsPass::run(MachineFunction &MF,
2727 MachineFunctionAnalysisManager &MFAM) {
2728 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2729 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2730 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2731 .getManager()
2732 .getCachedResult<AAManager>(MF.getFunction());
2733
2734 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2735 return PreservedAnalyses::all();
2736
2737 return getMachineFunctionPassPreservedAnalyses()
2738 .preserveSet<CFGAnalyses>()
2739 .preserve<AAManager>();
2740}
2741
2742bool SIInsertWaitcnts::run(MachineFunction &MF) {
2743 ST = &MF.getSubtarget<GCNSubtarget>();
2744 TII = ST->getInstrInfo();
2745 TRI = &TII->getRegisterInfo();
2746 MRI = &MF.getRegInfo();
2747 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2748
2749 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
2750
2751 if (ST->hasExtendedWaitCounts()) {
2752 MaxCounter = NUM_EXTENDED_INST_CNTS;
2753 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2754 WCG = &WCGGFX12Plus;
2755 } else {
2756 MaxCounter = NUM_NORMAL_INST_CNTS;
2757 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
2758 WCG = &WCGPreGFX12;
2759 }
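// The two generators differ mainly in how waits are encoded:
// WaitcntGeneratorPreGFX12 packs vmcnt/expcnt/lgkmcnt into a single S_WAITCNT
// immediate (with S_WAITCNT_VSCNT for stores), whereas
// WaitcntGeneratorGFX12Plus emits the separate per-counter S_WAIT_*
// instructions used with the extended counters.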
2760
2761 for (auto T : inst_counter_types())
2762 ForceEmitWaitcnt[T] = false;
2763
2764 WaitEventMaskForInst = WCG->getWaitEventMask();
2765
2766 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2767
2768 if (ST->hasExtendedWaitCounts()) {
2769 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2770 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2771 } else {
2772 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2773 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2774 }
2775 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2776 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2777 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2778 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2779 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2780 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2781
2782 [[maybe_unused]] unsigned NumVGPRsMax =
2783 ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
2784 [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2785 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2786 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2787
2788 BlockInfos.clear();
2789 bool Modified = false;
2790
2791 MachineBasicBlock &EntryBB = MF.front();
2792 MachineBasicBlock::iterator I = EntryBB.begin();
2793
2794 if (!MFI->isEntryFunction()) {
2795 // Wait for any outstanding memory operations that the input registers may
2796 // depend on. We can't track them and it's better to do the wait after the
2797 // costly call sequence.
2798
2799 // TODO: Could insert earlier and schedule more liberally with operations
2800 // that only use caller preserved registers.
2801 for (MachineBasicBlock::iterator E = EntryBB.end();
2802 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2803 ;
2804
2805 if (ST->hasExtendedWaitCounts()) {
2806 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2807 .addImm(0);
2808 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2809 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2810 continue;
2811
2812 if (!ST->hasImageInsts() &&
2813 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2814 continue;
2815
2816 BuildMI(EntryBB, I, DebugLoc(),
2817 TII->get(instrsForExtendedCounterTypes[CT]))
2818 .addImm(0);
2819 }
2820 } else {
2821 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2822 }
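// For example, the entry waits built above are a single S_WAITCNT with
// immediate 0 (printed as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)) on
// pre-gfx12 targets, and on gfx12+ an s_wait_loadcnt_dscnt 0 followed by one
// zero-wait per remaining relevant counter (expcnt, samplecnt, bvhcnt, kmcnt).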
2823
2824 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2825 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2826 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2827
2828 Modified = true;
2829 }
2830
2831 // Keep iterating over the blocks in reverse post order, inserting and
2832 // updating s_waitcnt where needed, until a fix point is reached.
2833 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2834 BlockInfos.try_emplace(MBB);
2835
2836 std::unique_ptr<WaitcntBrackets> Brackets;
2837 bool Repeat;
2838 do {
2839 Repeat = false;
2840
2841 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2842 ++BII) {
2843 MachineBasicBlock *MBB = BII->first;
2844 BlockInfo &BI = BII->second;
2845 if (!BI.Dirty)
2846 continue;
2847
2848 if (BI.Incoming) {
2849 if (!Brackets)
2850 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2851 else
2852 *Brackets = *BI.Incoming;
2853 } else {
2854 if (!Brackets) {
2855 Brackets = std::make_unique<WaitcntBrackets>(this);
2856 } else {
2857 // Reinitialize in-place rather than assigning from a temporary:
2858 // WaitcntBrackets is a large object and constructing a temporary copy
2859 // here would be wasteful.
2860 Brackets->~WaitcntBrackets();
2861 new (Brackets.get()) WaitcntBrackets(this);
2862 }
2863 }
2864
2865 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2866 BI.Dirty = false;
2867
2868 if (Brackets->hasPendingEvent()) {
2869 BlockInfo *MoveBracketsToSucc = nullptr;
2870 for (MachineBasicBlock *Succ : MBB->successors()) {
2871 auto *SuccBII = BlockInfos.find(Succ);
2872 BlockInfo &SuccBI = SuccBII->second;
2873 if (!SuccBI.Incoming) {
2874 SuccBI.Dirty = true;
2875 if (SuccBII <= BII) {
2877 Repeat = true;
2878 }
2879 if (!MoveBracketsToSucc) {
2880 MoveBracketsToSucc = &SuccBI;
2881 } else {
2882 SuccBI.Incoming = std::make_unique(*Brackets);
2883 }
2884 } else if (SuccBI.Incoming->merge(*Brackets)) {
2885 SuccBI.Dirty = true;
2886 if (SuccBII <= BII) {
2888 Repeat = true;
2889 }
2890 }
2891 }
2892 if (MoveBracketsToSucc)
2893 MoveBracketsToSucc->Incoming = std::move(Brackets);
2894 }
2895 }
2896 } while (Repeat);
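// Design note: the brackets object is std::move'd into at most one successor
// and copied for any others, so the common single-successor case avoids
// copying the fairly large WaitcntBrackets state on every CFG edge.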
2897
2898 if (ST->hasScalarStores()) {
2899 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2900 bool HaveScalarStores = false;
2901
2902 for (MachineBasicBlock &MBB : MF) {
2903 for (MachineInstr &MI : MBB) {
2904 if (!HaveScalarStores && TII->isScalarStore(MI))
2905 HaveScalarStores = true;
2906
2907 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2908 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2909 EndPgmBlocks.push_back(&MBB);
2910 }
2911 }
2912
2913 if (HaveScalarStores) {
2914 // If scalar writes are used, the cache must be flushed or else the
2915 // next wave to reuse the same scratch memory can be clobbered.
2916 //
2917 // Insert s_dcache_wb at wave termination points if there were any
2918 // scalar stores, and only if the cache hasn't already been flushed.
2919 // This could be improved by looking across blocks for flushes in
2920 // postdominating blocks from the stores, but an explicitly requested
2921 // flush is probably very rare.
2922 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2923 bool SeenDCacheWB = false;
2924
2925 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2926 I != E; ++I) {
2927 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2928 SeenDCacheWB = true;
2929 else if (TII->isScalarStore(*I))
2930 SeenDCacheWB = false;
2931
2932 // FIXME: It would be better to insert this before a waitcnt if any.
2933 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2934 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2935 !SeenDCacheWB) {
2936 Modified = true;
2937 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2938 }
2939 }
2940 }
2941 }
2942 }
2943
2944 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2945 // This is done in different ways depending on how the VGPRs were allocated
2946 // (i.e. whether we're in dynamic VGPR mode or not).
2947 // Skip deallocation if the kernel is waveslot limited rather than VGPR
2948 // limited; a short waveslot-limited kernel runs slower with the deallocation.
2949 if (MFI->isDynamicVGPREnabled()) {
2950 for (MachineInstr *MI : ReleaseVGPRInsts) {
2951 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2952 TII->get(AMDGPU::S_ALLOC_VGPR))
2953 .addImm(0);
2954 Modified = true;
2955 }
2956 } else {
2957 if (!ReleaseVGPRInsts.empty() &&
2958 (MF.getFrameInfo().hasCalls() ||
2959 ST->getOccupancyWithNumVGPRs(
2960 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
2961 false) <
2962 AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2963 for (MachineInstr *MI : ReleaseVGPRInsts) {
2964 if (ST->requiresNopBeforeDeallocVGPRs()) {
2965 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2966 TII->get(AMDGPU::S_NOP))
2967 .addImm(0);
2968 }
2969 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2970 TII->get(AMDGPU::S_SENDMSG))
2971 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2972 Modified = true;
2973 }
2974 }
2975 }
2976 ReleaseVGPRInsts.clear();
2977 PreheadersToFlush.clear();
2978 SLoadAddresses.clear();
2979
2980 return Modified;
2981}