LLVM: lib/Target/AMDGPU/GCNHazardRecognizer.cpp Source File

//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
// ... (remaining #include directives elided in the original listing)

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace
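// Illustrative behavior of parse() above, derived from its two checks (the
// inputs are hypothetical, not from the original source):
//   "42"  -> Value = 42, parse() returns false (success)
//   "142" -> O.error("'142' value must be in the range [0, 100]!")
//   "abc" -> O.error("'abc' value invalid for uint argument!")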

// Note: the option strings below are reconstructed; only the descriptions and
// variable names are verbatim from the original listing.
static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

static cl::opt<unsigned>
    NopPadding("amdgpu-snop-padding", cl::init(0), cl::Hidden,
               cl::desc("Insert a s_nop x before every instruction"));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF)
    : IsHazardRecognizerMode(false), CurrCycleInstr(nullptr), MF(MF),
      ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
      TRI(TII.getRegisterInfo()), TSchedModel(TII.getSchedModel()),
      ClauseUses(TRI.getNumRegUnits()), ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32 ||
         Opcode == AMDGPU::S_GETREG_B32_const;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE64_B32 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
         Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
}

static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return std::get<0>(AMDGPU::Hwreg::HwregEncoding::decode(RegOp->getImm()));
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (!IsHazardRecognizerMode) {
    if (checkWMMACoexecutionHazards(MI) > 0)
      return HazardType;
  }

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) &&
      checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}
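// Worked example for the loop above (assuming the .addImm(Arg - 1) encoding,
// where "s_nop N" waits N+1 cycles): Quantity = 10 emits two instructions,
//   s_nop 7   ; 8 wait states
//   s_nop 1   ; 2 wait states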

unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
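// ReleaseAtCycle of the sched class's first write ProcRes entry is how long
// the MFMA pipeline stays occupied; for example, a sched class whose pipe is
// released at cycle 16 makes this report 16 wait states (numbers here are
// illustrative, not from the original source).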

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return std::max(W, NopPadding.getValue());
}

int GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) &&
      checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() &&
       MI->readsRegister(AMDGPU::LDS_DIRECT, nullptr)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  if (isPermlane(*MI))
    return std::max(WaitStates, checkPermlaneHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so we can truncate EmittedInstrs to that size.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}
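// Illustration of the bookkeeping above (values are hypothetical): after
// AdvanceCycle() for an instruction MI with 3 wait states and
// getMaxLookAhead() >= 3, the front of EmittedInstrs reads
//   [nullptr, nullptr, MI, ...]
// where each nullptr stands for one empty wait-state cycle issued after MI.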

void GCNHazardRecognizer::RecedeCycle() {
  assert(!IsHazardRecognizerMode &&
         "Bottom-up scheduling shouldn't run in hazard recognizer mode");
}

//===----------------------------------------------------------------------===//
// Hazard Detection Helpers
//===----------------------------------------------------------------------===//

using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound };

using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>;
using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>;

// Search for a hazard, returning true if one is found and false if the search
// expires. Visited states are deduplicated so each (block, state) pair is
// walked at most once.
template <typename StateT>
static bool
hasHazard(StateT InitialState,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I) {
  struct StateMapKey {
    SmallVectorImpl<StateT> *States;
    unsigned Idx;
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      return LHS.States == RHS.States && LHS.Idx == RHS.Idx;
    }
  };
  struct StateMapKeyTraits : DenseMapInfo<StateMapKey> {
    static inline StateMapKey getEmptyKey() {
      return {DenseMapInfo<SmallVectorImpl<StateT> *>::getEmptyKey(),
              DenseMapInfo<unsigned>::getEmptyKey()};
    }
    static inline StateMapKey getTombstoneKey() {
      return {DenseMapInfo<SmallVectorImpl<StateT> *>::getTombstoneKey(),
              DenseMapInfo<unsigned>::getTombstoneKey()};
    }
    static unsigned getHashValue(const StateMapKey &Key) {
      return StateT::getHashValue((*Key.States)[Key.Idx]);
    }
    static unsigned getHashValue(const StateT &State) {
      return StateT::getHashValue(State);
    }
    static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) {
      const auto EKey = getEmptyKey();
      const auto TKey = getTombstoneKey();
      if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) ||
          StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey))
        return StateMapKey::isEqual(LHS, RHS);
      return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]);
    }
    static bool isEqual(const StateT &LHS, const StateMapKey &RHS) {
      if (StateMapKey::isEqual(RHS, getEmptyKey()) ||
          StateMapKey::isEqual(RHS, getTombstoneKey()))
        return false;
      return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]);
    }
  };

  SmallVector<StateT> States;
  DenseMap<StateMapKey, unsigned, StateMapKeyTraits> StateMap;
  SmallSetVector<std::pair<const MachineBasicBlock *, unsigned>, 16> Worklist;

  StateT State = InitialState;

  unsigned WorkIdx = 0;
  for (;;) {
    bool Expired = false;
    for (auto E = MBB->instr_rend(); I != E; ++I) {
      // No need to look at parent BUNDLE instructions.
      if (I->isBundle())
        continue;

      auto Result = IsHazard(State, *I);
      if (Result == HazardFound)
        return true;
      if (Result == HazardExpired) {
        Expired = true;
        break;
      }

      if (I->isInlineAsm() || I->isMetaInstruction())
        continue;

      UpdateState(State, *I);
    }

    if (!Expired) {
      unsigned StateIdx = States.size();
      StateMapKey Key = {&States, StateIdx};
      auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State);
      if (Insertion.second) {
        States.push_back(State);
      } else {
        StateIdx = Insertion.first->second;
      }
      for (const MachineBasicBlock *Pred : MBB->predecessors())
        Worklist.insert(std::pair(Pred, StateIdx));
    }

    if (WorkIdx == Worklist.size())
      break;

    unsigned StateIdx;
    std::tie(MBB, StateIdx) = Worklist[WorkIdx++];
    State = States[StateIdx];
    I = MBB->instr_rbegin();
  }

  return false;
}
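// A minimal sketch of a hasHazard() client, shaped like the fix* routines
// later in this file (ExampleState is illustrative, not from the source).
// StateT must expose getHashValue/isEqual so equivalent search states can be
// deduplicated through the StateMap:
//
//   struct ExampleState {
//     int VALUs = 0;
//     static unsigned getHashValue(const ExampleState &S) {
//       return hash_combine(S.VALUs);
//     }
//     static bool isEqual(const ExampleState &L, const ExampleState &R) {
//       return L.VALUs == R.VALUs;
//     }
//   };
//
//   bool Found = hasHazard<ExampleState>(
//       ExampleState{}, IsHazardFn, UpdateStateFn, MI->getParent(),
//       std::next(MI->getReverseIterator()));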

// Returns a minimum wait states since \p I walking all predecessors.
// Only counts wait states inside the basic blocks that are actually visited.
static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineBasicBlock *MBB,
                   MachineBasicBlock::const_reverse_instr_iterator I,
                   int WaitStates, IsExpiredFn IsExpired,
                   DenseSet<const MachineBasicBlock *> &Visited,
                   GetNumWaitStatesFn GetNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (const MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                   const MachineInstr *MI, IsExpiredFn IsExpired,
                   GetNumWaitStatesFn GetNumWaitStates =
                       SIInstrInfo::getNumWaitStates) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()), 0, IsExpired,
                            Visited, GetNumWaitStates);
}
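// Typical use of the free getWaitStatesSince() above, mirroring the fix*
// routines later in this file (the 4-cycle window is hypothetical):
//
//   auto IsHazardFn = [](const MachineInstr &I) {
//     return SIInstrInfo::isVALU(I);
//   };
//   auto IsExpiredFn = [](const MachineInstr &, int WaitStates) {
//     return WaitStates >= 4;
//   };
//   bool HazardInWindow =
//       ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) !=
//       std::numeric_limits<int>::max();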

int GCNHazardRecognizer::getWaitStatesSince(
    IsHazardFn IsHazard, int Limit, GetNumWaitStatesFn GetNumWaitStates) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn,
                                GetNumWaitStates);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    WaitStates += MI ? GetNumWaitStates(*MI) : 1;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  return getWaitStatesSince(IsHazard, Limit, SIInstrInfo::getNumWaitStates);
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnit Unit : TRI.regunits(Reg))
    BV.set(static_cast<unsigned>(Unit));
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &DefSet, BitVector &UseSet) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the
  // clause writes to a register that is read by another instruction in the
  // clause (including itself). If we encounter this situation, we need to
  // break the clause by inserting a non SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they modify the same registers, so a store in the clause is a hazard
  // straight away.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}
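// Illustrative soft-clause hazard (the assembly is schematic, not from the
// source): with XNACK enabled,
//   s_load_dwordx2 s[0:1], s[4:5], 0x0
//   s_load_dword   s2, s[0:1], 0x0    ; address uses s[0:1], defined in-clause
// the second load reads a register written earlier in the same clause, so
// ClauseDefs and ClauseUses intersect and checkSoftClauseHazards() returns 1.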

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably
    // because the only case when this happens is when we expand a 64-bit
    // pointer into a full descriptor and use s_buffer_load_dword instead of
    // s_load_dword.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU Instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = TII->getOpRegClassID(Desc.operands()[VDataIdx]);

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;

    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes. All our MIMG definitions use a
  // 256-bit T#, so we can skip checking for them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(TII->getOpRegClassID(
               Desc.operands()[SRsrcIdx])) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    // There is no hazard if the instruction does not use vector regs.
    if (VDataIdx == -1)
      return -1;

    if (AMDGPU::getRegBitWidth(VDataRCID) > 64)
      return VDataIdx;
  }

  return -1;
}
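// Interpretation of the return value above: -1 means "this instruction cannot
// create the hazard"; any other value is the operand index of the vdata
// operand, which checkVALUHazardsHelper() below compares against the VALU def
// via TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg).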

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };

  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
/// pack the computed value into the correct bit position of the dest register.
/// Returns the vdst operand when \p MI forwards its dest, otherwise nullptr.
static const MachineOperand *
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
  if (!SIInstrInfo::isVALU(MI))
    return nullptr;

  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned Opcode = MI.getOpcode();

  // There are three kinds of instructions which produce a forwarded dest:
  // 1. SDWA with dst_sel != DWORD, 2. VOP3 which write the hi bits
  // (e.g. op_sel[3] == 1), and 3. FP8/FP4 cvt-scale conversions with
  // op_sel[3:2] != 0.
  if (SIInstrInfo::isSDWA(MI)) {
    // Type 1: SDWA with dst_sel != DWORD.
    if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode);
  if (AMDGPU::isFP8DstSelInst(Opcode)) {
    // Type 2: VOP3 which write the hi bits.
    if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) &
        SISrcMods::DST_OP_SEL)
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Type 3: FP8DstSelInst with op_sel[3:2] != 0.
    if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 &&
        (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) &
         SISrcMods::OP_SEL_0))
      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  }

  // Special case: a nop is required for all the op_sel values of the fp4
  // sr-variant cvt scale instructions.
  if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4)
    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

  return nullptr;
}

/// Checks whether the provided \p VALU "consumes" the operand with a dest sel
/// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
/// RAW, or if there is a partial update of the Dst via dst_sel/op_sel.
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
                                            const MachineOperand *Dst,
                                            const SIRegisterInfo *TRI) {
  // We must consider implicit reads of the VALU. SDWA with dst_sel and
  // UNUSED_PRESERVE will implicitly read the result from the forwarded dest,
  // and we must account for that hazard.
  // We also must account for WAW hazards. In particular, WAW with dest
  // preserve semantics (e.g. VOP3 with op_sel, VOP2 with
  // dst_sel(UNUSED_PRESERVE)) will read the forwarded dest for parity
  // check for ECC. Without accounting for this hazard, the ECC will be
  // wrong.
  // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
  // complete zeroing of the forwarded data).
  for (auto &Operand : VALU->operands()) {
    if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
      return true;
    }
  }
  return false;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard()) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const MachineOperand *ForwardedDst =
          getDstSelForwardingOperand(ProducerMI, ST);
      if (ForwardedDst) {
        return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
      }

      if (ProducerMI.isInlineAsm()) {
        // Assume inline asm has dst forwarding hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data over written by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

1157

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // Inline asm can write vector registers just like a VALU instruction, so
  // the VALU write hazards have to be checked against each of its vector
  // register defs as well.
  if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
      !ST.hasCvtScaleForwardingHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Op :
       llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
    if (Op.isReg() && Op.isDef()) {
      if (!TRI.isVectorRegister(MRI, Op.getReg()))
        continue;

      if (ST.has12DWordStoreHazard()) {
        WaitStatesNeeded =
            std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
      }
    }
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
      const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
      // Assume inline asm reads the dst.
      if (Dst)
        return IA->modifiesRegister(Dst->getReg(), &TRI) ||
               IA->readsRegister(Dst->getReg(), &TRI);

      if (ProducerMI.isInlineAsm()) {
        // If at least one of the defs of the producing inline asm is read or
        // written by this inline asm, treat it as a hazard.
        for (auto &Def : ProducerMI.all_defs()) {
          if (IA->modifiesRegister(Def.getReg(), &TRI) ||
              IA->readsRegister(Def.getReg(), &TRI)) {
            return true;
          }
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

// Insert \p WaitStatesNeeded V_NOPs before \p MI. Returns true if any were
// emitted.
bool GCNHazardRecognizer::emitVNops(MachineInstr *MI, int WaitStatesNeeded) {
  if (WaitStatesNeeded <= 0)
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  for (int I = 0; I < WaitStatesNeeded; ++I)
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(AMDGPU::V_NOP_e32));

  return true;
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixVALUTransCoexecutionHazards(MI);
  fixWMMAHazards(MI);
  emitVNops(MI, checkWMMACoexecutionHazards(MI));
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
  fixRequiredExportPriority(MI);
  if (ST.requiresWaitIdleBeforeGetReg())
    fixGetRegWaitIdle(MI);
  if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug())
    fixDsAtomicAsyncBarrierArriveB64(MI);
  if (ST.hasScratchBaseForwardingHazard())
    fixScratchBaseForwardingHazard(MI);
  if (ST.setRegModeNeedsVNOPs())
    fixSetRegMode(MI);
}

static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
  return (TII.isVOPC(MI) ||
          (MI.isCompare() && (TII.isVOP3(MI) || TII.isSDWA(MI)))) &&
         MI.modifiesRegister(AMDGPU::EXEC, &TRI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return isVCmpXWritesExec(*TII, *TRI, MI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v? to fix the hazard; the register must actually be
  // written so a VALU really executes.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isDS(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isSALU(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), TRI, false);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  AMDGPU::OpName SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        // DsCnt corresponds to LGKMCnt here.
        return (Decoded.DsCnt == 0);
      }
      default:
        assert((/* wait-count opcodes elided in the original listing */
                MI.getOpcode() == AMDGPU::S_WAIT_IDLE) &&
               "unexpected wait count instruction");

        if (TII->isSOPP(MI))
          return false;

        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and
  // VMEM instructions must be present in the function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());
  assert(!ST.hasExtendedWaitCounts());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}
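// Shape of the hazard handled above (schematic, not from the source):
//   ds_write_b32 ...         ; LDS access (InstType 1)
//   s_cbranch_scc1 ...       ; branch
//   buffer_store_dword ...   ; VMEM access (InstType 2)
// A WAR conflict between LDS and VMEM accesses across a branch is broken by
// the inserted "s_waitcnt_vscnt null, 0x0".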

bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const int NoHazardWaitStates = 15;
  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  bool VisitedTrans = false;
  auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
    if (!SIInstrInfo::isVALU(I))
      return false;
    VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
    // Cover both WAR and WAW.
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
    if (WaitStates >= NoHazardWaitStates)
      return true;
    // Instructions which cause va_vdst==0 expire hazard.
    return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
           SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
  };
  auto GetWaitStatesFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) ? 1 : 0;
  };

  DenseSet<const MachineBasicBlock *> Visited;
  auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
                                    std::next(MI->getReverseIterator()), 0,
                                    IsExpiredFn, Visited, GetWaitStatesFn);

  // Transcendentals can execute in parallel to other VALUs.
  // This makes va_vdst count unusable with a mixture of VALU and TRANS.
  if (VisitedTrans)
    Count = 0;

  MachineOperand *WaitVdstOp =
      TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
  WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));

  return true;
}

bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
  if (!SIInstrInfo::isLDSDIR(*MI))
    return false;

  const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
  const Register VDSTReg = VDST->getReg();

  auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I))
      return false;
    return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
  };
  bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
  // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
  // according to the type of VMEM instruction.
  auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
           (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0) ||
           (LdsdirCanWait && SIInstrInfo::isLDSDIR(I) &&
            !TII.getNamedOperand(I, AMDGPU::OpName::waitvsrc)->getImm());
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  if (LdsdirCanWait) {
    TII.getNamedOperand(*MI, AMDGPU::OpName::waitvsrc)->setImm(0);
  } else {
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII.get(AMDGPU::S_WAITCNT_DEPCTR))
        .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
  }

  return true;
}

bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
  if (!ST.hasVALUPartialForwardingHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
    return false;

  SmallSetVector<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Only applies with >= 2 unique VGPR sources.
  if (SrcVGPRs.size() <= 1)
    return false;

  // Look for the following pattern:
  //   Va <- VALU [PreExecPos]
  //   intv1
  //   Exec <- SALU [ExecPos]
  //   intv2
  //   Vb <- VALU [PostExecPos]
  //   intv3
  //   MI Va, Vb (WaitState = 0)
  //
  // Where:
  //   intv1 + intv2 <= 2 VALUs
  //   intv3 <= 4 VALUs
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int Intv1plus2MaxVALUs = 2;
  const int Intv3MaxVALUs = 4;
  const int IntvMaxVALUs = 6;
  const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;

  struct StateType {
    SmallDenseMap<Register, int, 4> DefPos;
    int ExecPos = std::numeric_limits<int>::max();
    int VALUs = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.ExecPos, State.VALUs,
                          hash_combine_range(State.DefPos.begin(),
                                             State.DefPos.end()));
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos &&
             LHS.VALUs == RHS.VALUs;
    }
  };

  StateType State;

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    if (State.VALUs > NoHazardVALUWaitStates)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
      return HazardExpired;

    // Track register writes.
    bool Changed = false;
    if (SIInstrInfo::isVALU(I)) {
      for (Register Src : SrcVGPRs) {
        if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
          State.DefPos[Src] = State.VALUs;
          Changed = true;
        }
      }
      if (State.ExecPos == std::numeric_limits<int>::max()) {
        if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
          State.ExecPos = State.VALUs;
          Changed = true;
        }
      }
    }

    // Early expiration: too many VALUs in intv3.
    if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
      return HazardExpired;

    // Only evaluate the state if something changed.
    if (!Changed)
      return NoHazardFound;

    // Determine positions of VALUs pre/post exec change.
    if (State.ExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    int PreExecPos = std::numeric_limits<int>::max();
    int PostExecPos = std::numeric_limits<int>::max();

    for (auto Entry : State.DefPos) {
      int DefVALUs = Entry.second;
      if (DefVALUs != std::numeric_limits<int>::max()) {
        if (DefVALUs >= State.ExecPos)
          PreExecPos = std::min(PreExecPos, DefVALUs);
        else
          PostExecPos = std::min(PostExecPos, DefVALUs);
      }
    }

    // Need a VALU after the exec change.
    if (PostExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv3?
    int Intv3VALUs = PostExecPos;
    if (Intv3VALUs > Intv3MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv2?
    int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
    if (Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Need a VALU before the exec change.
    if (PreExecPos == std::numeric_limits<int>::max())
      return NoHazardFound;

    // Too many VALUs in intv1?
    int Intv1VALUs = PreExecPos - State.ExecPos;
    if (Intv1VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    // Too many VALUs in intv1 + intv2?
    if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
      return HazardExpired;

    return HazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
  if (!ST.hasVALUTransUseHazard())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  SmallSet<Register, 4> SrcVGPRs;

  for (const MachineOperand &Use : MI->explicit_uses()) {
    if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
      SrcVGPRs.insert(Use.getReg());
  }

  // Look for the following pattern:
  //   Va <- TRANS VALU
  //   intv
  //   MI Va (WaitState = 0)
  //
  // Where:
  //   intv <= 5 VALUs / 1 TRANS
  //
  // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.

  const int IntvMaxVALUs = 5;
  const int IntvMaxTRANS = 1;

  struct StateType {
    int VALUs = 0;
    int TRANS = 0;

    static unsigned getHashValue(const StateType &State) {
      return hash_combine(State.VALUs, State.TRANS);
    }
    static bool isEqual(const StateType &LHS, const StateType &RHS) {
      return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS;
    }
  };

  StateType State;

  auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
    if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
      return HazardExpired;

    // Instructions which cause va_vdst==0 expire hazard.
    if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
        SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
        (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
         I.getOperand(0).getImm() == 0x0fff))
      return HazardExpired;

    // Track registers writes.
    if (SIInstrInfo::isTRANS(I)) {
      for (Register Src : SrcVGPRs) {
        if (I.modifiesRegister(Src, &TRI)) {
          return HazardFound;
        }
      }
    }

    return NoHazardFound;
  };
  auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
    if (SIInstrInfo::isVALU(MI))
      State.VALUs += 1;
    if (SIInstrInfo::isTRANS(MI))
      State.TRANS += 1;
  };

  if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
                            std::next(MI->getReverseIterator())))
    return false;

  // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
  // avoided.
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII.get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));

  return true;
}

bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
  if (!AMDGPU::isGFX1250(ST) ||
      !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) {
    if (!SIInstrInfo::isTRANS(I))
      return false;

    // TRANS dest is forwarded to a VALU source (RAW).
    Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;

    // The VALU dest overlaps a TRANS source (WAR).
    Register ValuDef = ValuDst->getReg();
    for (const MachineOperand &TransUse : I.explicit_uses()) {
      if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg()))
        return true;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  const int HasVALU = std::numeric_limits<int>::max();
  if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU)
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
  return true;
}

bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
  if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) {
    if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I))
      return false;

    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction
    // overlaps with the dest(matrix D) of the previous wmma.
    const Register CurSrc0Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    const Register CurSrc1Reg =
        TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    const Register PrevDstReg =
        TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();

    if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
        TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
      return true;
    }

    // GFX12+ allows the matrix C input to overlap the previous dest (the
    // hardware stalls), but the SWMMAC index input must not overlap it.
    if (AMDGPU::isGFX12Plus(ST)) {
      if (SIInstrInfo::isSWMMAC(*MI)) {
        const Register CurIndex =
            TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
        if (TRI->regsOverlap(PrevDstReg, CurIndex))
          return true;
      }
      return false;
    }

    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &I, int) {
    return SIInstrInfo::isVALU(I);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));

  return true;
}

// Classify an XDL WMMA producer by its latency; the case bodies below are
// partially reconstructed (the per-category opcode lists were elided in the
// original listing).
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
                                       const SIInstrInfo *TII, unsigned Latency,
                                       unsigned Category) {
  assert(TII->isXDLWMMA(MI) && (Latency == 8 || Latency == 16) &&
         "Handle me if the xdl wmma instruction latency changes");

  switch (Category) {
  case 0: // Dense WMMA instructions with 8-pass latency.
    return Latency == 8 && SIInstrInfo::isWMMA(MI);

  case 1: // Dense WMMA instructions with 16-pass latency.
    return Latency == 16 && SIInstrInfo::isWMMA(MI);

  case 2: // SWMMAC instructions with 8-pass latency.
    return Latency == 8 && SIInstrInfo::isSWMMAC(MI);

  case 3: // SWMMAC instructions with 16-pass latency.
    return Latency == 16 && SIInstrInfo::isSWMMAC(MI);

  default:
    break;
  }

  return false;
}

int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
  if (!AMDGPU::isGFX1250(ST))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (!TII->isXDLWMMA(*MI) && !SIInstrInfo::isVALU(*MI))
    return 0;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Wait states between an XDL WMMA producer and a following WMMA or VALU
  // consumer, indexed by the producer's latency category (see
  // IsWMMAHazardInstInCategory above).
  const int WMMAWaitStates[] = {5, 9, 3, 5};
  const int VALUWaitStates[] = {4, 8, 2, 4};
  unsigned Category = 0;

  auto IsWMMAHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    Register A1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
    Register B1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();

    // WMMA0 writes (D0), WMMA1 reads (A1/B1/Idx1).
    if (TRI->regsOverlap(D0, A1) || TRI->regsOverlap(D0, B1))
      return true;

    if (SIInstrInfo::isSWMMAC(*MI)) {
      Register Idx1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D0, Idx1))
        return true;
    }

    return false;
  };

  auto IsVALUHazardFn = [MI, TII, TRI, &Category, this](const MachineInstr &I) {
    if (!TII->isXDLWMMA(I))
      return false;

    unsigned Latency = TSchedModel.computeInstrLatency(&I);
    if (!IsWMMAHazardInstInCategory(I, TII, Latency, Category))
      return false;

    // WMMA writes, VALU reads.
    Register D0 = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
    for (const MachineOperand &ValuUse : MI->explicit_uses()) {
      if (ValuUse.isReg() && TRI->regsOverlap(D0, ValuUse.getReg()))
        return true;
    }

    auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst);
    if (!ValuDst || !ValuDst->isReg())
      return false;
    Register D1 = ValuDst->getReg();

    // WMMA writes, VALU writes.
    if (TRI->regsOverlap(D0, D1))
      return true;

    // WMMA reads, VALU writes.
    Register A0 = TII->getNamedOperand(I, AMDGPU::OpName::src0)->getReg();
    Register B0 = TII->getNamedOperand(I, AMDGPU::OpName::src1)->getReg();
    if (TRI->regsOverlap(A0, D1) || TRI->regsOverlap(B0, D1))
      return true;

    if (SIInstrInfo::isSWMMAC(I)) {
      Register Idx0 = TII->getNamedOperand(I, AMDGPU::OpName::src2)->getReg();
      if (TRI->regsOverlap(D1, Idx0))
        return true;
    }

    return false;
  };

  int Limit = 0;

  auto GetWaitStatesFn = [](const MachineInstr &I) {
    return SIInstrInfo::isVALU(I) ? 1 : 0;
  };

  int WaitStatesNeeded = -1;
  if (TII->isXDLWMMA(*MI)) {
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = WMMAWaitStates[Category];

      // WaitStatesNeeded stays negative while no hazard of this category is
      // found within its window.
      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsWMMAHazardFn, Limit, GetWaitStatesFn);
    }
  } else { // VALU consumer.
    for (Category = 0; WaitStatesNeeded < 0 && Category < 4; Category++) {
      Limit = VALUWaitStates[Category];

      WaitStatesNeeded =
          Limit - getWaitStatesSince(IsVALUHazardFn, Limit, GetWaitStatesFn);
    }
  }

  return WaitStatesNeeded;
}

bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
  if (!ST.hasShift64HighRegBug())
    return false;
  assert(!ST.hasExtendedWaitCounts());

  switch (MI->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
    break;
  }

  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
  if (!Amt->isReg())
    return false;

  Register AmtReg = Amt->getReg();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  // The bug only applies when the amount register is the last VGPR of an
  // allocation block (register number 7 mod 8).
  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
    return false;

  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
    return false;

  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
  bool Overlapped = OverlappedSrc || OverlappedDst;

  assert(!OverlappedDst || !OverlappedSrc ||
         Src1->getReg() == MI->getOperand(0).getReg());
  assert(ST.needsAlignedVGPRs());
  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);

  Register NewReg;
  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
                                   : AMDGPU::VGPR_32RegClass) {
    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
      NewReg = Reg;
      break;
    }
  }

  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
                               : NewReg;
  Register NewAmtLo;

  if (Overlapped)
    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);

  DebugLoc DL = MI->getDebugLoc();
  MachineBasicBlock *MBB = MI->getParent();

  // Insert V_SWAP_B32 instruction(s) before and after.
  if (Overlapped)
    runOnInstruction(
        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
            .addDef(AmtReg - 1)
            .addReg(AmtReg - 1, RegState::Undef)
            .addReg(NewAmtLo, RegState::Undef));
  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                       .addDef(AmtReg)
                       .addReg(AmtReg, RegState::Undef)
                       .addReg(NewAmt, RegState::Undef));

  // Instructions emitted after the current instruction will be processed by
  // the parent loop of the hazard recognizer in a natural way.
  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
          AmtReg)
      .addDef(NewAmt)
      .addReg(NewAmt)
      .addReg(AmtReg);
  if (Overlapped)
    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
            AmtReg - 1)
        .addDef(NewAmtLo)
        .addReg(NewAmtLo)
        .addReg(AmtReg - 1);

  // Re-running the hazard recognizer on the modified instruction is not
  // necessary: the inserted V_SWAP_B32 has already both read and written the
  // new registers, so hazards related to these registers have been handled.
  Amt->setReg(NewAmt);
  Amt->setIsKill(false);
  // We do not update liveness, so the verifier may see it as undef.
  Amt->setIsUndef();
  if (OverlappedDst)
    MI->getOperand(0).setReg(NewReg);
  if (OverlappedSrc) {
    Src1->setReg(NewReg);
    Src1->setIsKill(false);
    Src1->setIsUndef();
  }

  return true;
}
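// Illustration of the workaround above (registers are hypothetical): for
//   v_lshlrev_b64 v[4:5], v7, v[4:5]
// the shift amount sits in a VGPR whose number is 7 mod 8, which trips the
// bug. The amount is swapped into a safe register with v_swap_b32, the shift
// is rewritten to use that register, and a second v_swap_b32 after the
// instruction restores the original register assignment.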

int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (!ST.hasFPAtomicToDenormModeHazard())
    return 0;
  assert(!ST.hasExtendedWaitCounts());

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      return false;
    }
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
  if (MFMAPaddingRatio == 0)
    return 0;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
    return 0;

  int NeighborMFMALatency = 0;
  auto IsNeighboringMFMA = [&NeighborMFMALatency,
                            this](const MachineInstr &MI) {
    if (!SIInstrInfo::isMFMA(MI))
      return false;

    NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
    return true;
  };

  const int MaxMFMAPipelineWaitStates = 16;
  int WaitStatesSinceNeighborMFMA =
      getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);

  int NeighborMFMAPaddingNeeded =
      (NeighborMFMALatency * MFMAPaddingRatio / 100) -
      WaitStatesSinceNeighborMFMA;

  return std::max(0, NeighborMFMAPaddingNeeded);
}
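// Worked example of the padding formula above: with MFMAPaddingRatio == 50
// and a neighboring MFMA latency of 16 cycles, the target gap is
// 16 * 50 / 100 = 8 wait states; if only 3 have elapsed since that MFMA,
// checkMFMAPadding() asks for 5 more (filled with s_nops).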

2384int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {

2385 int WaitStatesNeeded = 0;

2386 unsigned Opc = MI->getOpcode();

2387

2388 auto IsVALUFn = [](const MachineInstr &MI) {

2390 };

2391

2392 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) {

2393 const int LegacyVALUWritesVGPRWaitStates = 2;

2394 const int VALUWritesExecWaitStates = 4;

2395 const int MaxWaitStates = 4;

2396

2397 int WaitStatesNeededForUse = VALUWritesExecWaitStates -

2398 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);

2399 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2400

2401 if (WaitStatesNeeded < MaxWaitStates) {

2402 for (const MachineOperand &Use : MI->explicit_uses()) {

2403 const int MaxWaitStates = 2;

2404

2405 if (Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))

2406 continue;

2407

2408 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -

2409 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);

2410 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2411

2412 if (WaitStatesNeeded == MaxWaitStates)

2413 break;

2414 }

2415 }

2416 }

2417

2418 for (const MachineOperand &Op : MI->explicit_operands()) {

2419 if (Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))

2420 continue;

2421

2422 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

2423 continue;

2424

2425 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;

2426 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;

2427 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;

2428 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;

2429 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;

2430 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;

2431 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;

2432 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;

2433 const int MaxWaitStates = 18;

2435 unsigned HazardDefLatency = 0;

2436

2437 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,

2438 this](const MachineInstr &MI) {

2440 return false;

2441 Register DstReg = MI.getOperand(0).getReg();

2442 if (DstReg == Reg)

2443 return false;

2444 HazardDefLatency =

2445 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));

2446 return TRI.regsOverlap(DstReg, Reg);

2447 };

2448

2449 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,

2450 MaxWaitStates);

2451 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;

2452 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

2453 int OpNo = Op.getOperandNo();

2454 if (OpNo == SrcCIdx) {

2455 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;

2456 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {

2457 switch (HazardDefLatency) {

2458 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;

2459 break;

2460 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;

2461 break;

2462 case 16: [[fallthrough]];

2463 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;

2464 break;

2465 }

2466 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {

2467 switch (HazardDefLatency) {

2468 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;

2469 break;

2470 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;

2471 break;

2472 case 16: [[fallthrough]];

2473 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;

2474 break;

2475 }

2476 }

2477

2478 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;

2479 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2480

2481 if (WaitStatesNeeded == MaxWaitStates)

2482 return WaitStatesNeeded;

2483

2484 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {

2485 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

2486 return false;

2487 Register DstReg = MI.getOperand(0).getReg();

2488 return TRI.regsOverlap(Reg, DstReg);

2489 };

2490

2491 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;

2492 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;

2493 const int AccVGPRWriteAccVgprReadWaitStates = 3;

2494 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;

2495 if (OpNo == SrcCIdx)

2496 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;

2497 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)

2498 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

2499

2500 WaitStatesNeededForUse = NeedWaitStates -

2501 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);

2502 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2503

2504 if (WaitStatesNeeded == MaxWaitStates)

2505 return WaitStatesNeeded;

2506 }

2507

2508 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {

2509 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;

2510 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;

2511 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;

2512 const int MaxWaitStates = 13;

2513 Register DstReg = MI->getOperand(0).getReg();

2514 unsigned HazardDefLatency = 0;

2515

2516 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,

2517 this](const MachineInstr &MI) {

2518 if (!SIInstrInfo::isMFMA(MI))
2519 return false;

2520 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();

2521 HazardDefLatency =

2522 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));

2523 return TRI.regsOverlap(Reg, DstReg);

2524 };

2525

2526 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);

2527 int NeedWaitStates;

2528 switch (HazardDefLatency) {

2529 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;

2530 break;

2531 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;

2532 break;

2533 case 16: [[fallthrough]];

2534 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;

2535 break;

2536 }

2537

2538 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;

2539 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2540 }

2541

2542

2543 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

2544

2545 return WaitStatesNeeded;

2546}

2547

2548static int

2549 GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(int NumPasses,
2550 bool IsGFX950) {
2551 //  2 pass: 3 (gfx940) / 4 (gfx950)
2552 //  4 pass: 5 / 6
2553 //  8 pass: 9 / 10
2554 // 16 pass: 17 / 18
2555

2556 return NumPasses + 1 + IsGFX950;

2557}

2558

2559static int

2560 GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(int NumPasses,
2561 bool IsGFX950) {
2562 //  2 pass: 3
2563 //  4 pass: 5 (gfx940) / 6 (gfx950)
2564 //  8 pass: 9 / 10
2565 // 16 pass: 17 / 18
2566

2567 return NumPasses + 1 + (NumPasses != 2 && IsGFX950);

2568}

2569

2570static int

2571 GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(int NumPasses) {
2572 //  2 pass: 2
2573 //  4 pass: 4
2574 //  8 pass: 8
2575 // 16 pass: 16

2576 return NumPasses;

2577}

2578

2579 static int
2580 GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) {
2581 //  2 pass: 4
2582 //  4 pass: 6
2583 //  8 pass: 10
2584 // 16 pass: 18

2585 return NumPasses + 2;

2586}

2587

2588 static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses,
2589 bool IsGFX950) {
2590 //  2 pass: 5
2591 //  4 pass: 7 (gfx940) / 8 (gfx950)
2592 //  8 pass: 11 / 12
2593 // 16 pass: 19 / 20
2594

2595 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

2596}

2597

2598int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {

2599 int WaitStatesNeeded = 0;

2600 unsigned Opc = MI->getOpcode();

2601

2602 auto IsLegacyVALUFn = [](const MachineInstr &MI) {

2603 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2604 };

2605

2606 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {

2607 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2608 !SIInstrInfo::isDOT(MI);
2609 };

2610

2611 if (!SIInstrInfo::isMFMA(*MI))
2612 return WaitStatesNeeded;

2613

2614 const int VALUWritesExecWaitStates = 4;

2615 int WaitStatesNeededForUse = VALUWritesExecWaitStates -

2616 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,

2617 VALUWritesExecWaitStates);

2618 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2619

2620 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

2621

2622

2623 for (const MachineOperand &Use : MI->explicit_uses()) {

2624 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;

2625 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;

2626 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;

2627 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;

2628 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;

2629 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;

2630 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;

2631 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;

2632 const int GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 17;

2633 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;

2634 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;

2635 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;

2636 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;

2637 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;

2638 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;

2639 const int GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 19;

2640 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;

2641 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;

2642 const int MaxWaitStates = 19;

2643

2644 if (!Use.isReg())

2645 continue;

2646 Register Reg = Use.getReg();
2647 bool FullReg;

2648 const MachineInstr *MI1;

2649

2650 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,

2651 this](const MachineInstr &MI) {

2652 if (!SIInstrInfo::isMFMA(MI))
2653 return false;

2654 Register DstReg = MI.getOperand(0).getReg();

2655 FullReg = (DstReg == Reg);

2656 MI1 = &MI;

2657 return TRI.regsOverlap(DstReg, Reg);

2658 };

2659

2660 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -

2661 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);

2662 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2663

2664 int NumWaitStates =

2665 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);

2666 if (NumWaitStates == std::numeric_limits<int>::max())

2667 continue;

2668

2669 int OpNo = Use.getOperandNo();

2670 unsigned Opc1 = MI1->getOpcode();

2671 int NeedWaitStates = 0;

2672 if (OpNo == SrcCIdx) {

2673 if (!SIInstrInfo::isDGEMM(Opc) &&
2674 (ST.hasGFX940Insts() || !SIInstrInfo::isDGEMM(Opc1))) {
2675 NeedWaitStates = 0;

2676 } else if (FullReg) {

2677 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||

2678 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&

2679 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||

2680 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))

2681 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;

2682 else if (ST.hasGFX940Insts() &&

2683 TSchedModel.computeInstrLatency(MI1) == 2)

2684 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;

2685 } else {

2686 switch (Opc1) {

2687 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:

2688 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:

2689 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:

2690 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:

2691 if (!TII.isXDL(*MI))

2692 NeedWaitStates =

2693 ST.hasGFX950Insts()

2694 ? GFX950_DMFMA16x16WritesVGPROverlappedSrcCWaitStates

2695 : DMFMA16x16WritesVGPROverlappedSrcCWaitStates;

2696 break;

2697 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:

2698 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:

2699 if (!TII.isXDL(*MI))

2700 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;

2701 break;

2702 default:

2703 int NumPasses = TSchedModel.computeInstrLatency(MI1);

2704 if (ST.hasGFX940Insts()) {

2705 if (TII.isXDL(*MI) && !TII.isXDL(*MI1))

2706 break;

2707

2708 NeedWaitStates =

2709 TII.isXDL(*MI1)

2710 ? (TII.isXDL(*MI)
2711 ? GFX940_XDL_N_PassWritesVGPROverlappedXDLOrSMFMASrcCWaitStates(
2712 NumPasses, ST.hasGFX950Insts())
2713 : GFX940_XDL_N_PassWritesVGPROverlappedSGEMMDGEMMSrcCWaitStates(
2714 NumPasses, ST.hasGFX950Insts()))
2715 : GFX940_SMFMA_N_PassWritesVGPROverlappedSMFMASrcCWaitStates(
2716 NumPasses);

2717 break;

2718 }

2719

2720 switch (NumPasses) {

2721 case 2:

2722 NeedWaitStates =

2723 SIInstrInfo::isDGEMM(Opc)
2724 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates

2725 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;

2726 break;

2727 case 8:

2728 NeedWaitStates =

2729 SIInstrInfo::isDGEMM(Opc)
2730 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates

2731 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;

2732 break;

2733 case 16:

2734 NeedWaitStates =

2735 SIInstrInfo::isDGEMM(Opc)
2736 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates

2737 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;

2738 break;

2739 default:

2740 llvm_unreachable("unexpected number of passes");
2741 }

2742 }

2743 }

2744 } else {

2745 switch (Opc1) {

2746 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:

2747 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:

2748 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:

2749 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:

2750 NeedWaitStates =

2751 ST.hasGFX950Insts()

2752 ? GFX950_DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates

2753 : DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;

2754 break;

2755 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:

2756 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:

2757 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;

2758 break;

2759 default:

2760 int NumPasses = TSchedModel.computeInstrLatency(MI1);

2761

2762 if (ST.hasGFX940Insts()) {

2763 NeedWaitStates =

2764 TII.isXDL(*MI1)

2765 ? GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(
2766 NumPasses, ST.hasGFX950Insts())
2767 : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(
2768 NumPasses);

2769 break;

2770 }

2771

2772 switch (NumPasses) {

2773 case 2:

2774 NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;

2775 break;

2776 case 4:

2777 llvm_unreachable("unexpected number of passes");
2778 case 8:

2779 NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;

2780 break;

2781 case 16:

2782 default:

2783 NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;

2784 }

2785 }

2786 }

2787 if (WaitStatesNeeded >= NeedWaitStates)

2788 continue;

2789

2790 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;

2791 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2792

2793 if (WaitStatesNeeded == MaxWaitStates)

2794 break;

2795 }

2796

2797

2798 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));

2799

2800 return WaitStatesNeeded;

2801}

2802

2803int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {

2804

2805 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())

2806 return 0;

2807

2808 int WaitStatesNeeded = 0;

2809

2810 auto IsAccVgprReadFn = [](const MachineInstr &MI) {

2811 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;

2812 };

2813

2814 for (const MachineOperand &Op : MI->explicit_uses()) {

2815 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))

2816 continue;

2817

2818 Register Reg = Op.getReg();
2819

2820 const int AccVgprReadLdStWaitStates = 2;

2821 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;

2822 const int MaxWaitStates = 2;

2823

2824 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -

2825 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);

2826 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2827

2828 if (WaitStatesNeeded == MaxWaitStates)

2829 return WaitStatesNeeded;

2830

2831 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {

2832 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&

2833 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)

2834 return false;

2835 auto IsVALUFn = [](const MachineInstr &MI) {

2836 return SIInstrInfo::isVALU(MI);
2837 };

2838 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2839 std::numeric_limits<int>::max();

2840 };

2841

2842 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -

2843 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);

2844 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

2845 }

2846

2847 return WaitStatesNeeded;

2848}

2849

2850int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {

2851 assert(!ST.hasVcmpxPermlaneHazard() &&

2852 "this is a different vcmpx+permlane hazard");

2853 const SIRegisterInfo *TRI = ST.getRegisterInfo();

2854 const SIInstrInfo *TII = ST.getInstrInfo();

2855

2856 auto IsVCmpXWritesExecFn = [TII, TRI](const MachineInstr &MI) {

2857 return isVCmpXWritesExec(*TII, *TRI, MI);
2858 };

2859

2860 auto IsVALUFn = [](const MachineInstr &MI) {

2861 return SIInstrInfo::isVALU(MI);
2862 };

2863

2864 const int VCmpXWritesExecWaitStates = 4;

2865 const int VALUWritesVDstWaitStates = 2;

2866 int WaitStatesNeeded = 0;

2867

2868 for (const MachineOperand &Op : MI->explicit_uses()) {

2869 if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))

2870 continue;

2871 Register Reg = Op.getReg();
2872

2873 int WaitStatesSinceDef =

2874 VALUWritesVDstWaitStates -

2875 getWaitStatesSinceDef(Reg, IsVALUFn,

2876 VALUWritesVDstWaitStates);

2877 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);

2878 if (WaitStatesNeeded >= VALUWritesVDstWaitStates)

2879 break;

2880 }

2881

2882 int VCmpXHazardWaits =

2883 VCmpXWritesExecWaitStates -

2884 getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);

2885

2886 WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);

2887 return WaitStatesNeeded;

2888}

2889

2890 static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
2891 //  2 pass: 4
2892 //  8 pass: 10
2893 // 16 pass: 18
2894

2895 return NumPasses + 2;

2896}

2897

2898 static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses,
2899 bool IsGFX950) {
2900 //  2 pass: 5
2901 //  4 pass: 7 (gfx940) / 8 (gfx950)
2902 //  8 pass: 11 / 12
2903 // 16 pass: 19 / 20
2904

2905 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

2906}

2907

2908 static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2909 bool IsGFX950) {
2910 //  2 pass: 5
2911 //  4 pass: 7 (gfx940) / 8 (gfx950)
2912 //  8 pass: 11 / 12
2913 // 16 pass: 19 / 20
2914

2915 return NumPasses + 3 + (NumPasses != 2 && IsGFX950);

2916}

2917

2918 static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2919 //  2 pass: 4
2920 //  8 pass: 10
2921 // 16 pass: 18
2922

2923 return NumPasses + 2;

2924}

2925

2926int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {

2927 if (!ST.hasGFX90AInsts())

2928 return 0;

2929

2930 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {

2931 return SIInstrInfo::isDGEMM(MI.getOpcode());
2932 };

2933

2934

2935 if (SIInstrInfo::isMFMA(*MI))
2936 return 0;

2937

2938 const MachineRegisterInfo &MRI = MF.getRegInfo();

2939

2940 int WaitStatesNeeded = 0;

2941

2942 bool IsMem = SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isDS(*MI);
2943 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2944 bool IsVALU = SIInstrInfo::isVALU(*MI);
2945

2946 const MachineInstr *MFMA = nullptr;

2947 unsigned Reg;

2948 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {

2949 if (!SIInstrInfo::isMFMA(MI) ||
2950 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

2951 return false;

2952 MFMA = &MI;
2953 return true;

2954 };

2955

2956 const MachineInstr *DOT = nullptr;

2957 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {

2958 if (!SIInstrInfo::isDOT(MI) ||
2959 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))

2960 return false;

2961 DOT = &MI;
2962 return true;

2963 };

2964

2965 bool DGEMMAfterVALUWrite = false;

2966 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {

2967

2968 if (SIInstrInfo::isDGEMM(MI.getOpcode()))
2969 DGEMMAfterVALUWrite = true;

2970

2971

2972

2973 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)

2974 return false;

2975

2976 return true;

2977 };

2978

2979 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),

2980 AMDGPU::OpName::src2);

2981

2982 if (IsMemOrExport || IsVALU) {

2983 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;

2984 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;

2985 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;

2986 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;

2987 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;

2988 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;

2989 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;

2990 const int GFX950_DMFMA16x16WriteVgprVALUReadWaitStates = 19;

2991 const int DotWriteSameDotReadSrcAB = 3;

2992 const int DotWriteDifferentVALURead = 3;

2993 const int DMFMABetweenVALUWriteVMEMRead = 2;

2994 const int MaxWaitStates = 19;

2995

2996 for (const MachineOperand &Use : MI->explicit_uses()) {

2997 if (!Use.isReg())

2998 continue;

2999 Reg = Use.getReg();
3000

3001 DOT = nullptr;

3002 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

3003 MaxWaitStates);

3004 if (DOT) {

3005 int NeedWaitStates = 0;

3006 if (DOT->getOpcode() == MI->getOpcode()) {

3007 if (&Use - &MI->getOperand(0) != SrcCIdx)

3008 NeedWaitStates = DotWriteSameDotReadSrcAB;

3009 } else {

3010 NeedWaitStates = DotWriteDifferentVALURead;

3011 }

3012

3013 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;

3014 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3015 }

3016

3017

3018

3019

3020

3021 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {

3022 DGEMMAfterVALUWrite = false;

3023 if (TRI.isVectorRegister(MRI, Reg)) {

3024 int WaitStatesNeededForUse =

3025 DMFMABetweenVALUWriteVMEMRead -

3026 getWaitStatesSinceDef(Reg, IsDGEMMHazard,

3027 DMFMABetweenVALUWriteVMEMRead);

3028

3029 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3030 }

3031 }

3032

3033 MFMA = nullptr;

3034 WaitStatesSinceDef =

3035 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

3036 if (!MFMA)
3037 continue;

3038

3039 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);

3040 int NumPasses = HazardDefLatency;

3041 int NeedWaitStates = MaxWaitStates;

3042

3043 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3044 switch (HazardDefLatency) {

3045 case 4:

3046 NeedWaitStates = IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates

3047 : DMFMA4x4WriteVgprVALUReadWaitStates;

3048 break;

3049 case 8:

3050 case 16:

3051 NeedWaitStates =

3052 IsMemOrExport

3053 ? DMFMA16x16WriteVgprMemExpReadWaitStates

3054 : (ST.hasGFX950Insts()

3055 ? GFX950_DMFMA16x16WriteVgprVALUReadWaitStates

3056 : DMFMA16x16WriteVgprVALUReadWaitStates);

3057 break;

3058 default:

3059 llvm_unreachable("unexpected dgemm");
3060 }

3061 } else if (ST.hasGFX940Insts()) {

3062 NeedWaitStates =

3063 TII.isXDL(*MFMA)

3064 ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
3065 NumPasses, ST.hasGFX950Insts())
3066 : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
3067 NumPasses);

3068 } else {

3069 switch (HazardDefLatency) {

3070 case 2:

3071 NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;

3072 break;

3073 case 8:

3074 NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;

3075 break;

3076 case 16:

3077 NeedWaitStates = SMFMA32x32WriteVgprVALUMemExpReadWaitStates;

3078 break;

3079 default:

3080 llvm_unreachable("unexpected number of passes");
3081 }

3082 }

3083

3084 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;

3085 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3086

3087 if (WaitStatesNeeded == MaxWaitStates)

3088 break;

3089 }

3090 }

3091

3092 unsigned Opc = MI->getOpcode();

3093 const int DMFMAToFMA64WaitStates = 2;

3094 if ((Opc == AMDGPU::V_FMA_F64_e64 ||

3095 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||

3096 Opc == AMDGPU::V_FMAC_F64_dpp) &&

3097 WaitStatesNeeded < DMFMAToFMA64WaitStates) {

3098 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -

3099 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);

3100 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3101 }

3102

3103 if (!IsVALU && !IsMemOrExport)

3104 return WaitStatesNeeded;

3105

3106 for (const MachineOperand &Def : MI->defs()) {

3107 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;

3108 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;

3109 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;

3110 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;

3111 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;

3112 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;

3113 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;

3114 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;

3115 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;

3116 const int DotWriteDifferentVALUWrite = 3;

3117 const int MaxWaitStates = 19;

3118 const int MaxWarWaitStates = 15;

3119

3120 Reg = Def.getReg();
3121

3122 DOT = nullptr;

3123 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,

3124 MaxWaitStates);

3125 if (DOT && DOT->getOpcode() != MI->getOpcode())

3126 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -

3127 WaitStatesSinceDef);

3128

3129 MFMA = nullptr;

3130 WaitStatesSinceDef =

3131 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);

3132 if (MFMA) {
3133 int NeedWaitStates = MaxWaitStates;

3134 int NumPasses = TSchedModel.computeInstrLatency(MFMA);

3135

3136 if (SIInstrInfo::isDGEMM(MFMA->getOpcode())) {
3137 switch (NumPasses) {

3138 case 4:

3139 NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;

3140 break;

3141 case 8:

3142 case 16:

3143 NeedWaitStates = DMFMA16x16WriteVgprVALUWriteWaitStates;

3144 break;

3145 default:

3146 llvm_unreachable("unexpected dgemm");
3147 }

3148 } else if (ST.hasGFX940Insts()) {

3149 NeedWaitStates =

3150 TII.isXDL(*MFMA)

3151 ? GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(
3152 NumPasses, ST.hasGFX950Insts())
3153 : GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(NumPasses);
3154 } else {

3155 switch (NumPasses) {

3156 case 2:

3157 NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;

3158 break;

3159 case 8:

3160 NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;

3161 break;

3162 case 16:

3163 NeedWaitStates = SMFMA32x32WriteVgprVALUWawWaitStates;

3164 break;

3165 default:

3166 llvm_unreachable("unexpected number of passes");
3167 }

3168 }

3169

3170 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;

3171 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3172

3173 if (WaitStatesNeeded == MaxWaitStates)

3174 break;

3175 }

3176

3177 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {

3178 if (!SIInstrInfo::isMFMA(MI) || SIInstrInfo::isDGEMM(MI.getOpcode()) ||
3179 !MI.readsRegister(Reg, &TRI))

3180 return false;

3181

3182 if (ST.hasGFX940Insts() && !TII.isXDL(MI))

3183 return false;

3184

3185 const MachineOperand *SrcC =

3186 TII.getNamedOperand(MI, AMDGPU::OpName::src2);

3188 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))

3189 return false;

3190

3191 MFMA = &MI;
3192 return true;

3193 };

3194

3195 MFMA = nullptr;

3196 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,

3197 MaxWarWaitStates);

3198 if (!MFMA)
3199 continue;

3200

3201 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);

3202 int NeedWaitStates = MaxWaitStates;

3203 switch (HazardDefLatency) {

3204 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;

3205 break;

3206 case 4: assert(ST.hasGFX940Insts());

3207 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;

3208 break;

3209 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;

3210 break;

3211 case 16: [[fallthrough]];

3212 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;

3213 break;

3214 }

3215

3216 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;

3217 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

3218 }

3219

3220 return WaitStatesNeeded;

3221}

3222

3223 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
3224 if (!SU->isInstr())
3225 return false;

3226

3227 const MachineInstr *MAI = nullptr;
3228

3229 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
3230 MAI = nullptr;

3231 if (SIInstrInfo::isMFMA(MI))
3232 MAI = &MI;

3233 return MAI != nullptr;

3234 };

3235

3236 MachineInstr *MI = SU->getInstr();
3237 if (IsMFMAFn(*MI)) {

3238 int W = getWaitStatesSince(IsMFMAFn, 16);

3239 if (MAI)

3240 return W < (int)TSchedModel.computeInstrLatency(MAI);

3241 }

3242

3243 return false;

3244}

3245

3246

3247

3248 static void updateGetPCBundle(MachineInstr *NewMI) {
3249 if (!NewMI->isBundled())
3250 return;

3251

3252

3253 auto I = NewMI->getIterator();
3254 while (I->isBundledWithPred())

3255 I--;

3256 if (I->isBundle())

3257 I++;

3258

3259

3260 if (I->getOpcode() != AMDGPU::S_GETPC_B64)

3261 return;

3262

3263

3264 const unsigned NewBytes = 4;

3265 assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

3266 "Unexpected instruction insertion in bundle");

3267 auto NextMI = std::next(NewMI->getIterator());

3268 auto End = NewMI->getParent()->end();
3269 while (NextMI != End && NextMI->isBundledWithPred()) {

3270 for (auto &Operand : NextMI->operands()) {

3271 if (Operand.isGlobal())

3272 Operand.setOffset(Operand.getOffset() + NewBytes);

3273 }

3274 NextMI++;

3275 }

3276}

3277

3278bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {

3279 if (!ST.hasVALUMaskWriteHazard())

3280 return false;

3281 assert(!ST.hasExtendedWaitCounts());

3282

3283 if (!ST.isWave64())

3284 return false;

3285

3286 bool IsSALU = TII.isSALU(*MI);
3287 bool IsVALU = TII.isVALU(*MI);
3288 if (!IsSALU && !IsVALU)

3289 return false;

3290

3291

3292

3293

3294

3295

3296

3297

3298

3299

3300 const SIRegisterInfo *TRI = ST.getRegisterInfo();

3301 const MachineRegisterInfo &MRI = MF.getRegInfo();

3302

3303 auto IgnoreableSGPR = [](const Register Reg) {

3304 switch (Reg) {

3305 case AMDGPU::EXEC:

3306 case AMDGPU::EXEC_LO:

3307 case AMDGPU::EXEC_HI:

3308 case AMDGPU::M0:

3309 case AMDGPU::SGPR_NULL:

3310 case AMDGPU::SGPR_NULL64:

3311 case AMDGPU::SCC:

3312 return true;

3313 default:

3314 return false;

3315 }

3316 };

3317 auto IsVCC = [](const Register Reg) {
3318 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;

3319 };

3320

3321 struct StateType {

3322 SmallSet<Register, 2> HazardSGPRs;

3323

3324 static unsigned getHashValue(const StateType &State) {

3325 return hash_combine_range(State.HazardSGPRs.begin(), State.HazardSGPRs.end());
3326 }

3327 static bool isEqual(const StateType &LHS, const StateType &RHS) {

3328 return LHS.HazardSGPRs == RHS.HazardSGPRs;

3329 }

3330 };

3331

3332 SmallVector<const MachineInstr *> WaitInstrs;

3333 bool HasSGPRRead = false;

3334 StateType InitialState;

3335

3336

3337 MachineOperand *HazardDef = nullptr;

3338 for (MachineOperand &Op : MI->operands()) {

3339 if (!Op.isReg())

3340 continue;

3341 if (Op.isDef() && HazardDef)

3342 continue;

3343

3344 Register Reg = Op.getReg();
3345 if (IgnoreableSGPR(Reg))

3346 continue;

3347 if (!IsVCC(Reg)) {

3348 if (Op.isImplicit())

3349 continue;

3350 if (!TRI->isSGPRReg(MRI, Reg))

3351 continue;

3352 }

3353

3354 if (Op.isUse()) {

3355 HasSGPRRead = true;

3356 continue;

3357 }

3358

3359 assert(Op.isDef());
3360 HazardDef = &Op;

3361 }

3362

3363 if (!HazardDef)

3364 return false;

3365

3366

3367 Register HazardReg = HazardDef->getReg();
3368 if (AMDGPU::SReg_32RegClass.contains(HazardReg)) {

3369 InitialState.HazardSGPRs.insert(HazardReg);

3370 } else {

3371 assert(AMDGPU::SReg_64RegClass.contains(HazardReg));

3372 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub0));

3373 InitialState.HazardSGPRs.insert(TRI->getSubReg(HazardReg, AMDGPU::sub1));

3374 }

3375

3376 auto IsHazardFn = [&](StateType &State, const MachineInstr &I) {

3377 if (State.HazardSGPRs.empty())
3378 return HazardExpired;

3379

3380 switch (I.getOpcode()) {

3381 case AMDGPU::V_ADDC_U32_e32:

3382 case AMDGPU::V_ADDC_U32_dpp:

3383 case AMDGPU::V_CNDMASK_B16_t16_e32:

3384 case AMDGPU::V_CNDMASK_B16_fake16_e32:

3385 case AMDGPU::V_CNDMASK_B16_t16_dpp:

3386 case AMDGPU::V_CNDMASK_B16_fake16_dpp:

3387 case AMDGPU::V_CNDMASK_B32_e32:

3388 case AMDGPU::V_CNDMASK_B32_dpp:

3389 case AMDGPU::V_DIV_FMAS_F32_e64:

3390 case AMDGPU::V_DIV_FMAS_F64_e64:

3391 case AMDGPU::V_SUBB_U32_e32:

3392 case AMDGPU::V_SUBB_U32_dpp:

3393 case AMDGPU::V_SUBBREV_U32_e32:

3394 case AMDGPU::V_SUBBREV_U32_dpp: {

3395

3396 return IsVCC(HazardReg) ? HazardFound : NoHazardFound;
3397 }

3398 case AMDGPU::V_ADDC_U32_e64:

3399 case AMDGPU::V_ADDC_U32_e64_dpp:

3400 case AMDGPU::V_CNDMASK_B16_t16_e64:

3401 case AMDGPU::V_CNDMASK_B16_fake16_e64:

3402 case AMDGPU::V_CNDMASK_B16_t16_e64_dpp:

3403 case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:

3404 case AMDGPU::V_CNDMASK_B32_e64:

3405 case AMDGPU::V_CNDMASK_B32_e64_dpp:

3406 case AMDGPU::V_SUBB_U32_e64:

3407 case AMDGPU::V_SUBB_U32_e64_dpp:

3408 case AMDGPU::V_SUBBREV_U32_e64:

3409 case AMDGPU::V_SUBBREV_U32_e64_dpp: {

3410

3411 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);

3413 bool Result = TRI->regsOverlap(SSRCOp->getReg(), HazardReg);

3414 return Result ? HazardFound : NoHazardFound;
3415 }

3416 default:

3417 return NoHazardFound;
3418 }

3419 };

3420

3421 const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
3422 AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0),
3423 0),

3424 0);

3425 auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {

3426 switch (I.getOpcode()) {

3427 case AMDGPU::S_WAITCNT_DEPCTR:

3428

3429 if (!HasSGPRRead && I.getParent() == MI->getParent() && I.isBundled() &&

3430 (I.getOperand(0).getImm() & ConstantMaskBits) == ConstantMaskBits)

3431 WaitInstrs.push_back(&I);
3432 break;

3433 default:

3434

3435 for (auto &Op : I.operands()) {

3436 if (Op.isReg())

3437 continue;

3438

3439 Register Reg = Op.getReg();
3440 if (IgnoreableSGPR(Reg))

3441 continue;

3442 if (!IsVCC(Reg)) {

3443 if (Op.isImplicit())

3444 continue;

3445 if (!TRI->isSGPRReg(MRI, Reg))

3446 continue;

3447 }

3448 if (Op.isUse()) {

3449 HasSGPRRead = true;

3450 continue;

3451 }

3452

3453

3454

3455 SmallVector<Register, 2> ToClear;
3456 for (Register SGPR : State.HazardSGPRs) {
3457 if (Reg == SGPR || TRI->regsOverlap(Reg, SGPR))
3458 ToClear.push_back(SGPR);
3459 }
3460 for (Register SGPR : ToClear)
3461 State.HazardSGPRs.erase(SGPR);

3462 }

3463 break;

3464 }

3465 };

3466

3467

3468 if (!hasHazard<StateType>(InitialState, IsHazardFn, UpdateStateFn,
3469 MI->getParent(),
3470 std::next(MI->getReverseIterator())))

3471 return false;

3472

3473

3474 unsigned DepCtr =
3475 IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
3476 : AMDGPU::DepCtr::encodeFieldVaSdst(0))
3477 : AMDGPU::DepCtr::encodeFieldSaSdst(0);

3478

3479

3480 if (!WaitInstrs.empty()) {

3481

3482

3483

3484 SmallVector<MachineInstr *> ToErase;

3485 unsigned Found = 0;

3486 for (MachineBasicBlock::reverse_iterator It = MI->getReverseIterator(),

3487 End = MI->getParent()->rend();

3488 Found < WaitInstrs.size() && It != End; ++It) {

3489 MachineInstr *WaitMI = &*It;

3490

3491 if (std::as_const(WaitMI) != WaitInstrs[Found])

3492 continue;

3493 Found++;

3494 unsigned WaitMask = WaitMI->getOperand(0).getImm();

3495 assert((WaitMask & ConstantMaskBits) == ConstantMaskBits);

3496 DepCtr = AMDGPU::DepCtr::encodeFieldSaSdst(

3497 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(WaitMask),

3498 AMDGPU::DepCtr::decodeFieldSaSdst(DepCtr)));

3499 DepCtr = AMDGPU::DepCtr::encodeFieldVaSdst(

3500 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(WaitMask),

3501 AMDGPU::DepCtr::decodeFieldVaSdst(DepCtr)));

3502 DepCtr = AMDGPU::DepCtr::encodeFieldVaVcc(

3503 DepCtr, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(WaitMask),

3504 AMDGPU::DepCtr::decodeFieldVaVcc(DepCtr)));

3505 ToErase.push_back(WaitMI);

3506 }

3508 for (MachineInstr *WaitMI : ToErase)

3509 WaitMI->eraseFromParent();

3510 }

3511

3512

3513 auto NextMI = std::next(MI->getIterator());

3514 auto NewMI = BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),

3515 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
3516 .addImm(DepCtr);

3517

3518

3519 updateGetPCBundle(NewMI);
3520

3521 return true;

3522}

3523

3524 static bool ensureEntrySetPrio(MachineFunction *MF, int Priority,
3525 const SIInstrInfo &TII) {
3526 MachineBasicBlock &EntryMBB = MF->front();
3527 if (EntryMBB.begin() != EntryMBB.end()) {

3528 auto &EntryMI = *EntryMBB.begin();

3529 if (EntryMI.getOpcode() == AMDGPU::S_SETPRIO &&

3530 EntryMI.getOperand(0).getImm() >= Priority)

3531 return false;

3532 }

3533

3534 BuildMI(EntryMBB, EntryMBB.begin(), DebugLoc(), TII.get(AMDGPU::S_SETPRIO))
3535 .addImm(Priority);
3536 return true;

3537}

3538

3539bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {

3540 if (!ST.hasRequiredExportPriority())

3541 return false;

3542

3543

3544

3545 MachineBasicBlock *MBB = MI->getParent();

3546 MachineFunction *MF = MBB->getParent();
3547 auto CC = MF->getFunction().getCallingConv();
3548 switch (CC) {

3549 case CallingConv::AMDGPU_CS:
3550 case CallingConv::AMDGPU_CS_Chain:
3551 case CallingConv::AMDGPU_CS_ChainPreserve:
3552 case CallingConv::AMDGPU_KERNEL:
3553 return false;

3554 default:

3555 break;

3556 }

3557

3558 const int MaxPriority = 3;

3559 const int NormalPriority = 2;

3560 const int PostExportPriority = 0;

3561

3562 auto It = MI->getIterator();

3563 switch (MI->getOpcode()) {

3564 case AMDGPU::S_ENDPGM:

3565 case AMDGPU::S_ENDPGM_SAVED:

3566 case AMDGPU::S_ENDPGM_ORDERED_PS_DONE:

3567 case AMDGPU::SI_RETURN_TO_EPILOG:

3568

3569

3570 if (MF->getFrameInfo().hasCalls())

3571 return ensureEntrySetPrio(MF, NormalPriority, TII);
3572 return false;

3573 case AMDGPU::S_SETPRIO: {

3574

3575 auto &PrioOp = MI->getOperand(0);

3576 int Prio = PrioOp.getImm();

3577 bool InWA = (Prio == PostExportPriority) &&

3578 (It != MBB->begin() && TII.isEXP(*std::prev(It)));

3579 if (InWA || Prio >= NormalPriority)

3580 return false;

3581 PrioOp.setImm(std::min(Prio + NormalPriority, MaxPriority));

3582 return true;

3583 }

3584 default:

3585 if (!TII.isEXP(*MI))

3586 return false;

3587 break;

3588 }

3589

3590

3591

3592
3593 if (CC != CallingConv::AMDGPU_Gfx)
3594 ensureEntrySetPrio(MF, NormalPriority, TII);
3595

3596 auto NextMI = std::next(It);

3597 bool EndOfShader = false;

3598 if (NextMI != MBB->end()) {

3599

3600 if (TII.isEXP(*NextMI))

3601 return false;
3602

3603 if (NextMI->getOpcode() == AMDGPU::S_SETPRIO &&

3604 NextMI->getOperand(0).getImm() == PostExportPriority)

3605 return false;
3606 EndOfShader = NextMI->getOpcode() == AMDGPU::S_ENDPGM;

3607 }

3608

3609 const DebugLoc &DL = MI->getDebugLoc();
3610

3611

3612 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))

3613 .addImm(PostExportPriority);

3614

3615 if (!EndOfShader) {

3616

3617 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_WAITCNT_EXPCNT))

3618 .addReg(AMDGPU::SGPR_NULL)

3619 .addImm(0);
3620 }

3621

3622 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3623 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_NOP)).addImm(0);
3624

3625 if (!EndOfShader) {

3626

3627 BuildMI(*MBB, NextMI, DL, TII.get(AMDGPU::S_SETPRIO))

3628 .addImm(NormalPriority);

3629 }

3630

3631 return true;

3632}

3633

3634bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) {

3635 if (!isSGetReg(MI->getOpcode()))
3636 return false;

3637

3638 const SIInstrInfo *TII = ST.getInstrInfo();

3639 switch (getHWReg(TII, *MI)) {
3640 default:

3641 return false;

3646 break;

3647 }

3648

3649 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

3650 TII->get(AMDGPU::S_WAITCNT_DEPCTR))

3651 .addImm(0);
3652 return true;

3653}

3654

3655bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) {

3656 if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64)

3657 return false;

3658

3659 const SIInstrInfo *TII = ST.getInstrInfo();

3660 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

3661 TII->get(AMDGPU::S_WAITCNT_DEPCTR))

3663 BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(),

3664 TII->get(AMDGPU::S_WAITCNT_DEPCTR))

3666

3667 return true;

3668}

3669

3670bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {

3671

3672

3673 if (!IsHazardRecognizerMode)

3674 return false;

3675

3676 const SIRegisterInfo *TRI = ST.getRegisterInfo();

3677 const SIInstrInfo *TII = ST.getInstrInfo();

3678

3679 const int FlatScrBaseWaitStates = 10;

3680

3681 bool ReadsFlatScrLo =

3682 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI);

3683 bool ReadsFlatScrHi =

3684 MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI);

3685 if (isSGetReg(MI->getOpcode())) {
3686 switch (getHWReg(TII, *MI)) {
3687 default:

3688 break;

3689 case AMDGPU::Hwreg::ID_FLAT_SCR_LO:
3690 ReadsFlatScrLo = true;

3691 break;

3692 case AMDGPU::Hwreg::ID_FLAT_SCR_HI:
3693 ReadsFlatScrHi = true;

3694 break;

3695 }

3696 }

3697

3698 const MachineRegisterInfo &MRI = MF.getRegInfo();

3699

3700 auto IsRegDefHazard = [&](Register Reg) -> bool {

3701 DenseSet<const MachineBasicBlock *> Visited;

3702 auto IsHazardFn = [&](const MachineInstr &MI) {
3703 return MI.modifiesRegister(Reg, TRI);

3704 };

3705

3706

3707

3708 auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned {

3709 if (!TII->isSALU(MI) && !TII->isVALU(MI))

3710 return 0;

3711 for (const MachineOperand &MO : MI.all_defs()) {

3712 if (TRI->isSGPRReg(MRI, MO.getReg()))

3713 return 1;

3714 }

3715 return 0;

3716 };

3717

3718 auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) {

3719 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {

3720 unsigned Wait = MI.getOperand(0).getImm();

3721 if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 &&
3722 AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0)
3723 return true;

3724 }

3725 return SgprWrites >= FlatScrBaseWaitStates;

3726 };

3727

3728 return ::getWaitStatesSince(

3729 IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()),

3730 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates;

3731 };

3732

3733 if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) ||

3734 !IsRegDefHazard(AMDGPU::SGPR102)) &&

3735 (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) ||

3736 !IsRegDefHazard(AMDGPU::SGPR103)))

3737 return false;

3738

3739 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),

3740 TII->get(AMDGPU::S_WAITCNT_DEPCTR))

3741 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(
3742 AMDGPU::DepCtr::encodeFieldVaSdst(0), 0));
3743 return true;

3744}

3745

3746bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) {

3747 if (!isSSetReg(MI->getOpcode()) ||
3748 getHWReg(&TII, *MI) != AMDGPU::Hwreg::ID_MODE)
3749 return false;

3750

3751 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));

3752 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32));

3753 return true;

3754}
