LLVM: lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp Source File

//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include <bitset>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"


static cl::opt<bool>
    GlobalEnableSGPRHazardWaits("amdgpu-sgpr-hazard-wait", cl::init(true),
                                cl::Hidden,
                                cl::desc("Enable required s_wait_alu on SGPR hazards"));

static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
    "amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
    cl::desc("Cull hazards on function boundaries"));

static cl::opt<bool>
    GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
                                   cl::init(false), cl::Hidden,
                                   cl::desc("Cull hazards on memory waits"));

static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
    "amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
    cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
             "wait"));
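// Note: each of these flags can also be set per function through an IR
// attribute of the same name; run() below consults the attribute only when
// the corresponding command-line flag was not explicitly given.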


namespace {

class AMDGPUWaitSGPRHazards {
public:
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const MachineRegisterInfo *MRI;
  unsigned DsNopCount;

  bool EnableSGPRHazardWaits;
  bool CullSGPRHazardsOnFunctionBoundary;
  bool CullSGPRHazardsAtMemWait;
  unsigned CullSGPRHazardsMemWaitThreshold;

  AMDGPUWaitSGPRHazards() = default;


  // Return the hardware index (0-127) for an SGPR, or nothing for special
  // registers that do not participate in hazard tracking.
  static std::optional<unsigned> sgprNumber(Register Reg,
                                            const SIRegisterInfo &TRI) {
    switch (Reg) {
    case AMDGPU::M0:
    case AMDGPU::EXEC:
    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::SGPR_NULL:
    case AMDGPU::SGPR_NULL64:
      return {};
    default:
      break;
    }
    unsigned RegN = TRI.getHWRegIndex(Reg);
    if (RegN > 127)
      return {};
    return RegN;
  }


  static bool isVCC(Register Reg) {
    return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
  }
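sgprNumber maps architectural SGPRs to hardware indices 0-127, and the hazard tracking below works at aligned 64-bit pair granularity, so adjacent SGPRs share one Tracked slot. A minimal standalone sketch of the pair arithmetic (illustrative only; pairIndex is a hypothetical name, not part of this file):

#include <cassert>

// Mirror of the PairN computation in processOperand below.
static unsigned pairIndex(unsigned RegN) { return (RegN >> 1) & 0x3f; }

int main() {
  assert(pairIndex(6) == 3 && pairIndex(7) == 3); // s6 and s7 share slot 3
  assert(pairIndex(127) == 63);                   // highest tracked slot
  return 0;
}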


  // Adjust global offsets of PC-relative references inside an S_GETPC_B64
  // bundle after a new instruction has been inserted into it.
  static void updateGetPCBundle(MachineInstr *NewMI) {
    if (!NewMI->isBundled())
      return;

    // Find start of bundle.
    auto I = NewMI->getIterator();
    while (I->isBundledWithPred())
      I--;
    if (I->isBundle())
      I++;

    // Bail if this is not an S_GETPC bundle.
    if (I->getOpcode() != AMDGPU::S_GETPC_B64)
      return;

    // Update offsets of any references in the bundle.
    const unsigned NewBytes = 4;
    assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
           "Unexpected instruction insertion in bundle");
    auto NextMI = std::next(NewMI->getIterator());
    auto End = NewMI->getParent()->instr_end();
    while (NextMI != End && NextMI->isBundledWithPred()) {
      for (auto &Operand : NextMI->operands()) {
        if (Operand.isGlobal())
          Operand.setOffset(Operand.getOffset() + NewBytes);
      }
      NextMI++;
    }
  }
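To see why a fixed 4-byte adjustment suffices: s_getpc_b64 captures the address of the instruction that follows it, and the bundled adds apply relocation offsets measured from that point. Inserting one 32-bit s_wait_alu into the bundle pushes the remaining bundle contents down by 4 bytes, so each global operand's offset is rebased. An illustrative, hand-written before/after:

//  Before insertion:                     After inserting a 4-byte wait:
//    s_getpc_b64 s[0:1]                    s_getpc_b64 s[0:1]
//    s_add_u32  s0, s0, sym@rel32@lo+4     s_wait_alu <mask>
//    s_addc_u32 s1, s1, sym@rel32@hi+12    s_add_u32  s0, s0, sym@rel32@lo+8
//                                          s_addc_u32 s1, s1, sym@rel32@hi+16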


  struct HazardState {
    static constexpr unsigned None = 0;
    static constexpr unsigned SALU = (1 << 0);
    static constexpr unsigned VALU = (1 << 1);

    std::bitset<64> Tracked;      // SGPR pairs currently tracked for hazards
    std::bitset<128> SALUHazards; // SGPRs with uncommitted SALU writes
    std::bitset<128> VALUHazards; // SGPRs with uncommitted VALU writes
    unsigned VCCHazard = None;    // Unit type of the pending VCC write
    bool ActiveFlat = false;      // Unwaited flat scratch/LDS access seen

    bool merge(const HazardState &RHS) {
      HazardState Orig(*this);
      *this |= RHS;
      return (*this != Orig);
    }

    bool operator==(const HazardState &RHS) const {
      return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
             VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
             ActiveFlat == RHS.ActiveFlat;
    }

    bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }

    void operator|=(const HazardState &RHS) {
      Tracked |= RHS.Tracked;
      SALUHazards |= RHS.SALUHazards;
      VALUHazards |= RHS.VALUHazards;
      VCCHazard |= RHS.VCCHazard;
      ActiveFlat |= RHS.ActiveFlat;
    }
  };
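HazardState::merge is the join operation of the block-level dataflow: a plain bitwise union that reports whether the union added anything, which is what decides in run() whether a successor goes back on the worklist. A standalone sketch of that contract (illustrative; ToyState is hypothetical):

#include <bitset>
#include <cassert>

struct ToyState {
  std::bitset<128> Hazards;
  bool merge(const ToyState &RHS) {
    ToyState Orig = *this;          // remember the pre-merge state
    Hazards |= RHS.Hazards;         // join = bitwise union
    return Hazards != Orig.Hazards; // report whether anything was added
  }
};

int main() {
  ToyState A, B;
  A.Hazards.set(0);
  B.Hazards.set(1);
  assert(A.merge(B));  // union grew: {0} -> {0,1}
  assert(!A.merge(B)); // repeat adds nothing: a fixed point
  return 0;
}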


  struct BlockHazardState {
    HazardState In;
    HazardState Out;
  };

  DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;

  static constexpr unsigned WAVE32_NOPS = 4;
  static constexpr unsigned WAVE64_NOPS = 8;

  void insertHazardCull(MachineBasicBlock &MBB,
                        MachineBasicBlock::instr_iterator &MI) {
    assert(!MI->isBundled());
    unsigned Count = DsNopCount;
    while (Count--)
      BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
  }
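// A cull is DsNopCount consecutive DS_NOPs (4 in wave32, 8 in wave64);
// runOnMachineBasicBlock recognizes such a run and resets the tracked set,
// so a cull already present in the input is honored rather than repeated.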


  // Merge two DEPCTR masks field by field, keeping the stricter (smaller)
  // wait count for each counter.
  unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
    unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
    Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
                       AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
                       AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
    Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
        Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
                       AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
    return Mask;
  }
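The field-wise minimum is what makes a merged mask safe: a smaller DEPCTR count is a stricter wait, so taking the minimum per field enforces both original waits at once. A standalone sketch with a hypothetical two-field layout (the real encodings live in AMDGPU::DepCtr):

#include <algorithm>
#include <cassert>

// Hypothetical layout: field A in bits [3:0], field B in bits [7:4].
static unsigned getField(unsigned Mask, unsigned Shift) {
  return (Mask >> Shift) & 0xf;
}
static unsigned setField(unsigned Mask, unsigned Shift, unsigned V) {
  return (Mask & ~(0xfu << Shift)) | (V << Shift);
}

static unsigned mergeToyMasks(unsigned M1, unsigned M2) {
  unsigned Out = 0;
  Out = setField(Out, 0, std::min(getField(M1, 0), getField(M2, 0)));
  Out = setField(Out, 4, std::min(getField(M1, 4), getField(M2, 4)));
  return Out;
}

int main() {
  // M1 waits field A to zero; M2 waits field B to zero.
  assert(mergeToyMasks(0xf0, 0x0f) == 0x00); // merged mask enforces both
  return 0;
}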


  bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
                                unsigned Mask) {
    auto MBB = MI->getParent();
    if (MI == MBB->instr_begin())
      return false;

    auto It = prev_nodbg(MI, MBB->instr_begin());
    if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
      return false;

    It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
    return true;
  }
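// Folding the new mask into an immediately preceding S_WAITCNT_DEPCTR (via
// the field-wise merge above) avoids emitting back-to-back s_wait_alu
// instructions when several hazards resolve at the same point.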


  bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
    enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };

    HazardState State = BlockState[&MBB].In;
    SmallSet<Register, 8> SeenRegs;
    bool Emitted = false;
    unsigned DsNops = 0;

    for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
                                           E = MBB.instr_end();
         MI != E; ++MI) {
      if (MI->isMetaInstruction())
        continue;

      // A run of DsNopCount DS_NOPs acts as a hazard cull: clear tracking.
      if (MI->getOpcode() == AMDGPU::DS_NOP) {
        if (++DsNops >= DsNopCount)
          State.Tracked.reset();
        continue;
      }
      DsNops = 0;

      // Remember unwaited flat scratch/LDS accesses; the loadcnt-based cull
      // below must not fire while such a load may still be outstanding.
      if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
        State.ActiveFlat = true;

      // SMEM and VMEM instructions clear existing hazards.
      if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
        State.VCCHazard = HazardState::None;
        State.SALUHazards.reset();
        State.VALUHazards.reset();
        continue;
      }

      // An existing S_WAITCNT_DEPCTR clears the hazards its fields cover.
      if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
        unsigned int Mask = MI->getOperand(0).getImm();
        if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
          State.VCCHazard &= ~HazardState::VALU;
        if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
        }
        if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
          State.VALUHazards.reset();
        continue;
      }

      // Snoop zero-count memory waits: once enough SGPRs are tracked, it is
      // profitable to cull all hazards with a DS_NOP sequence.
      if (CullSGPRHazardsAtMemWait &&
          (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
           MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
          (MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
          (State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
        if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
          State.ActiveFlat = false;
        } else {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
          continue;
        }
      }


      // Only SALU and VALU instructions produce or consume these hazards.
      const bool IsVALU = SIInstrInfo::isVALU(*MI);
      const bool IsSALU = SIInstrInfo::isSALU(*MI);
      if (!IsVALU && !IsSALU)
        continue;

      unsigned Wait = 0;

      auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
        if (!Op.isReg())
          return;
        Register Reg = Op.getReg();
        assert(!Op.getSubReg());
        if (!TRI->isSGPRReg(*MRI, Reg))
          return;

        // Only visit each register once per instruction.
        if (!SeenRegs.insert(Reg).second)
          return;

        auto RegNumber = sgprNumber(Reg, *TRI);
        if (!RegNumber)
          return;

        // Track SGPRs at aligned 64-bit pair granularity, i.e. SGPR0/SGPR1
        // share slot 0, SGPR2/SGPR3 share slot 1, and so on.
        unsigned RegN = *RegNumber;
        unsigned PairN = (RegN >> 1) & 0x3f;

        // Reads and writes of untracked registers are safe, but a VALU read
        // must start tracking the pair.
        if (!State.Tracked[PairN]) {
          if (IsVALU && IsUse)
            State.Tracked.set(PairN);
          return;
        }

        uint8_t SGPRCount =
            AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;

        if (IsUse) {
          // SALU reads of an SGPR clear any VALU hazards on it.
          if (IsSALU) {
            if (isVCC(Reg)) {
              if (State.VCCHazard & HazardState::VALU)
                State.VCCHazard = HazardState::None;
            } else {
              State.VALUHazards.reset();
            }
          }
          // Compute the waits required before this read.
          for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
            Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
            Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
          }
          if (isVCC(Reg) && State.VCCHazard) {
            // Both hazard kinds can be set if merged predecessor blocks
            // wrote VCC with different unit types.
            if (State.VCCHazard & HazardState::SALU)
              Wait |= WA_SALU;
            if (State.VCCHazard & HazardState::VALU)
              Wait |= WA_VCC;
          }
        } else {
          // Defs record new hazards.
          if (isVCC(Reg)) {
            State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
          } else {
            for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
              if (IsSALU)
                State.SALUHazards.set(RegN + RegIdx);
              else
                State.VALUHazards.set(RegN + RegIdx);
            }
          }
        }
      };
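A concrete walk-through of the lambda's bookkeeping (illustrative instruction sequence; assumes the s0/s1 pair is already tracked by an earlier VALU read):

//   s_mov_b32 s0, 1    ; SALU def of s0 -> State.SALUHazards[0] is set
//   v_mov_b32 v0, s0   ; VALU use of s0 -> the set bit is observed, so
//                      ;   Wait |= WA_SALU and an s_wait_alu clearing
//                      ;   sa_sdst is emitted before this read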


      const bool IsSetPC =
          (MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
          MI->getOpcode() != AMDGPU::S_ENDPGM &&
          MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;

      // Only consider implicit VCC when the instruction descriptor itself
      // names it; other implicit operands are ignored.
      const bool HasImplicitVCC =
          llvm::any_of(MI->getDesc().implicit_uses(), isVCC) ||
          llvm::any_of(MI->getDesc().implicit_defs(), isVCC);

      if (IsSetPC) {
        // All hazards must be resolved before a call, return or indirect
        // branch, as the other side of the transfer cannot see the hazard
        // chain to wait on it.
        if (State.VCCHazard & HazardState::VALU)
          Wait |= WA_VCC;
        if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
          Wait |= WA_SALU;
        if (State.VALUHazards.any())
          Wait |= WA_VALU;
        if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
          State.Tracked.reset();
          if (Emit)
            insertHazardCull(MBB, MI);
        }
      } else {
        // Process uses to determine the required waits.
        SeenRegs.clear();
        for (const MachineOperand &Op : MI->all_uses()) {
          if (Op.isImplicit() &&
              (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
            continue;
          processOperand(Op, true);
        }
      }


      // Apply the required waits: start from the "no wait" DEPCTR encoding
      // and tighten only the fields implied by the Wait bits.
      if (Wait) {
        unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
        if (Wait & WA_VCC) {
          State.VCCHazard &= ~HazardState::VALU;
          Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
        }
        if (Wait & WA_SALU) {
          State.SALUHazards.reset();
          State.VCCHazard &= ~HazardState::SALU;
          Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
        }
        if (Wait & WA_VALU) {
          State.VALUHazards.reset();
          Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
        }
        if (Emit) {
          if (!mergeConsecutiveWaitAlus(MI, Mask)) {
            auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
                                 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
                             .addImm(Mask);
            updateGetPCBundle(NewMI);
          }
          Emitted = true;
        }
      }
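      // The three wait bits map onto the DEPCTR fields encoded above:
      //   WA_VCC  -> va_vcc  = 0 : wait for VALU writes of VCC
      //   WA_SALU -> sa_sdst = 0 : wait for SALU SGPR writes
      //   WA_VALU -> va_sdst = 0 : wait for VALU SGPR writes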


      // After a call the SGPR state is unknown: treat every pair as tracked
      // unless boundary culls flush hazards at the call site instead.
      if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
        State.Tracked.set();

      // Process defs to update the hazard state.
      SeenRegs.clear();
      for (const MachineOperand &Op : MI->all_defs()) {
        if (Op.isImplicit() &&
            (!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
          continue;
        processOperand(Op, false);
      }
    }

    BlockHazardState &BS = BlockState[&MBB];
    bool Changed = State != BS.Out;
    if (Emit) {
      assert(!Changed && "Hazard state should not change on emit pass");
      return Emitted;
    }
    BS.Out = State;
    return Changed;
  }
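  // runOnMachineBasicBlock serves both phases of run(): with Emit == false it
  // recomputes the block's output state and reports whether it changed; with
  // Emit == true the dataflow has already reached its fixed point (hence the
  // assert) and the return value reports whether instructions were inserted.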


  bool run(MachineFunction &MF) {
    ST = &MF.getSubtarget<GCNSubtarget>();
    if (!ST->hasVALUReadSGPRHazard())
      return false;

    // Parse settings: an explicit command-line flag wins, otherwise a
    // function attribute of the same name overrides the default.
    EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
    CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
    CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
    CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;

    const Function &F = MF.getFunction();
    if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
      EnableSGPRHazardWaits = F.getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
    if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
      CullSGPRHazardsOnFunctionBoundary =
          F.hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
    if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
      CullSGPRHazardsAtMemWait =
          F.hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
    if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
      CullSGPRHazardsMemWaitThreshold = F.getFnAttributeAsParsedInteger(
          "amdgpu-sgpr-hazard-mem-wait-cull-threshold",
          CullSGPRHazardsMemWaitThreshold);

    // Bail if the pass is disabled for this function.
    if (!EnableSGPRHazardWaits)
      return false;

    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();
    MRI = &MF.getRegInfo();
    DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;

    auto CallingConv = MF.getFunction().getCallingConv();
    if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
        !CullSGPRHazardsOnFunctionBoundary) {
      // A called function must assume all SGPRs carry caller hazards.
      LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
      MachineBasicBlock &EntryBlock = MF.front();
      BlockState[&EntryBlock].In.Tracked.set();
    }


    // Calculate the hazard state for each basic block, iterating until a
    // fixed point is reached:
    // - a block's input state is derived from its predecessors' outputs;
    // - a block returns to the worklist whenever its input state changes;
    // - a single-predecessor block takes a copy of its predecessor's output,
    //   which (unlike a merge) also lets culled Tracked bits propagate;
    // - all other blocks accumulate state via HazardState::merge, which can
    //   only ever grow the state.
    SetVector<MachineBasicBlock *> Worklist;
    for (auto &MBB : reverse(MF))
      Worklist.insert(&MBB);
    while (!Worklist.empty()) {
      auto &MBB = *Worklist.pop_back_val();
      bool Changed = runOnMachineBasicBlock(MBB, false);
      if (Changed) {
        // Note: take a copy of the state here in case it is reallocated.
        HazardState NewState = BlockState[&MBB].Out;

        for (auto Succ : MBB.successors()) {
          auto &SuccState = BlockState[Succ];
          if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
            if (SuccState.In != NewState) {
              SuccState.In = NewState;
              Worklist.insert(Succ);
            }
          } else if (SuccState.In.merge(NewState)) {
            Worklist.insert(Succ);
          }
        }
      }
    }


    LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");

    // Final pass over all blocks to emit the required waits.
    bool Changed = false;
    for (auto &MBB : MF)
      Changed |= runOnMachineBasicBlock(MBB, true);

    BlockState.clear();
    return Changed;
  }
};


class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    return AMDGPUWaitSGPRHazards().run(MF);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};


} // end anonymous namespace

char AMDGPUWaitSGPRHazardsLegacy::ID = 0;

char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;

INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
                "AMDGPU Insert waits for SGPR read hazards", false, false)

PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  if (AMDGPUWaitSGPRHazards().run(MF))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}
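Since INITIALIZE_PASS registers the legacy pass under DEBUG_TYPE, the LLVM_DEBUG tracing above can be enabled with -debug-only=amdgpu-wait-sgpr-hazards in assertion-enabled builds, for example while running llc on a GFX12 target where hasVALUReadSGPRHazard() holds.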
