LLVM: lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

19

20using namespace llvm;

21

22#define DEBUG_TYPE "amdgpu-insert-delay-alu"

23

24namespace {

25

26class AMDGPUInsertDelayAlu {

27public:

31

33

34

35 static bool instructionWaitsForVALU(const MachineInstr &MI) {

36

40 if (MI.getDesc().TSFlags & VA_VDST_0)

41 return true;

42 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||

43 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)

44 return true;

45 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&

47 return true;

48 return false;

49 }

50

51 static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {

52

53 uint64_t MIFlags = MI.getDesc().TSFlags;

55 return true;

56

58 for (auto &Op : MI.operands()) {

59 if (Op.isReg())

60 return true;

61 }

62 }

63 return false;

64 }

65

66

/// Instruction classes distinguished by the delay tracking in this file.
/// The enumerators keep their implicit values 0..3 in declaration order.
enum DelayType { VALU, TRANS, SALU, OTHER };

68

69

72 return TRANS;

73

75 return TRANS;

77 return VALU;

79 return SALU;

80 return OTHER;

81 }

82

83

84

85

86

87 struct DelayInfo {

88

89

90 static constexpr unsigned VALU_MAX = 5;

91

92

93

94 static constexpr unsigned TRANS_MAX = 4;

95

96

97

98 static constexpr unsigned SALU_CYCLES_MAX = 4;

99

100

101

102

104 uint8_t VALUNum = VALU_MAX;

105

106

107

108

110 uint8_t TRANSNum = TRANS_MAX;

111

112

113

114

115 uint8_t TRANSNumVALU = VALU_MAX;

116

117

118

120

121 DelayInfo() = default;

122

123 DelayInfo(DelayType Type, unsigned Cycles) {

124 switch (Type) {

125 default:

127 case VALU:

128 VALUCycles = Cycles;

129 VALUNum = 0;

130 break;

131 case TRANS:

132 TRANSCycles = Cycles;

133 TRANSNum = 0;

134 TRANSNumVALU = 0;

135 break;

136 case SALU:

137

138

139 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);

140 break;

141 }

142 }

143

145 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&

146 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&

147 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;

148 }

149

150 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }

151

152

153

154 void merge(const DelayInfo &RHS) {

155 VALUCycles = std::max(VALUCycles, RHS.VALUCycles);

156 VALUNum = std::min(VALUNum, RHS.VALUNum);

157 TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);

158 TRANSNum = std::min(TRANSNum, RHS.TRANSNum);

159 TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);

160 SALUCycles = std::max(SALUCycles, RHS.SALUCycles);

161 }

162

163

164

165

166 bool advance(DelayType Type, unsigned Cycles) {

167 bool Erase = true;

168

169 VALUNum += (Type == VALU);

170 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {

171

172

173 VALUNum = VALU_MAX;

174 VALUCycles = 0;

175 } else {

176 VALUCycles -= Cycles;

177 Erase = false;

178 }

179

180 TRANSNum += (Type == TRANS);

181 TRANSNumVALU += (Type == VALU);

182 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {

183

184

185 TRANSNum = TRANS_MAX;

186 TRANSNumVALU = VALU_MAX;

187 TRANSCycles = 0;

188 } else {

189 TRANSCycles -= Cycles;

190 Erase = false;

191 }

192

193 if (SALUCycles <= Cycles) {

194

195

196 SALUCycles = 0;

197 } else {

198 SALUCycles -= Cycles;

199 Erase = false;

200 }

201

202 return Erase;

203 }

204

205#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

206 void dump() const {

207 if (VALUCycles)

208 dbgs() << " VALUCycles=" << (int)VALUCycles;

209 if (VALUNum < VALU_MAX)

210 dbgs() << " VALUNum=" << (int)VALUNum;

211 if (TRANSCycles)

212 dbgs() << " TRANSCycles=" << (int)TRANSCycles;

213 if (TRANSNum < TRANS_MAX)

214 dbgs() << " TRANSNum=" << (int)TRANSNum;

215 if (TRANSNumVALU < VALU_MAX)

216 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;

217 if (SALUCycles)

218 dbgs() << " SALUCycles=" << (int)SALUCycles;

219 }

220#endif

221 };

222

223

224 struct DelayState : DenseMap<MCRegUnit, DelayInfo> {

225

226

227 void merge(const DelayState &RHS) {

228 for (const auto &KV : RHS) {

230 bool Inserted;

231 std::tie(It, Inserted) = insert(KV);

232 if (!Inserted)

233 It->second.merge(KV.second);

234 }

235 }

236

237

238

239 void advance(DelayType Type, unsigned Cycles) {

241 for (auto I = begin(), E = end(); I != E; I = Next) {

242 Next = std::next(I);

243 if (I->second.advance(Type, Cycles))

245 }

246 }

247

248 void advanceByVALUNum(unsigned VALUNum) {

250 for (auto I = begin(), E = end(); I != E; I = Next) {

251 Next = std::next(I);

252 if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {

254 }

255 }

256 }

257

258#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

260 if (empty()) {

261 dbgs() << " empty\n";

262 return;

263 }

264

265

271 return A->first < B->first;

272 });

275 I->second.dump();

276 dbgs() << "\n";

277 }

278 }

279#endif

280 };

281

282

284

285

288 unsigned Imm = 0;

289

290

291 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)

292 Imm |= 4 + Delay.TRANSNum;

293

294

295

296 if (Delay.VALUNum < DelayInfo::VALU_MAX &&

297 Delay.VALUNum <= Delay.TRANSNumVALU) {

298 if (Imm & 0xf)

299 Imm |= Delay.VALUNum << 7;

300 else

301 Imm |= Delay.VALUNum;

302 }

303

304

305 if (Delay.SALUCycles) {

306 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);

307 if (Imm & 0x780) {

308

309

310 } else if (Imm & 0xf) {

311 Imm |= (Delay.SALUCycles + 8) << 7;

312 } else {

313 Imm |= Delay.SALUCycles + 8;

314 }

315 }

316

317

318 if (!Imm)

319 return LastDelayAlu;

320

321

322

323 if (!(Imm & 0x780) && LastDelayAlu) {

324 unsigned Skip = 0;

327 ++I != E;) {

328 if (I->isBundle() && I->isMetaInstruction())

329 ++Skip;

330 }

331 if (Skip < 6) {

333 unsigned LastImm = Op.getImm();

334 assert((LastImm & ~0xf) == 0 &&

335 "Remembered an s_delay_alu with no room for another delay!");

336 LastImm |= Imm << 7 | Skip << 4;

337 Op.setImm(LastImm);

338 return nullptr;

339 }

340 }

341

342 auto &MBB = *MI.getParent();

345

346

347 return (Imm & 0x780) ? nullptr : DelayAlu;

348 }

349

351 DelayState State;

352 for (auto *Pred : MBB.predecessors())

353 State.merge(BlockState[Pred]);

354

356 << "\n";

357 State.dump(TRI););

358

361

362

363 MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);

364

365

366 for (auto &MI : MBB.instrs()) {

367 if (MI.isBundle() || MI.isMetaInstruction())

368 continue;

369

370

371 switch (MI.getOpcode()) {

372 case AMDGPU::SI_RETURN_TO_EPILOG:

373 continue;

374 }

375

376 DelayType Type = getDelayType(MI);

377

378 if (instructionWaitsForSGPRWrites(MI)) {

379 auto It = State.find(LastSGPRFromVALU);

380 if (It != State.end()) {

381 DelayInfo Info = It->getSecond();

382 State.advanceByVALUNum(Info.VALUNum);

383

384 LastSGPRFromVALU = static_cast<MCRegUnit>(0);

385 }

386 }

387

388 if (instructionWaitsForVALU(MI)) {

389

390

391 State = DelayState();

392 } else if (Type != OTHER) {

393 DelayInfo Delay;

394

395 for (const auto &Op : MI.explicit_uses()) {

396 if (Op.isReg()) {

397

398

399

400 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())

401 continue;

402 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {

403 auto It = State.find(Unit);

404 if (It != State.end()) {

405 Delay.merge(It->second);

406 State.erase(Unit);

407 }

408 }

409 }

410 }

411

412 if (SII->isVALU(MI.getOpcode())) {

413 for (const auto &Op : MI.defs()) {

416 LastSGPRFromVALU = *TRI->regunits(Reg).begin();

417 break;

418 }

419 }

420 }

421

422 if (Emit && MI.isBundledWithPred()) {

423

424

425 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);

426 }

427 }

428

429 if (Type != OTHER) {

430

431 for (const auto &Op : MI.defs()) {

433 &MI, Op.getOperandNo(), nullptr, 0);

434 for (MCRegUnit Unit : TRI->regunits(Op.getReg()))

436 }

437 }

438

439

440

441

443

444

445

446 State.advance(Type, Cycles);

447

449 }

450

451 if (Emit) {

452 assert(State == BlockState[&MBB] &&

453 "Basic block state should not have changed on final pass!");

454 } else if (DelayState &BS = BlockState[&MBB]; State != BS) {

455 BS = std::move(State);

457 }

459 }

460

463 << "\n");

464

466 if (!ST->hasDelayAlu())

467 return false;

468

469 SII = ST->getInstrInfo();

470 TRI = ST->getRegisterInfo();

472

473

474

478 while (!WorkList.empty()) {

480 bool Changed = runOnMachineBasicBlock(MBB, false);

483 }

484

486

487

488

490 for (auto &MBB : MF)

491 Changed |= runOnMachineBasicBlock(MBB, true);

493 }

494};

495

497public:

498 static char ID;

499

501

502 void getAnalysisUsage(AnalysisUsage &AU) const override {

505 }

506

507 bool runOnMachineFunction(MachineFunction &MF) override {

509 return false;

510 AMDGPUInsertDelayAlu Impl;

511 return Impl.run(MF);

512 }

513};

514}

515

519 if (!AMDGPUInsertDelayAlu().run(MF))

523 return PA;

524}

525

526char AMDGPUInsertDelayAluLegacy::ID = 0;

527

529

531 "AMDGPU Insert Delay ALU", false, false)

assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

Provides AMDGPU specific target descriptions.

static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Analysis containing CSE Info

AMD GCN specific subclass of TargetSubtarget.

static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)

Register const TargetRegisterInfo * TRI

#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)

Interface definition for SIInstrInfo.

This file implements a set that has insertion order iteration characteristics.

Represent the analysis usage information of a pass.

LLVM_ABI void setPreservesCFG()

This function should be called by the pass, iff they do not:

Represents analyses that only rely on functions' control flow.

Instructions::iterator instr_iterator

MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...

void getAnalysisUsage(AnalysisUsage &AU) const override

getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.

const TargetSubtargetInfo & getSubtarget() const

getSubtarget - Return the subtarget for which this machine code is being compiled.

StringRef getName() const

getName - Return the name of the corresponding LLVM function.

Function & getFunction()

Return the LLVM function that this machine code represents.

const MachineInstrBuilder & addImm(int64_t Val) const

Add a new immediate operand.

Representation of each machine instruction.

const MachineOperand & getOperand(unsigned i) const

MachineOperand class - Representation of each machine instruction operand.

A set of analyses that are preserved following a run of a transformation pass.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

Wrapper class representing virtual and physical registers.

bool isXDLWMMA(const MachineInstr &MI) const

static bool isSALU(const MachineInstr &MI)

const TargetSchedModel & getSchedModel() const

static bool isTRANS(const MachineInstr &MI)

static unsigned getNumWaitStates(const MachineInstr &MI)

Return the number of wait states that result from executing this instruction.

static bool isVALU(const MachineInstr &MI)

A vector that has set insertion semantics.

void insert_range(Range &&R)

bool empty() const

Determine if the SetVector is empty or not.

bool insert(const value_type &X)

Insert a new element into the SetVector.

value_type pop_back_val()

void reserve(size_type N)

void push_back(const T &Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...

Provide an instruction scheduling machine model to CodeGen passes.

LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const

Compute operand latency based on the available machine model.

The instances of the Type class are immutable: once they are created, they are never changed.

#define llvm_unreachable(msg)

Marks that the current location is not supposed to be reachable.

unsigned decodeFieldVaVdst(unsigned Encoded)

bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)

Is Reg - scalar register.

bool isGFX1250(const MCSubtargetInfo &STI)

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

This is an optimization pass for GlobalISel generic memory operations.

void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)

auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)

Get the size of a range.

MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)

Builder interface. Specify how to create the initial instruction itself.

bool operator!=(uint64_t V1, const APInt &V2)

LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI)

Create Printable object to print register units on a raw_ostream.

AnalysisManager< MachineFunction > MachineFunctionAnalysisManager

bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)

LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()

Returns the minimum set of Analyses that all machine function passes must preserve.

void erase(Container &C, ValueType V)

Wrapper function to remove a value from a container:

auto reverse(ContainerTy &&C)

void sort(IteratorTy Start, IteratorTy End)

LLVM_ABI raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

char & AMDGPUInsertDelayAluID

Definition AMDGPUInsertDelayAlu.cpp:528

FunctionAddr VTableAddr Next

DWARFExpression::Operation Op

LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)

Prints a machine basic block reference.

PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)

Definition AMDGPUInsertDelayAlu.cpp:517