LLVM: lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
19
20using namespace llvm;
21
22#define DEBUG_TYPE "amdgpu-insert-delay-alu"
23
24namespace {
25
26class AMDGPUInsertDelayAlu {
27public:
31
33
34
35 static bool instructionWaitsForVALU(const MachineInstr &MI) {
36
40 if (MI.getDesc().TSFlags & VA_VDST_0)
41 return true;
42 if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
43 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
44 return true;
45 if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
47 return true;
48 return false;
49 }
50
51 static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
52
53 uint64_t MIFlags = MI.getDesc().TSFlags;
55 return true;
56
58 for (auto &Op : MI.operands()) {
59 if (Op.isReg())
60 return true;
61 }
62 }
63 return false;
64 }
65
66 // Instruction categories used when tracking delays. OTHER is the category for which no delay state is created or consumed (callers guard on `Type != OTHER`).
67 enum DelayType { VALU, TRANS, SALU, OTHER };
68
69
72 return TRANS;
73
75 return TRANS;
77 return VALU;
79 return SALU;
80 return OTHER;
81 }
82
83
84
85
86
87 struct DelayInfo {
88
89
90 static constexpr unsigned VALU_MAX = 5;
91
92
93
94 static constexpr unsigned TRANS_MAX = 4;
95
96
97
98 static constexpr unsigned SALU_CYCLES_MAX = 4;
99
100
101
102
104 uint8_t VALUNum = VALU_MAX;
105
106
107
108
110 uint8_t TRANSNum = TRANS_MAX;
111
112
113
114
115 uint8_t TRANSNumVALU = VALU_MAX;
116
117
118
120
121 DelayInfo() = default;
122
123 DelayInfo(DelayType Type, unsigned Cycles) {
124 switch (Type) {
125 default:
127 case VALU:
128 VALUCycles = Cycles;
129 VALUNum = 0;
130 break;
131 case TRANS:
132 TRANSCycles = Cycles;
133 TRANSNum = 0;
134 TRANSNumVALU = 0;
135 break;
136 case SALU:
137
138
139 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
140 break;
141 }
142 }
143
145 return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
146 TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
147 TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
148 }
149 // Inequality is simply the negation of operator==.
150 bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
151
152 // Fold RHS into this DelayInfo, keeping the worst case of the two for every
153 // field: the larger remaining cycle count and the smaller instruction count.
154 void merge(const DelayInfo &RHS) {
155 VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
156 VALUNum = std::min(VALUNum, RHS.VALUNum);
157 TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
158 TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
159 TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
160 SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
161 }
162
163 // Age this DelayInfo past one issued instruction of category Type that took
164 // Cycles cycles. Returns true when none of the VALU, TRANS or SALU parts
165 // still holds a pending delay, i.e. the entry carries no useful information.
166 bool advance(DelayType Type, unsigned Cycles) {
167 bool Erase = true;
168
169 VALUNum += (Type == VALU);
170 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
171 // Either VALU_MAX or more VALU instructions have been counted, or the
172 // remaining cycles have elapsed: reset the VALU part to its idle values.
173 VALUNum = VALU_MAX;
174 VALUCycles = 0;
175 } else {
176 VALUCycles -= Cycles;
177 Erase = false;
178 }
179
180 TRANSNum += (Type == TRANS);
181 TRANSNumVALU += (Type == VALU);
182 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
183 // Either TRANS_MAX or more TRANS instructions have been counted, or the
184 // remaining cycles have elapsed: reset the TRANS part to its idle values.
185 TRANSNum = TRANS_MAX;
186 TRANSNumVALU = VALU_MAX;
187 TRANSCycles = 0;
188 } else {
189 TRANSCycles -= Cycles;
190 Erase = false;
191 }
192
193 if (SALUCycles <= Cycles) {
194 // The SALU part is tracked by cycles only (no instruction counter);
195 // once they have elapsed there is nothing left to wait for.
196 SALUCycles = 0;
197 } else {
198 SALUCycles -= Cycles;
199 Erase = false;
200 }
201
202 return Erase;
203 }
204 // Debug helper: writes to dbgs() only the fields that differ from their idle values (non-zero cycle counts, counters below their *_MAX).
205#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
206 void dump() const {
207 if (VALUCycles)
208 dbgs() << " VALUCycles=" << (int)VALUCycles;
209 if (VALUNum < VALU_MAX)
210 dbgs() << " VALUNum=" << (int)VALUNum;
211 if (TRANSCycles)
212 dbgs() << " TRANSCycles=" << (int)TRANSCycles;
213 if (TRANSNum < TRANS_MAX)
214 dbgs() << " TRANSNum=" << (int)TRANSNum;
215 if (TRANSNumVALU < VALU_MAX)
216 dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
217 if (SALUCycles)
218 dbgs() << " SALUCycles=" << (int)SALUCycles;
219 }
220#endif
221 };
222
223
224 struct DelayState : DenseMap<MCRegUnit, DelayInfo> {
225
226
227 void merge(const DelayState &RHS) {
228 for (const auto &KV : RHS) {
230 bool Inserted;
231 std::tie(It, Inserted) = insert(KV);
232 if (!Inserted)
233 It->second.merge(KV.second);
234 }
235 }
236
237
238
239 void advance(DelayType Type, unsigned Cycles) {
241 for (auto I = begin(), E = end(); I != E; I = Next) {
243 if (I->second.advance(Type, Cycles))
245 }
246 }
247
248 void advanceByVALUNum(unsigned VALUNum) {
250 for (auto I = begin(), E = end(); I != E; I = Next) {
252 if (I->second.VALUNum >= VALUNum && I->second.VALUCycles > 0) {
254 }
255 }
256 }
257
258#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
260 if (empty()) {
261 dbgs() << " empty\n";
262 return;
263 }
264
265
271 return A->first < B->first;
272 });
275 I->second.dump();
276 dbgs() << "\n";
277 }
278 }
279#endif
280 };
281
282
284
285
288 unsigned Imm = 0;
289
290
291 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
292 Imm |= 4 + Delay.TRANSNum;
293
294
295
296 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
297 Delay.VALUNum <= Delay.TRANSNumVALU) {
298 if (Imm & 0xf)
299 Imm |= Delay.VALUNum << 7;
300 else
301 Imm |= Delay.VALUNum;
302 }
303
304
305 if (Delay.SALUCycles) {
306 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
307 if (Imm & 0x780) {
308
309
310 } else if (Imm & 0xf) {
311 Imm |= (Delay.SALUCycles + 8) << 7;
312 } else {
313 Imm |= Delay.SALUCycles + 8;
314 }
315 }
316
317
318 if (!Imm)
319 return LastDelayAlu;
320
321
322
323 if (!(Imm & 0x780) && LastDelayAlu) {
324 unsigned Skip = 0;
328 if (->isBundle() &&
->isMetaInstruction())
329 ++Skip;
330 }
331 if (Skip < 6) {
333 unsigned LastImm = Op.getImm();
334 assert((LastImm & ~0xf) == 0 &&
335 "Remembered an s_delay_alu with no room for another delay!");
336 LastImm |= Imm << 7 | Skip << 4;
337 Op.setImm(LastImm);
338 return nullptr;
339 }
340 }
341
342 auto &MBB = *MI.getParent();
345
346
347 return (Imm & 0x780) ? nullptr : DelayAlu;
348 }
349
351 DelayState State;
352 for (auto *Pred : MBB.predecessors())
353 State.merge(BlockState[Pred]);
354
356 << "\n";
357 State.dump(TRI););
358
361
362
363 MCRegUnit LastSGPRFromVALU = static_cast<MCRegUnit>(0);
364
365
366 for (auto &MI : MBB.instrs()) {
367 if (MI.isBundle() || MI.isMetaInstruction())
368 continue;
369
370
371 switch (MI.getOpcode()) {
372 case AMDGPU::SI_RETURN_TO_EPILOG:
373 continue;
374 }
375
376 DelayType Type = getDelayType(MI);
377
378 if (instructionWaitsForSGPRWrites(MI)) {
379 auto It = State.find(LastSGPRFromVALU);
380 if (It != State.end()) {
381 DelayInfo Info = It->getSecond();
382 State.advanceByVALUNum(Info.VALUNum);
383
384 LastSGPRFromVALU = static_cast<MCRegUnit>(0);
385 }
386 }
387
388 if (instructionWaitsForVALU(MI)) {
389
390
391 State = DelayState();
392 } else if (Type != OTHER) {
393 DelayInfo Delay;
394
395 for (const auto &Op : MI.explicit_uses()) {
396 if (Op.isReg()) {
397
398
399
400 if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
401 continue;
402 for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
403 auto It = State.find(Unit);
404 if (It != State.end()) {
405 Delay.merge(It->second);
406 State.erase(Unit);
407 }
408 }
409 }
410 }
411
412 if (SII->isVALU(MI.getOpcode())) {
413 for (const auto &Op : MI.defs()) {
416 LastSGPRFromVALU = *TRI->regunits(Reg).begin();
417 break;
418 }
419 }
420 }
421
422 if (Emit && .isBundledWithPred()) {
423
424
425 LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
426 }
427 }
428
429 if (Type != OTHER) {
430
431 for (const auto &Op : MI.defs()) {
433 &MI, Op.getOperandNo(), nullptr, 0);
434 for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
436 }
437 }
438
439
440
441
443
444
445
446 State.advance(Type, Cycles);
447
449 }
450
451 if (Emit) {
452 assert(State == BlockState[&MBB] &&
453 "Basic block state should not have changed on final pass!");
454 } else if (DelayState &BS = BlockState[&MBB]; State != BS) {
455 BS = std::move(State);
457 }
459 }
460
463 << "\n");
464
466 if (!ST->hasDelayAlu())
467 return false;
468
469 SII = ST->getInstrInfo();
470 TRI = ST->getRegisterInfo();
472
473
474
478 while (!WorkList.empty()) {
480 bool Changed = runOnMachineBasicBlock(MBB, false);
483 }
484
486
487
488
490 for (auto &MBB : MF)
491 Changed |= runOnMachineBasicBlock(MBB, true);
493 }
494};
495
497public:
498 static char ID;
499
501
502 void getAnalysisUsage(AnalysisUsage &AU) const override {
505 }
506
507 bool runOnMachineFunction(MachineFunction &MF) override {
509 return false;
510 AMDGPUInsertDelayAlu Impl;
511 return Impl.run(MF);
512 }
513};
514}
515
519 if (!AMDGPUInsertDelayAlu().run(MF))
523 return PA;
524}
525
526char AMDGPUInsertDelayAluLegacy::ID = 0;
527
529
531 "AMDGPU Insert Delay ALU", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Represents analyses that only rely on functions' control flow.
Instructions::iterator instr_iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
bool isXDLWMMA(const MachineInstr &MI) const
static bool isSALU(const MachineInstr &MI)
const TargetSchedModel & getSchedModel() const
static bool isTRANS(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isVALU(const MachineInstr &MI)
A vector that has set insertion semantics.
void insert_range(Range &&R)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)
Is Reg - scalar register.
bool isGFX1250(const MCSubtargetInfo &STI)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
Definition AMDGPUInsertDelayAlu.cpp:528
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)
Definition AMDGPUInsertDelayAlu.cpp:517