LLVM: lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp Source File
//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
/// currently depends on LCSSA to insert phis with one incoming.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

public:
  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
    initializeAMDGPUGlobalISelDivergenceLoweringPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
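
// Note on structure (summary, not from the original file): PhiLoweringHelper
// is declared in SILowerI1Copies.h and shared with the SelectionDAG path. Its
// lowerPhis() driver finds candidate phis via getCandidatesForLowering(),
// gathers (value, block) pairs via collectIncomingValuesFromPhi(), and then
// rewrites each divergent i1 phi into exec-based lane mask merging using
// markAsLaneMask(), constrainAsLaneMask(), buildMergeLaneMasks() and
// replaceDstReg() below.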
class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;
  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Lane masks are 32/64-bit scalars: _(s1) -> SReg_32/64(s1).
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}
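
// Illustration (assumption based on GCNSubtarget::getBoolRC()): on wave32
// targets the boolean register class is SReg_32 and EXEC is $exec_lo; on
// wave64 it is SReg_64 with the full $exec. A divergent compare such as
//   %c:_(s1) = G_ICMP intpred(eq), %a(s32), %b(s32)
// therefore becomes %c:sreg_32(s1) or %c:sreg_64(s1): one bit per lane held
// in a scalar register.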

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}
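
// For reference, a machine phi lists the def in operand 0 followed by
// (value, predecessor block) pairs, e.g.
//   %dst:_(s1) = G_PHI %a(s1), %bb.1, %b(s1), %bb.2
// which is why the loop above starts at operand 1 and advances by 2.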

void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register; insert the copy right after the
// instruction that defines Reg, skipping phis and labels.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous:
//   %PrevReg = ...
//
// bb.current:
//   %CurReg = ...
//
//   %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous:
//   %PrevReg = ...
//   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current:
//   %CurReg = ...
//   %CurRegCopy:sreg_32(s1) = COPY %CurReg
//   ...
//   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, $exec_lo
//   %CurMaskedReg:sreg_32(s1) = AND $exec_lo, %CurRegCopy
//   %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg = merged lane mask
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}
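
// Worked example (illustrative, 4 lanes, not from the original source):
// with EXEC = 0b0011, PrevReg = 0b1010 and CurReg = 0b0101:
//   PrevMaskedReg = 0b1010 & ~0b0011 = 0b1000  // inactive lanes keep Prev
//   CurMaskedReg  = 0b0011 &  0b0101 = 0b0001  // active lanes take Cur
//   DstReg        = 0b1000 |  0b0001 = 0b1001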

// An incoming value of a lane mask phi that is not itself a lane mask is
// constrained here: copy it to a new lane mask register at the end of its
// incoming block and use that copy as the incoming value instead.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

static void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                       Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}

bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  SmallDenseMap<Register, Register, 4> TDCache;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}
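
// Background (summary, sketch not from the original source): a register can
// be uniform at its definition inside a cycle and still be divergent when
// used outside of it, because lanes may leave the cycle in different
// iterations and therefore observe values of different iterations:
//
//   bb.loop:
//     %v:_(s32) = ...          ; uniform within one iteration
//     ...                      ; divergent exit condition
//   bb.exit:
//     ... = use %v             ; lanes may see different iterations' values
//
// The COPY built above, with an implicit use of EXEC, keeps a per-lane VGPR
// value alive across the divergent exit.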

bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // For a register used outside multiple nested cycles, or used multiple
  // times, only the largest relevant cycle needs a merged lane mask.
  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>, 4>
      LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC)) {
      CachedLRC = LRC;
    }
  }

  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.getFirst();
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

    // Entry blocks of the cycle can be reached without passing through the
    // def: seed predecessors outside the cycle with IMPLICIT_DEF.
    for (auto Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}
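
// For i1 values a plain VGPR copy is not enough: a lane mask defined inside
// the cycle only holds valid bits for lanes active in the current iteration.
// The merge built above therefore accumulates bits across iterations through
// the MachineSSAUpdater-placed phis, so each lane keeps the bit from the
// iteration in which it exited the cycle.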

} // End anonymous namespace.
289 "AMDGPU GlobalISel divergence lowering", false, false)
294 "AMDGPU GlobalISel divergence lowering", false, false)
295
296char AMDGPUGlobalISelDivergenceLowering::ID = 0;
297
299 AMDGPUGlobalISelDivergenceLowering::ID;
300
302 return new AMDGPUGlobalISelDivergenceLowering();
303}
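
// Usage sketch (assumption, mirroring how similar MIR passes are exercised):
// the pass registered above can be run in isolation with
//   llc -mtriple=amdgcn -run-pass=amdgpu-global-isel-divergence-lowering \
//       -o - input.mir
// and otherwise runs in the AMDGPU GlobalISel pipeline before instruction
// selection.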

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering inspects the list of registers defined
  // inside a cycle with divergent exit but used outside of it, provided by
  // the uniformity analysis. It has to run before phi lowering, which
  // inserts and deletes instructions and would invalidate that list.
  Changed |= Helper.lowerTemporalDivergence();
  // Lower i1 uses outside of cycles with divergent exit by merging lane
  // masks across iterations.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Lower divergent i1 phis into exec-based lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}