LLVM: lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
31
32using namespace llvm;
33
34#define DEBUG_TYPE "amdgpu-perf-hint"
35
38 cl::desc("Function mem bound threshold in %"));
39
42 cl::desc("Kernel limit wave threshold in %"));
43
46 cl::desc("Indirect access memory instruction weight"));
47
50 cl::desc("Large stride memory access weight"));
51
54 cl::desc("Large stride memory access threshold"));
55
// Pass statistics, reported under -stats.
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
58
59namespace {
60
61struct AMDGPUPerfHint {
63
64public:
67 : FIM(FIM_), TLI(TLI_) {}
68
70
71private:
72 struct MemAccessInfo {
73 const Value *V = nullptr;
74 const Value *Base = nullptr;
75 int64_t Offset = 0;
76 MemAccessInfo() = default;
77 bool isLargeStride(MemAccessInfo &Reference) const;
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
79 Printable print() const {
80 return Printable([this](raw_ostream &OS) {
81 OS << "Value: " << *V << '\n'
82 << "Base: " << *Base << " Offset: " << Offset << '\n';
83 });
84 }
85#endif
86 };
87
88 MemAccessInfo makeMemAccessInfo(Instruction *) const;
89
90 MemAccessInfo LastAccess;
91
93
94 const DataLayout *DL = nullptr;
95
96 const SITargetLowering *TLI;
97
98 AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
99 static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
100 static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
101
102 bool isIndirectAccess(const Instruction *Inst) const;
103
104
105
106
107
108
109
110
111 bool isLargeStride(const Instruction *Inst);
112
113 bool isGlobalAddr(const Value *V) const;
114 bool isLocalAddr(const Value *V) const;
115 bool isGlobalLoadUsedInBB(const Instruction &) const;
116};
117
118static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
121 return {LI->getPointerOperand(), LI->getType()};
123 return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
125 return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
127 return {AI->getPointerOperand(), AI->getValOperand()->getType()};
130
131 return {nullptr, nullptr};
132}
133
134bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
135 LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
136 SmallPtrSet<const Value *, 32> WorkSet;
137 SmallPtrSet<const Value *, 32> Visited;
138 if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
139 if (isGlobalAddr(MO))
141 }
142
143 while (!WorkSet.empty()) {
146 if (!Visited.insert(V).second)
147 continue;
149
151 const auto *M = LD->getPointerOperand();
152 if (isGlobalAddr(M)) {
154 return true;
155 }
156 continue;
157 }
158
160 const auto *P = GEP->getPointerOperand();
162 for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
164 continue;
165 }
166
168 WorkSet.insert(U->getOperand(0));
169 continue;
170 }
171
173 WorkSet.insert(BO->getOperand(0));
174 WorkSet.insert(BO->getOperand(1));
175 continue;
176 }
177
179 WorkSet.insert(S->getFalseValue());
180 WorkSet.insert(S->getTrueValue());
181 continue;
182 }
183
185 WorkSet.insert(E->getVectorOperand());
186 continue;
187 }
188
190 }
191
193 return false;
194}
195
196
197bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
199 if (!Ld)
200 return false;
201 if (!isGlobalAddr(Ld->getPointerOperand()))
202 return false;
203
204 for (const User *Usr : Ld->users()) {
206 if (UsrInst->getParent() == I.getParent())
207 return true;
208 }
209 }
210
211 return false;
212}
213
214AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
215 AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
216
217 LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
218
220 LastAccess = MemAccessInfo();
221 unsigned UsedGlobalLoadsInBB = 0;
223 if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
224 unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
225
226
227 if (isGlobalLoadUsedInBB(I))
228 UsedGlobalLoadsInBB += Size;
229 if (isIndirectAccess(&I))
231 if (isLargeStride(&I))
235 continue;
236 }
239 if (!Callee || Callee->isDeclaration()) {
241 continue;
242 }
243 if (&F == Callee)
244 continue;
245
246 auto Loc = FIM.find(Callee);
247 if (Loc == FIM.end())
248 continue;
249
250 FI.MemInstCost += Loc->second.MemInstCost;
251 FI.InstCost += Loc->second.InstCost;
252 FI.IAMInstCost += Loc->second.IAMInstCost;
253 FI.LSMInstCost += Loc->second.LSMInstCost;
255 TargetLoweringBase::AddrMode AM;
260 GEP->getPointerAddressSpace()))
261
262 continue;
264 } else {
266 }
267 }
268
270 unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
271 if (GlobalMemAccPercentage > 50) {
272 LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
273 << B.getName() << " has " << GlobalMemAccPercentage
274 << "% global memory access\n");
276 }
277 }
278 }
279
280 return &FI;
281}
282
283bool AMDGPUPerfHint::runOnFunction(Function &F) {
284 const Module &M = *F.getParent();
286
287 if (F.hasFnAttribute("amdgpu-wave-limiter") &&
288 F.hasFnAttribute("amdgpu-memory-bound"))
289 return false;
290
291 const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
292
293 LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
294 << '\n'
295 << " IAMInst cost: " << Info->IAMInstCost << '\n'
296 << " LSMInst cost: " << Info->LSMInstCost << '\n'
297 << " TotalInst cost: " << Info->InstCost << '\n');
298
300
301 if (isMemBound(*Info)) {
302 LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
303 NumMemBound++;
304 F.addFnAttr("amdgpu-memory-bound", "true");
306 }
307
309 LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
310 NumLimitWave++;
311 F.addFnAttr("amdgpu-wave-limiter", "true");
313 }
314
316}
317
318bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
319
320
322 return true;
323
325}
326
327bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
330}
331
332bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
334 unsigned As = PT->getAddressSpace();
335
337 }
338 return false;
339}
340
341bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
344 return false;
345}
346
347bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
348 LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
349
350 MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
351 bool IsLargeStride = MAI.isLargeStride(LastAccess);
352 if (MAI.Base)
353 LastAccess = std::move(MAI);
354
355 return IsLargeStride;
356}
357
358AMDGPUPerfHint::MemAccessInfo
359AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
360 MemAccessInfo MAI;
361 const Value *MO = getMemoryInstrPtrAndType(Inst).first;
362
363 LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
364
365 if (isLocalAddr(MO))
366 return MAI;
367
368 MAI.V = MO;
370 return MAI;
371}
372
373bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
374 MemAccessInfo &Reference) const {
375
377 return false;
378
383 << print() << "<=>\n"
384 << Reference.print() << "Result:" << Result << '\n');
386}
387
388class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {
389private:
390
391 AMDGPUPerfHintAnalysis Impl;
392
393public:
394 static char ID;
395
396 AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(ID) {}
397
398 bool runOnSCC(CallGraphSCC &SCC) override;
399
400 void getAnalysisUsage(AnalysisUsage &AU) const override {
402 }
403};
404
405}
406
408 auto FI = FIM.find(F);
409 if (FI == FIM.end())
410 return false;
411
412 return AMDGPUPerfHint::isMemBound(FI->second);
413}
414
416 auto FI = FIM.find(F);
417 if (FI == FIM.end())
418 return false;
419
420 return AMDGPUPerfHint::needLimitWave(FI->second);
421}
422
428 if ( || F->isDeclaration())
429 continue;
430
432 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
433
434 if (Analyzer.runOnFunction(*F))
436 }
437
439}
440
444
446
449 if (SCC.size() != 1)
450 continue;
451 Function &F = SCC.begin()->getFunction();
452
453 if (F.isDeclaration())
454 continue;
455
457 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
458 if (Analyzer.runOnFunction(F))
460 }
461 }
462
464}
465
466char AMDGPUPerfHintAnalysisLegacy::ID = 0;
468
470 "Analysis if a function is memory bound", true, true)
471
472bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {
473 auto *TPC = getAnalysisIfAvailable();
474 if (!TPC)
475 return false;
476
478 return Impl.runOnSCC(TM, SCC);
479}
480
484
488
491 return PA;
492}
static cl::opt< unsigned > LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, cl::desc("Large stride memory access threshold"))
static cl::opt< unsigned > IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, cl::desc("Indirect access memory instruction weight"))
static cl::opt< unsigned > LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, cl::desc("Kernel limit wave threshold in %"))
static cl::opt< unsigned > LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, cl::desc("Large stride memory access weight"))
static cl::opt< unsigned > MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, cl::desc("Function mem bound threshold in %"))
Analyzes whether a function is potentially memory bound and whether a kernel may benefit from limiting the number of waves to reduce cache thrashing.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
static bool runOnFunction(Function &F, bool PostInlining)
Implements a lazy call graph analysis and related passes for the new pass manager.
Machine Check Debug Module
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
void visit(MachineFunction &MF, MachineBasicBlock &Start, std::function< void(MachineBasicBlock *)> op)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
bool isMemoryBound(const Function *F) const
Definition AMDGPUPerfHintAnalysis.cpp:407
bool needsWaveLimiter(const Function *F) const
Definition AMDGPUPerfHintAnalysis.cpp:415
bool run(const GCNTargetMachine &TM, LazyCallGraph &CG)
Definition AMDGPUPerfHintAnalysis.cpp:441
bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC)
Definition AMDGPUPerfHintAnalysis.cpp:423
ValueMap< const Function *, FuncInfo > FuncInfoMap
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
void setPreservesAll()
Set by analyses that do not transform their input at all.
A node in the call graph for a module.
CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.
An analysis pass which computes the call graph for a module.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void buildRefSCCs()
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
iterator find(const KeyT &Val)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
decltype(auto) dyn_cast(const From &Val)
dyn_cast - Return the argument parameter cast to the specified type.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
char & AMDGPUPerfHintAnalysisLegacyID
Definition AMDGPUPerfHintAnalysis.cpp:467
std::unique_ptr< AMDGPUPerfHintAnalysis > Impl
const GCNTargetMachine & TM
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
Definition AMDGPUPerfHintAnalysis.cpp:481
bool HasDenseGlobalMemAcc