LLVM: lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp Source File
//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass marks kernel arguments that can be preloaded into user
/// SGPRs with the "inreg" attribute and, when every explicit argument is
/// preloadable, rewrites loads of hidden (implicit) arguments so that they
/// can be preloaded as well.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;

static cl::opt<unsigned> KernargPreloadCount(
    "amdgpu-kernarg-preload-count",
    cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

static cl::opt<bool> EnableKernargPreload(
    "amdgpu-kernarg-preload",
    cl::desc("Enable preload kernel arguments to SGPRs"), cl::init(true));
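
// Usage sketch: both options are ordinary cl::opt flags, so on a
// kernarg-preload-capable target they can be passed to tools that accept
// backend options, e.g. (hypothetical invocation and input file):
//   llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
//       -amdgpu-kernarg-preload-count=4 kernel.ll
// With the default count of 0, only arguments already carrying "inreg" are
// considered for preloading; a nonzero count requests that many leading
// explicit arguments.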

namespace {

class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores the offset, size (in bytes), and name of each hidden kernel
  // argument that may be preloaded.
  struct HiddenArgInfo {
    // Offset in bytes from the location of the implicitarg pointer to the
    // hidden argument.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return HiddenArgs[HA].Name;

    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit (hidden) arguments to its
  // argument list and returns the new function. Hidden arguments are appended
  // up to and including the one at LastPreloadIndex; because preloading always
  // covers a contiguous prefix of the kernarg segment, every hidden argument
  // before the last preloaded one is added even if it is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getContext();
    SmallVector<Type *, 16> FTypes(FT->params().begin(), FT->params().end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);

    return NF;
  }
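
  // Illustrative sketch, assuming a hypothetical kernel @k: for
  //   define amdgpu_kernel void @k(i32 inreg %n)
  // cloning with LastPreloadIndex = HIDDEN_BLOCK_COUNT_Y produces a signature
  // along the lines of
  //   define amdgpu_kernel void @k(i32 inreg %n,
  //       i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_x,
  //       i32 inreg "amdgpu-hidden-argument" %_hidden_block_count_y)
  // The caller (tryAllocHiddenArgPreloadSGPRs) then replaces the original
  // loads through the implicitarg pointer with these new formal arguments.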

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Set the number of user SGPRs that are still free for preloading kernel
  // arguments on this subtarget.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
    return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
  }
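
  // Worked example: each user SGPR holds 4 bytes of kernarg data, so with,
  // say, 14 free user SGPRs an argument can be preloaded only if it ends at
  // or before byte offset 56.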

  // Try to allocate SGPRs to preload hidden kernel arguments.
  void
  tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                SmallVectorImpl<Function *> &FunctionsToErase) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getFunction() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        LLVMContext &Ctx = F.getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset so that we can check whether each
    // implicit argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If one implicit argument cannot be preloaded, none of the later ones
    // (with larger offsets) can be either. Find the first argument that we
    // cannot preload.
    auto *PreloadEnd = find_if(
        ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
                                         ImplicitArgsBaseOffset))
            return true;

          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    FunctionsToErase.push_back(&F);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
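
  // Illustrative IR sketch of the pattern recognized above: a simple load of
  // a known hidden-argument type at a constant offset from the implicit
  // argument pointer, e.g.
  //   %imparg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  //   %gep    = getelementptr i8, ptr addrspace(4) %imparg, i64 12
  //   %gsx    = load i16, ptr addrspace(4) %gep  ; offset 12, i16 ->
  //                                              ; HIDDEN_GROUP_SIZE_X
  // Offsets and types must match the HiddenArgs table exactly for the load
  // to be queued for rewriting.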
};

} // end anonymous namespace

char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)

ModulePass *
llvm::createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *TM) {
  return new AMDGPUPreloadKernelArgumentsLegacy(
      static_cast<const GCNTargetMachine *>(TM));
}

AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}

static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
  if (!EnableKernargPreload)
    return false;

  SmallVector<Function *, 4> FunctionsToErase;
  bool Changed = false;
  for (auto &F : M) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    if (!ST.hasKernargPreload() ||
        F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
      continue;

    PreloadKernelArgInfo PreloadInfo(F, ST);
    uint64_t ExplicitArgOffset = 0;
    const DataLayout &DL = F.getParent()->getDataLayout();
    const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
    unsigned NumPreloadsRequested = KernargPreloadCount;
    unsigned NumPreloadedExplicitArgs = 0;
    for (Argument &Arg : F.args()) {
      // Stop at incompatible attributes; the "amdgpu-hidden-argument" check
      // also guards against running this pass twice on the same function.
      if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
          Arg.hasAttribute("amdgpu-hidden-argument"))
        break;

      // If no preload count was requested, only continue for arguments that
      // are already marked inreg.
      if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
        break;

      // Aggregates are not preloaded.
      if (Arg.getType()->isAggregateType())
        break;

      Type *ArgTy = Arg.getType();
      Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
      uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

      if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
        break;

      Arg.addAttr(Attribute::InReg);
      NumPreloadedExplicitArgs++;
      if (NumPreloadsRequested > 0)
        NumPreloadsRequested--;
    }

    // Only try preloading hidden arguments if every explicit argument was
    // preloaded.
    if (NumPreloadedExplicitArgs == F.arg_size()) {
      uint64_t ImplicitArgsBaseOffset =
          alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
          BaseOffset;
      PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                FunctionsToErase);
    }

    Changed |= NumPreloadedExplicitArgs > 0;
  }

  // Erase the original functions that were replaced by clones with preloaded
  // hidden arguments.
  for (auto *F : FunctionsToErase)
    F->eraseFromParent();

  return Changed;
}
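
// Illustrative sketch, assuming a hypothetical kernel @k on a subtarget with
// kernarg preloading: given
//   define amdgpu_kernel void @k(ptr %out, i32 %n) { ... }
// a run with -amdgpu-kernarg-preload-count=2 marks both explicit arguments:
//   define amdgpu_kernel void @k(ptr inreg %out, i32 inreg %n) { ... }
// Because every explicit argument was preloaded, recognized loads of hidden
// arguments in @k may additionally be rewritten into preloaded
// "amdgpu-hidden-argument" parameters by tryAllocHiddenArgPreloadSGPRs.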

bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
  if (skipModule(M) || !TM)
    return false;

  return markKernelArgsAsInreg(M, *TM);
}

PreservedAnalyses
AMDGPUPreloadKernelArgumentsPass::run(Module &M, ModuleAnalysisManager &AM) {
  bool Changed = markKernelArgsAsInreg(M, TM);
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
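
// Usage sketch, assuming the pass is registered under its DEBUG_TYPE name in
// AMDGPUPassRegistry.def (hypothetical invocation and input file): the new-PM
// pass can be exercised in isolation with something like
//   opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 \
//       -passes=amdgpu-preload-kernel-arguments -S kernel.ll
// while the legacy pass is created via
// createAMDGPUPreloadKernelArgumentsLegacyPass() when the codegen pipeline is
// built.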