LLVM: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp Source File
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};
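  // Example: per the table above, an i32 load at implicitarg offset 8 matches
  // HIDDEN_BLOCK_COUNT_Z and an i16 load at offset 14 matches
  // HIDDEN_GROUP_SIZE_Y, while an offset that starts no entry (e.g. 10)
  // yields END_HIDDEN_ARGS below and is not preloaded.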

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently preloading is only performed on the
  // totality of sequential data from the kernarg segment including implicit
  // (hidden) arguments. This means that all arguments up to the last preloaded
  // argument will also be preloaded even if that data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF = Function::Create(NFT, F.getLinkage(), F.getAddressSpace(),
                                    F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);
    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);
    F.setCallingConv(CallingConv::C);

    return NF;
  }
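  // For instance, preloading through HIDDEN_GROUP_SIZE_X (LastPreloadIndex 3)
  // appends four parameters to the cloned kernel: i32 %_hidden_block_count_x,
  // i32 %_hidden_block_count_y, i32 %_hidden_block_count_z, and
  // i16 %_hidden_group_size_x, each marked inreg "amdgpu-hidden-argument".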

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Returns the maximum number of user SGPRs that we have available to preload
  // arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // Check if this argument may be loaded into the same register as the
    // previous argument.
    if (ArgOffset - LastExplicitArgOffset < 4 &&
        !isAligned(Align(4), ArgOffset))
      return true;

    // Pad SGPRs for kernarg alignment.
    ArgOffset = alignDown(ArgOffset, 4);
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }
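  // Worked example of the accounting above: an 8-byte argument at ArgOffset 40
  // with LastExplicitArgOffset 36 needs Padding = 4 bytes, so PaddingSGPRs = 1
  // and NumPreloadSGPRs = 2; the allocation succeeds only if at least 3 user
  // SGPRs are still free.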

  // Try to allocate SGPRs to preload implicit kernel arguments.
  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                       uint64_t LastExplicitArgOffset,
                                       IRBuilder<> &Builder) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand handling to support 64-bit hidden arguments.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = std::find_if(
        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
        [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!tryAllocPreloadSGPRs(LoadSize,
                                    LoadOffset + ImplicitArgsBaseOffset,
                                    LastExplicitArgOffset))
            return true;

          LastExplicitArgOffset =
              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *Load = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      Load->replaceAllUsesWith(Arg);
    }
  }
};

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    // Guard against the situation where hidden arguments have already been
    // lowered and added to the kernel function signature, i.e. in a situation
    // where this pass has run twice.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      break;

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;

    // If this is byval, the loads are already explicit in the function. We
    // just need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI, which relies on knowing the high bits of the pointer are
      // zero.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }
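
    // Worked example of DoShiftOpt: an i16 argument at EltOffset 38 is loaded
    // as an aligned i32 from AlignDownOffset 36 (OffsetDiff = 2); the
    // replacement code at the bottom of this loop then shifts right by 16 bits
    // and truncates back to i16.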

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  if (InPreloadSequence) {
    uint64_t ImplicitArgsBaseOffset =
        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
        BaseOffset;
    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                ExplicitArgOffset, Builder);
  }

  return true;
}

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}

PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = lowerKernelArguments(F, TM);
  if (Changed) {
    // TODO: Preserves a lot more.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}
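
// A hand-written sketch (not compiler output) of the rewrite this pass
// performs on a trivial kernel: given
//
//   define amdgpu_kernel void @k(i32 %x) { ... uses of %x ... }
//
// the pass produces, roughly,
//
//   define amdgpu_kernel void @k(i32 %x) {
//     %k.kernarg.segment = call nonnull align 16
//         dereferenceable(<TotalKernArgSize>) ptr addrspace(4)
//         @llvm.amdgcn.kernarg.segment.ptr()
//     %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4)
//         %k.kernarg.segment, i64 0
//     %x.load = load i32, ptr addrspace(4) %x.kernarg.offset, align 16,
//         !invariant.load !0
//     ... uses of %x replaced with %x.load ...
//   }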