LLVM: lib/Target/AMDGPU/AMDGPUSubtarget.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/IR/IntrinsicsR600.h"
29#include
30
31using namespace llvm;
32
33#define DEBUG_TYPE "amdgpu-subtarget"
34
36
40
44
45
46
47
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:49 (getMaxLocalMemSizeWithWaveCount), described there
// as returning the amount of LDS usable without restricting occupancy below
// the given wave count.
// NOTE(review): the function-name line, the definitions of WorkGroupSize,
// WaveSize and NWaves, and the return statement are absent from this view.
48unsigned
// Waves needed by one workgroup: WorkGroupSize / WaveSize rounded up, and
// never less than 1.
53 const unsigned WavesPerWorkgroup =
54 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
55
// Integer-divide NWaves * getEUsPerCU() by the waves each workgroup needs;
// the result is clamped to at least 1 so it is never zero.
56 const unsigned WorkGroupsPerCU =
57 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);
58
60}
61
63 uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
// NOTE(review): the line holding the return type and function name is absent
// from this view; the parameter list looks like the pair-taking overload of
// getOccupancyWithWorkGroupSizes -- confirm. The body returns a two-element
// braced list whose first element is a waves-per-EU value clamped to
// [1, WavesPerEU]; the second element's line is not visible here.
// WaveSize and WavesPerEU are defined on lines missing from this view.
64
65
// How many workgroups fit in local memory given LDSBytes each. An LDSBytes of
// 0 is treated as 1 so the division is always defined.
66 const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
67
68
69
70
71
// Not even one workgroup fits: report {1, 1}.
72 if (!MaxWGsLDS)
73 return {1, 1};
74
76
// Helper: for a workgroup size, yields
//   {waves per workgroup, workgroups per CU, waves per CU}.
// NOTE(review): the statement that defines WGsPerCU is missing from this view.
77 auto PropsFromWGSize = [=](unsigned WGSize)
78 -> std::tuple<const unsigned, const unsigned, unsigned> {
79 unsigned WavesPerWG = divideCeil(WGSize, WaveSize);
81 return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
82 };
83
84
85
86
87
// Evaluate the helper at both ends of the flat workgroup size range. Note the
// naming: results for the *minimum* workgroup size are bound to MinWavesPerWG,
// MaxWGsPerCU and MaxWavesPerCU; results for the *maximum* size to
// MaxWavesPerWG, MinWGsPerCU and MinWavesPerCU.
88 const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
89 auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
90 auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
91
92
93
94
// If the value labelled "min" is not actually below the one labelled "max",
// swap them and skip the refinement below.
95 if (MinWavesPerCU >= MaxWavesPerCU) {
96 std::swap(MinWavesPerCU, MaxWavesPerCU);
97 } else {
// Total wave slots on one CU.
98 const unsigned WaveSlotsPerCU = WavesPerEU * getEUsPerCU();
99
100
101
102
// Candidate lower bound: ceil(WaveSlotsPerCU / (MinWGsPerCU + 1)) waves per
// workgroup, times MinWGsPerCU workgroups.
103 unsigned MinWavesPerCUForWGSize =
104 divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
105 if (MinWavesPerCU > MinWavesPerCUForWGSize) {
106 unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
// Only adjust when the excess amounts to at least one whole wave per
// workgroup (the integer division is non-zero).
107 if (unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
108
109
110
111
112
113
// Lower the minimum by that per-workgroup excess, capped at the spread
// between the largest and smallest waves-per-workgroup values.
114 MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
115 MaxWavesPerWG - MinWavesPerWG);
116 }
117 }
118
119
120
121
// Slots left over after placing MaxWGsPerCU workgroups of MinWavesPerWG waves.
// NOTE(review): unsigned subtraction -- assumes MaxWGsPerCU * MinWavesPerWG
// does not exceed WaveSlotsPerCU; TODO confirm against the WGsPerCU
// definition that is not visible here.
122 unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
123 if (unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
124
125
126
127
128
129
// Raise the maximum by the leftover waves per workgroup, capped at
// ceil(MaxWGSize / WaveSize) - MinWavesPerWG (the ceil is spelled as
// ((MaxWGSize - 1) / WaveSize) + 1).
130 MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
131 ((MaxWGSize - 1) / WaveSize) + 1 -
132 MinWavesPerWG);
133 }
134 }
135
136
137
// Convert the per-CU minimum to per-EU by integer division and clamp it to
// [1, WavesPerEU]. The second element of the returned pair is on a line that
// is missing from this view.
138 return {std::clamp(MinWavesPerCU / getEUsPerCU(), 1U, WavesPerEU),
140}
141
147
148std::pair<unsigned, unsigned>
150 switch (CC) {
158 default:
160 }
161}
162
165
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:163 (getFlatWorkGroupSizes).
// NOTE(review): the signature, the initializer of Default, the call that
// produces Requested, and the bodies of the validation checks are absent
// from this view.
166 std::pair<unsigned, unsigned> Default =
168
169
// Remaining arguments of a call that reads the function's
// "amdgpu-flat-work-group-size" attribute, passing Default as its last
// argument (presumably the fallback -- confirm against the callee).
171 F, "amdgpu-flat-work-group-size", Default);
172
173
// Sanity check: the requested minimum must not exceed the requested maximum.
// The statement executed when this check fires is not visible here.
174 if (Requested.first > Requested.second)
176
177
182
// Reached when none of the (partly invisible) checks above returned early.
183 return Requested;
184}
185
187 std::pair<unsigned, unsigned> RequestedWavesPerEU,
188 std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:186 (getEffectiveWavesPerEU), described there as
// returning the target minimum/maximum number of waves per EU.
// NOTE(review): the name line and the initializer of Default are missing
// from this view.
189
190
191
192
193 std::pair<unsigned, unsigned> Default = {
197
198
199
// Validate the request: the requested minimum must lie inside
// [Default.first, Default.second] and must not exceed the requested maximum.
// NOTE(review): the rest of this condition and the statement it guards are
// not visible here.
200 if (RequestedWavesPerEU.first < Default.first ||
201 RequestedWavesPerEU.first > Default.second ||
202 RequestedWavesPerEU.first > RequestedWavesPerEU.second ||
205
206
// Cap the requested maximum at the default maximum, then return the
// (possibly narrowed) request.
207 RequestedWavesPerEU.second =
208 std::min(RequestedWavesPerEU.second, Default.second);
209 return RequestedWavesPerEU;
210}
211
212std::pair<unsigned, unsigned>
214
216
217 unsigned LDSBytes =
219 true)
220 .first;
221 return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
222}
223
224std::pair<unsigned, unsigned>
226 unsigned LDSBytes, const Function &F) const {
227
229
230
231 std::pair<unsigned, unsigned> Requested =
234}
235
236std::optional
238 unsigned Dim) const {
240 if (Node && Node->getNumOperands() == 3)
242 return std::nullopt;
243}
244
246 const Function &F, bool RequiresUniformYZ) const {
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:245 (hasWavefrontsEvenlySplittingXDim).
// Looks at the function's "reqd_work_group_size" metadata and returns true
// when the workgroup is one-dimensional or IsXLargeEnough holds.
247 auto *Node = F.getMetadata("reqd_work_group_size");
// Bail out unless the metadata node has exactly three operands.
// NOTE(review): the left operand of `||` was lost in this view.
248 if ( || Node->getNumOperands() != 3)
249 return false;
// NOTE(review): the initializers of XLen, YLen and ZLen are on lines missing
// from this view; presumably the three metadata operands -- confirm.
250 unsigned XLen =
252 unsigned YLen =
254 unsigned ZLen =
256
// The workgroup is effectively 1-D when both Y and Z lengths are at most 1.
257 bool Is1D = YLen <= 1 && ZLen <= 1;
// NOTE(review): the expression for IsXLargeEnough is not visible here, so
// how XLen and RequiresUniformYZ are used cannot be confirmed from this view.
258 bool IsXLargeEnough =
260 return Is1D || IsXLargeEnough;
261}
262
266
268 unsigned Dimension) const {
270 if (ReqdSize)
271 return *ReqdSize - 1;
273}
274
276 for (int I = 0; I < 3; ++I) {
278 return false;
279 }
280
281 return true;
282}
283
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:284 (makeLIDRangeMetadata), described there as creating
// value range metadata on a workitemid.* intrinsic call or load.
// Returns false when MaxSize is zero, otherwise attaches range information to
// the instruction and returns true.
// NOTE(review): the signature, the declaration of MaxSize, the cast that
// produces CI, the lookup that produces ReqdSize, and the construction of
// Range / MaxWorkGroupSizeRange are on lines missing from this view.
285 Function *Kernel = I->getFunction();
286 unsigned MinSize = 0;
// Set when the instruction queries a workitem ID rather than a local size.
288 bool IdQuery = false;
289
290
292 const Function *F = CI->getCalledFunction();
293 if (F) {
// UINT_MAX marks "no dimension recognised".
294 unsigned Dim = UINT_MAX;
// Map the called intrinsic to a dimension index (x=0, y=1, z=2). The
// workitem-id / tidig cases also set IdQuery before falling through to the
// matching local-size case.
295 switch (F->getIntrinsicID()) {
296 case Intrinsic::amdgcn_workitem_id_x:
297 case Intrinsic::r600_read_tidig_x:
298 IdQuery = true;
299 [[fallthrough]];
300 case Intrinsic::r600_read_local_size_x:
301 Dim = 0;
302 break;
303 case Intrinsic::amdgcn_workitem_id_y:
304 case Intrinsic::r600_read_tidig_y:
305 IdQuery = true;
306 [[fallthrough]];
307 case Intrinsic::r600_read_local_size_y:
308 Dim = 1;
309 break;
310 case Intrinsic::amdgcn_workitem_id_z:
311 case Intrinsic::r600_read_tidig_z:
312 IdQuery = true;
313 [[fallthrough]];
314 case Intrinsic::r600_read_local_size_z:
315 Dim = 2;
316 break;
317 default:
318 break;
319 }
320
// NOTE(review): the switch only ever assigns 0, 1 or 2 (or leaves UINT_MAX),
// so `<= 3` behaves like `<= 2` here; the bound still reads as off by one --
// confirm it is intentional.
321 if (Dim <= 3) {
// When a size is available for this dimension, both bounds collapse to it.
323 if (ReqdSize)
324 MinSize = MaxSize = *ReqdSize;
325 }
326 }
327 }
328
// Nothing useful can be attached without a non-zero maximum.
329 if (!MaxSize)
330 return false;
331
332
333
// ID queries start at 0. For size queries MaxSize is bumped by one --
// presumably because the range's upper bound is exclusive (this page's
// reference list documents createRange as [Lo, Hi)); confirm.
334 if (IdQuery)
335 MinSize = 0;
336 else
337 ++MaxSize;
338
// Two attachment paths: a range return attribute on the call (CI), or
// MD_range metadata on the instruction. The condition choosing between them
// is not visible in this view.
343 CI->addRangeRetAttr(Range);
344 } else {
347 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
348 }
349 return true;
350}
351
354
355
356
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:352 (getImplicitArgNumBytes).
// A function carrying the "amdgpu-no-implicitarg-ptr" attribute has no
// implicit argument bytes.
357 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
358 return 0;
359
// NOTE(review): the condition guarding this fixed 16-byte result is on a line
// missing from this view.
361 return 16;
362
363
// Otherwise compute a default NBytes (its initializer is not visible here;
// it is declared right after fetching the parent module) and let the
// "amdgpu-implicitarg-num-bytes" function attribute supply the value, with
// NBytes passed as the second argument.
364 const Module *M = F.getParent();
365 unsigned NBytes =
367 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
368 NBytes);
369}
370
372 Align &MaxAlign) const {
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:371 (getExplicitKernArgSize).
// Lays out the function's non-hidden arguments one after another and returns
// the total number of bytes; MaxAlign is an out-parameter that receives the
// largest alignment seen (reset to 1 first).
// NOTE(review): the name line and the definition of DL are missing from this
// view.
375
377 uint64_t ExplicitArgBytes = 0;
378 MaxAlign = Align(1);
379
380 for (const Argument &Arg : F.args()) {
// Arguments tagged "amdgpu-hidden-argument" do not count as explicit.
381 if (Arg.hasAttribute("amdgpu-hidden-argument"))
382 continue;
383
// For byref arguments, size and alignment come from the byref type and the
// parameter's alignment attribute; otherwise from the argument's own type
// with no explicit alignment override.
384 const bool IsByRef = Arg.hasByRefAttr();
385 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
386 Align Alignment = DL.getValueOrABITypeAlignment(
387 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
388 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
// Round the running offset up to this argument's alignment, then append it.
389 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
390 MaxAlign = std::max(MaxAlign, Alignment);
391 }
392
393 return ExplicitArgBytes;
394}
395
397 Align &MaxAlign) const {
// Fragment of the definition that this page's reference list places at
// AMDGPUSubtarget.cpp:396 (getKernArgSegmentSize).
// Combines the explicit argument bytes with any implicit argument bytes and
// returns the total rounded up to a multiple of 4. MaxAlign may be raised to
// the implicit-argument alignment.
// NOTE(review): the name line, the condition guarding the early `return 0`,
// and the definitions of ExplicitArgBytes, ExplicitOffset, ImplicitBytes and
// Alignment are missing from this view.
400 return 0;
401
403
405
406 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
408 if (ImplicitBytes != 0) {
// NOTE(review): this overwrites TotalSize using ExplicitArgBytes alone, so
// the ExplicitOffset added on the line above is dropped whenever implicit
// bytes are present. Confirm this is intended (e.g. the offset is always 0
// in that configuration) rather than a missing `ExplicitOffset +` term.
410 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
411 MaxAlign = std::max(MaxAlign, Alignment);
412 }
413
414
// Round the segment size up to a 4-byte multiple.
415 return alignTo(TotalSize, 4);
416}
417
422
428
435
436
440 std::numeric_limits<uint32_t>::max());
441}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file describes how to lower LLVM inline asm to machine code INLINEASM.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
AMDGPU R600 specific subclass of TargetSubtarget.
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
Definition AMDGPUSubtarget.cpp:149
bool EnableRealTrue16Insts
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
Definition AMDGPUSubtarget.cpp:237
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool EnableD16Writes32BitVgpr
bool isMesaKernel(const Function &F) const
Definition AMDGPUSubtarget.cpp:263
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
Definition AMDGPUSubtarget.cpp:213
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
Definition AMDGPUSubtarget.cpp:37
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
Definition AMDGPUSubtarget.cpp:163
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on a workitemid.* intrinsic call or load.
Definition AMDGPUSubtarget.cpp:284
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
Definition AMDGPUSubtarget.cpp:267
unsigned getImplicitArgNumBytes(const Function &F) const
Definition AMDGPUSubtarget.cpp:352
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
Definition AMDGPUSubtarget.cpp:438
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
Definition AMDGPUSubtarget.cpp:396
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
Definition AMDGPUSubtarget.cpp:418
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
Definition AMDGPUSubtarget.cpp:49
virtual unsigned getMaxFlatWorkGroupSize() const =0
AMDGPUSubtarget(Triple TT)
Definition AMDGPUSubtarget.cpp:35
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
Definition AMDGPUSubtarget.cpp:245
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
Definition AMDGPUSubtarget.cpp:371
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
Definition AMDGPUSubtarget.cpp:275
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Definition AMDGPUSubtarget.cpp:423
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > RequestedWavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes, unsigned LDSBytes) const
Returns the target minimum/maximum number of waves per EU.
Definition AMDGPUSubtarget.cpp:186
bool hasD16Writes32BitVgpr() const
Definition AMDGPUSubtarget.cpp:41
Class for arbitrary precision integers.
This class represents an incoming formal argument to a Function.
This class represents a range of values.
A parsed version of the target data layout string, and methods for querying it.
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
A Module instance is used to store all the information related to an LLVM module.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Triple - Helper class for working with autoconf configuration names.
bool isAMDGCN() const
Tests whether the target is AMDGCN.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast - Return the argument parameter cast to the specified type.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
@ Default
The result values are uniform if and only if all operands are uniform.
Implement std::hash so that hash_code can be used in STL containers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.