AMDGPUSubtarget.cpp Source File (original) (raw)

26#include "llvm/IR/IntrinsicsAMDGPU.h"

27#include "llvm/IR/IntrinsicsR600.h"

29#include

31using namespace llvm;

33#define DEBUG_TYPE "amdgpu-subtarget"

39}

44unsigned

49 const unsigned WavesPerWorkgroup =

50 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

52 const unsigned WorkGroupsPerCU =

53 std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

56}

68 if (!MaxWorkGroupsPerCu)

69 return 0;

80 if (NumGroups == 0)

81 return 1;

83 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

86 const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);

87 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

99 "computed invalid occupancy");

100 return MaxWaves;

101}

102

103unsigned

107}

108

109std::pair<unsigned, unsigned>

111 switch (CC) {

119 default:

121 }

122}

123

126

127 std::pair<unsigned, unsigned> Default =

129

130

132 F, "amdgpu-flat-work-group-size", Default);

133

134

135 if (Requested.first > Requested.second)

137

138

143

144 return Requested;

145}

146

148 std::pair<unsigned, unsigned> Requested,

149 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

150

152

153

154

155

156

157 unsigned MinImpliedByFlatWorkGroupSize =

159 Default.first = MinImpliedByFlatWorkGroupSize;

160

161

162 if (Requested.second && Requested.first > Requested.second)

164

165

169

170

171

172 if (Requested.first < MinImpliedByFlatWorkGroupSize)

174

175 return Requested;

176}

177

179 const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {

180

182

183

184 std::pair<unsigned, unsigned> Requested =

187}

188

191 if (Node && Node->getNumOperands() == 3)

192 return mdconst::extract(Node->getOperand(Dim))->getZExtValue();

193 return std::numeric_limits::max();

194}

195

198}

199

201 unsigned Dimension) const {

203 if (ReqdSize != std::numeric_limits::max())

204 return ReqdSize - 1;

206}

207

209 for (int I = 0; I < 3; ++I) {

211 return false;

212 }

213

214 return true;

215}

216

218 Function *Kernel = I->getParent()->getParent();

219 unsigned MinSize = 0;

221 bool IdQuery = false;

222

223

224 if (auto *CI = dyn_cast(I)) {

225 const Function *F = CI->getCalledFunction();

226 if (F) {

227 unsigned Dim = UINT_MAX;

228 switch (F->getIntrinsicID()) {

229 case Intrinsic::amdgcn_workitem_id_x:

230 case Intrinsic::r600_read_tidig_x:

231 IdQuery = true;

232 [[fallthrough]];

233 case Intrinsic::r600_read_local_size_x:

234 Dim = 0;

235 break;

236 case Intrinsic::amdgcn_workitem_id_y:

237 case Intrinsic::r600_read_tidig_y:

238 IdQuery = true;

239 [[fallthrough]];

240 case Intrinsic::r600_read_local_size_y:

241 Dim = 1;

242 break;

243 case Intrinsic::amdgcn_workitem_id_z:

244 case Intrinsic::r600_read_tidig_z:

245 IdQuery = true;

246 [[fallthrough]];

247 case Intrinsic::r600_read_local_size_z:

248 Dim = 2;

249 break;

250 default:

251 break;

252 }

253

254 if (Dim <= 3) {

256 if (ReqdSize != std::numeric_limits::max())

257 MinSize = MaxSize = ReqdSize;

258 }

259 }

260 }

261

262 if (!MaxSize)

263 return false;

264

265

266

267 if (IdQuery)

268 MinSize = 0;

269 else

270 ++MaxSize;

271

274 if (auto *CI = dyn_cast(I)) {

276 CI->addRangeRetAttr(Range);

277 } else {

280 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

281 }

282 return true;

283}

284

287

288

289

290 if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))

291 return 0;

292

294 return 16;

295

296

297 const Module *M = F.getParent();

298 unsigned NBytes =

300 return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",

301 NBytes);

302}

303

305 Align &MaxAlign) const {

308

310 uint64_t ExplicitArgBytes = 0;

311 MaxAlign = Align(1);

312

313 for (const Argument &Arg : F.args()) {

314 if (Arg.hasAttribute("amdgpu-hidden-argument"))

315 continue;

316

317 const bool IsByRef = Arg.hasByRefAttr();

318 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();

319 Align Alignment = DL.getValueOrABITypeAlignment(

320 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);

321 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

322 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;

323 MaxAlign = std::max(MaxAlign, Alignment);

324 }

325

326 return ExplicitArgBytes;

327}

328

330 Align &MaxAlign) const {

333 return 0;

334

336

338

339 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;

341 if (ImplicitBytes != 0) {

343 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;

344 MaxAlign = std::max(MaxAlign, Alignment);

345 }

346

347

348 return alignTo(TotalSize, 4);

349}

350

354}

355

360}

361

363 if (TM.getTargetTriple().getArch() == Triple::amdgcn)

367}

368

369

373 std::numeric_limits<uint32_t>::max());

374}

This file describes how to lower LLVM calls to machine code calls.

This file declares the targeting of the InstructionSelector class for AMDGPU.

This file declares the targeting of the MachineLegalizer class for AMDGPU.

This file declares the targeting of the RegisterBankInfo class for AMDGPU.

static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)

Base class for AMDGPU specific classes of TargetSubtarget.

MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL

This file describes how to lower LLVM inline asm to machine code INLINEASM.

ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

AMDGPU R600 specific subclass of TargetSubtarget.

assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const

Inverse of getMaxLocalMemSizeWithWaveCount.

std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const

bool EnableRealTrue16Insts

Align getAlignmentForImplicitArgPtr() const

unsigned getEUsPerCU() const

Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto which workgroups are mapped.

bool isMesaKernel(const Function &F) const

std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const

bool useRealTrue16Insts() const

Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-generated.

virtual unsigned getMinWavesPerEU() const =0

std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const

bool makeLIDRangeMetadata(Instruction *I) const

Creates value range metadata on a workitemid.* intrinsic call or load.

unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const

Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.

unsigned getImplicitArgNumBytes(const Function &F) const

unsigned getLocalMemorySize() const

Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.

SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const

Return the number of work groups for the function.

virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0

virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0

unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const

bool hasTrue16BitInsts() const

Return true if the subtarget supports True16 instructions.

AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const

unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const

Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.

virtual unsigned getMaxFlatWorkGroupSize() const =0

AMDGPUSubtarget(Triple TT)

unsigned getExplicitKernelArgOffset() const

Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.

unsigned getMaxWavesPerEU() const

uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const

std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const

bool isSingleLaneExecution(const Function &Kernel) const

Return true if only a single workitem can be active in a wave.

static const AMDGPUSubtarget & get(const MachineFunction &MF)

unsigned getWavefrontSize() const

virtual unsigned getMinFlatWorkGroupSize() const =0

Class for arbitrary precision integers.

This class represents an incoming formal argument to a Function.

This class represents a range of values.

A parsed version of the target data layout string, and methods for querying it.

MDNode * getMetadata(unsigned KindID) const

Get the current metadata attachments for the given kind, if any.

MDNode * createRange(const APInt &Lo, const APInt &Hi)

Return metadata describing the range [Lo, Hi).

const TargetSubtargetInfo & getSubtarget() const

getSubtarget - Return the subtarget for which this machine code is being compiled.

Function & getFunction()

Return the LLVM function that this machine code represents.

Ty * getInfo()

getInfo - Keep track of various per-function pieces of information for backends that would like to do so.

const TargetMachine & getTarget() const

getTarget - Return the target machine this machine code is compiled with

A Module instance is used to store all the information related to an LLVM module.

This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

Primary interface to the complete machine description for the target machine.

const Triple & getTargetTriple() const

Triple - Helper class for working with autoconf configuration names.

ArchType getArch() const

Get the parsed architecture type of this triple.

The instances of the Type class are immutable: once they are created, they are never changed.

LLVM_READNONE bool isKernel(CallingConv::ID CC)

unsigned getAMDHSACodeObjectVersion(const Module &M)

bool isShader(CallingConv::ID cc)

SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)

std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)

@ AMDGPU_VS

Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tessellation and geometry are not in use, or otherwise copy shader if one is needed).

@ AMDGPU_KERNEL

Used for AMDGPU code object kernels.

@ AMDGPU_HS

Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).

@ AMDGPU_GS

Used for Mesa/AMDPAL geometry shaders.

@ AMDGPU_PS

Used for Mesa/AMDPAL pixel shaders.

@ SPIR_KERNEL

Used for SPIR kernel functions.

@ AMDGPU_ES

Used for AMDPAL shader stage before geometry shader if geometry is in use.

@ AMDGPU_LS

Used for AMDPAL vertex shader if tessellation is in use.

This is an optimization pass for GlobalISel generic memory operations.

constexpr T divideCeil(U Numerator, V Denominator)

Returns the integer ceil(Numerator / Denominator).

uint64_t alignTo(uint64_t Size, Align A)

Returns a multiple of A needed to store Size bytes.

OutputIt move(R &&Range, OutputIt Out)

Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.

@ Default

The result values are uniform if and only if all operands are uniform.

Implement std::hash so that hash_code can be used in STL containers.

This struct is a compact representation of a valid (non-zero power of two) alignment.