LLVM: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp Source File

//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };

  // Stores information about a specific hidden argument.

  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};
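
  // The table above mirrors the leading entries of the AMDGPU hidden
  // (implicit) kernel-argument block: three i32 block counts, then i16 group
  // sizes and remainders, at byte offsets relative to
  // llvm.amdgcn.implicitarg.ptr.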

  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }
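
  // e.g. getHiddenArgFromOffset(12) yields HIDDEN_GROUP_SIZE_X; an offset that
  // does not start a table entry (such as 13) yields END_HIDDEN_ARGS.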

  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }

  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }

  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently preloading is only performed on the
  // totality of sequential data from the kernarg segment including implicit
  // (hidden) arguments. This means that all arguments up to the last preloaded
  // argument will also be preloaded even if that data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);
    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);
    F.setCallingConv(CallingConv::C);

    return NF;
  }
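
  // For example, with LastPreloadIndex == HIDDEN_BLOCK_COUNT_Y, a kernel
  //   void @k(i32 inreg %x)
  // is rebuilt (sketch) as
  //   void @k(i32 inreg %x, i32 inreg %_hidden_block_count_x,
  //           i32 inreg %_hidden_block_count_y)
  // with the "amdgpu-hidden-argument" attribute on the appended parameters.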

public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Returns the maximum number of user SGPRs that we have available to preload
  // arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }

  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    //  Check if this argument may be loaded into the same register as the
    //  previous argument.
    if (ArgOffset - LastExplicitArgOffset < 4 &&
        !isAligned(Align(4), ArgOffset))
      return true;

    // Pad SGPRs for kernarg alignment.
    ArgOffset = alignDown(ArgOffset, 4);
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }
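
  // Accounting example: an 8-byte argument at ArgOffset 40 with
  // LastExplicitArgOffset 36 costs alignTo(4, 4) / 4 == 1 padding SGPR plus
  // alignTo(8, 4) / 4 == 2 preload SGPRs, so 3 SGPRs are deducted from
  // NumFreeUserSGPRs.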

  // Try to allocate SGPRs to preload implicit kernel arguments.
  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                       uint64_t LastExplicitArgOffset,
                                       IRBuilder<> &Builder) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;

    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = std::find_if(
        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
        [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!tryAllocPreloadSGPRs(LoadSize,
                                    LoadOffset + ImplicitArgsBaseOffset,
                                    LastExplicitArgOffset))
            return true;

          LastExplicitArgOffset =
              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;

    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    assert(NF);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *LoadInst = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
      Argument *Arg = NF->getArg(Index);
      LoadInst->replaceAllUsesWith(Arg);
    }
  }
};

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getParent()->getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    // Guard against the situation where hidden arguments have already been
    // lowered and added to the kernel function signature, i.e. in a situation
    // where this pass has run twice.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      break;

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;

347

348

349

350 if (IsByRef) {

352 Builder.getInt8Ty(), KernArgSegment, EltOffset,

353 Arg.getName() + ".byval.kernarg.offset");

354

355 Value *CastOffsetPtr =

358 continue;

359 }

360

361 if (PointerType *PT = dyn_cast(ArgTy)) {

362

363

364

365

368 !ST.hasUsableDSOffset())

369 continue;

370

371

372

373 if (Arg.hasNoAliasAttr())

374 continue;

375 }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) {
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }
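
    // e.g. an i16 argument at EltOffset 2 is loaded as an i32 from offset 0;
    // the wanted bits are recovered below by shifting the load right by
    // OffsetDiff * 8 == 16 bits and truncating back to i16.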

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);

    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  if (InPreloadSequence) {
    uint64_t ImplicitArgsBaseOffset =
        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
        BaseOffset;
    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                ExplicitArgOffset, Builder);
  }

  return true;
}
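
// Sketch of the rewrite on a trivial kernel (offsets assume a target where
// getExplicitKernelArgOffset() is 0; N stands for TotalKernArgSize):
//
//   define amdgpu_kernel void @k(i32 %x) { ...uses of %x... }
//
// becomes, conceptually:
//
//   %k.kernarg.segment = call nonnull dereferenceable(N) ptr addrspace(4)
//       @llvm.amdgcn.kernarg.segment.ptr()
//   %x.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4)
//       %k.kernarg.segment, i64 0
//   %x.load = load i32, ptr addrspace(4) %x.kernarg.offset, align 16,
//       !invariant.load !0
//
// and every use of %x is rewritten to use %x.load.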

bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)

char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}

PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = lowerKernelArguments(F, TM);
  if (Changed) {
    // TODO: Preserves a lot more.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}
