LLVM: lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

26#include "llvm/IR/IntrinsicsAMDGPU.h"

31

32#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

33

34using namespace llvm;

35

37 "amdgpu-kernarg-preload-count",

38 cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));

39

42 cl::desc("Enable preload kernel arguments to SGPRs"),

44

45namespace {

46

47class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {

49

50public:

51 static char ID;

52 explicit AMDGPUPreloadKernelArgumentsLegacy(

54

// Human-readable name reported for this pass by the legacy pass manager.
55 StringRef getPassName() const override {

56 return "AMDGPU Preload Kernel Arguments";

57 }

58

59 bool runOnModule(Module &M) override;

60};

61

62class PreloadKernelArgInfo {

63private:

65 const GCNSubtarget &ST;

66 unsigned NumFreeUserSGPRs;

67

// Hidden (implicit) kernel arguments that are candidates for preloading
// into user SGPRs. Enumerator values index the HiddenArgs table, which
// records each argument's byte offset, size, and name.
68 enum HiddenArg : unsigned {

69 HIDDEN_BLOCK_COUNT_X,

70 HIDDEN_BLOCK_COUNT_Y,

71 HIDDEN_BLOCK_COUNT_Z,

72 HIDDEN_GROUP_SIZE_X,

73 HIDDEN_GROUP_SIZE_Y,

74 HIDDEN_GROUP_SIZE_Z,

75 HIDDEN_REMAINDER_X,

76 HIDDEN_REMAINDER_Y,

77 HIDDEN_REMAINDER_Z,

// Sentinel: count of preloadable hidden arguments; also returned by
// getHiddenArgFromOffset as a "no match" marker.
78 END_HIDDEN_ARGS

79 };

80

81

// Describes one preloadable hidden argument: where it lives, how big it is,
// and what to call the function argument created for it.
82 struct HiddenArgInfo {

83

84

// Byte offset of this hidden argument (presumably relative to the implicit
// argument base pointer — TODO confirm against the full source).
85 uint8_t Offset;

86

// Size in bytes; getHiddenArgType builds an iN integer type from Size * 8.
87 uint8_t Size;

88

// Name given to the corresponding cloned-function argument.
89 const char *Name;

90 };

91

// Offset/size/name table indexed by HiddenArg: three 4-byte block counts at
// offsets 0/4/8, then six 2-byte group-size and remainder values packed
// contiguously from offset 12 through 23. Must stay in sync with the
// HiddenArg enumerator order.
92 static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {

93 {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},

94 {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},

95 {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},

96 {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},

97 {22, 2, "_hidden_remainder_z"}};

98

99 static HiddenArg getHiddenArgFromOffset(unsigned Offset) {

100 for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)

102 return static_cast<HiddenArg>(I);

103

104 return END_HIDDEN_ARGS;

105 }

106

107 static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {

108 if (HA < END_HIDDEN_ARGS)

109 return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

110

112 }

113

114 static const char *getHiddenArgName(HiddenArg HA) {

115 if (HA < END_HIDDEN_ARGS)

116 return HiddenArgs[HA].Name;

117

119 }

120

121

122

123

124

125

126

127

128 Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {

129 FunctionType *FT = F.getFunctionType();

130 LLVMContext &Ctx = F.getContext();

132 for (unsigned I = 0; I <= LastPreloadIndex; ++I)

133 FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

134

135 FunctionType *NFT =

136 FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());

138 Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

139

142

143 F.getParent()->getFunctionList().insert(F.getIterator(), NF);

146

148 for (Argument &Arg : F.args()) {

149 Arg.replaceAllUsesWith(&*NFArg);

151 ++NFArg;

152 }

153

154 AttrBuilder AB(Ctx);

155 AB.addAttribute(Attribute::InReg);

156 AB.addAttribute("amdgpu-hidden-argument");

158 for (unsigned I = 0; I <= LastPreloadIndex; ++I) {

159 AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);

160 NFArg++->setName(getHiddenArgName(HiddenArg(I)));

161 }

162

164 F.replaceAllUsesWith(NF);

165

166 return NF;

167 }

168

169public:

// Binds this helper to a kernel F and its subtarget, then caches how many
// user SGPRs are still free for argument preloading.
170 PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {

171 setInitialFreeUserSGPRsCount();

172 }

173

174

175

176 void setInitialFreeUserSGPRsCount() {

177 GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);

178 NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();

179 }

180

181 bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {

182 return ExplicitArgOffset <= NumFreeUserSGPRs * 4;

183 }

184

185

186 void

187 tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,

188 SmallVectorImpl<Function *> &FunctionsToErase) {

190 F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);

191 if (!ImplicitArgPtr)

192 return;

193

195

197 for (auto *U : ImplicitArgPtr->users()) {

200 continue;

201

202 for (auto *U : CI->users()) {

205 if (!Load) {

207 continue;

208

210 }

211

212 if (!Load || Load->isSimple())

213 continue;

214

215

216 LLVMContext &Ctx = F.getContext();

217 Type *LoadTy = Load->getType();

218 HiddenArg HA = getHiddenArgFromOffset(Offset);

219 if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))

220 continue;

221

222 ImplicitArgLoads.push_back(std::make_pair(Load, Offset));

223 }

224 }

225

226 if (ImplicitArgLoads.empty())

227 return;

228

229

230

231 std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

232

233

234

235

237 ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {

238 unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());

239 unsigned LoadOffset = Load.second;

240 if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +

241 ImplicitArgsBaseOffset))

242 return true;

243

244 return false;

245 });

246

247 if (PreloadEnd == ImplicitArgLoads.begin())

248 return;

249

250 unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);

251 Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);

254 for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {

255 LoadInst *LoadInst = I->first;

256 unsigned LoadOffset = I->second;

257 unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);

258 unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;

261 }

262 }

263};

264

265}

266

// Pass identification: the address of ID is the unique legacy-PM pass ID.
267char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

268

270 "AMDGPU Preload Kernel Arguments", false, false)

271

274 return new AMDGPUPreloadKernelArgumentsLegacy(

276}

277

278AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(

281

284 return false;

285

288 for (auto &F : M) {

290 if (!ST.hasKernargPreload() ||

292 continue;

293

294 PreloadKernelArgInfo PreloadInfo(F, ST);

295 uint64_t ExplicitArgOffset = 0;

297 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

299 unsigned NumPreloadedExplicitArgs = 0;

301

302

303

304

305 if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||

306 Arg.hasAttribute("amdgpu-hidden-argument"))

307 break;

308

309

310 if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())

311 break;

312

313

314 if (Arg.getType()->isAggregateType())

315 break;

316

317 Type *ArgTy = Arg.getType();

318 Align ABITypeAlign = DL.getABITypeAlign(ArgTy);

319 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

320 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

321

322 if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))

323 break;

324

325 Arg.addAttr(Attribute::InReg);

326 NumPreloadedExplicitArgs++;

327 if (NumPreloadsRequested > 0)

328 NumPreloadsRequested--;

329 }

330

331

332

333 if (NumPreloadedExplicitArgs == F.arg_size()) {

334 uint64_t ImplicitArgsBaseOffset =

335 alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +

336 BaseOffset;

337 PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,

338 FunctionsToErase);

339 }

340

341 Changed |= NumPreloadedExplicitArgs > 0;

342 }

343

345

346

347 for (auto *F : FunctionsToErase)

348 F->eraseFromParent();

349

351}

352

353bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {

354 if (skipModule(M) || !TM)

355 return false;

356

358}

359

360PreservedAnalyses

assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")

static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM)

Definition AMDGPUPreloadKernelArguments.cpp:282

static cl::opt< unsigned > KernargPreloadCount("amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0))

static cl::opt< bool > EnableKernargPreload("amdgpu-kernarg-preload", cl::desc("Enable preload kernel arguments to SGPRs"), cl::init(true))

The AMDGPU TargetMachine interface definition for hw codegen targets.

MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL

Module.h This file contains the declarations for the Module class.

This header defines various interfaces for pass management in LLVM.

Machine Check Debug Module

#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)

PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)

Definition AMDGPUPreloadKernelArguments.cpp:361

This class represents an incoming formal argument to a Function.

unsigned getArgNo() const

Return the index of this formal argument in its containing function.

A parsed version of the target data layout string in and methods for querying it.

static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)

void splice(Function::iterator ToIt, Function *FromF)

Transfer all blocks from FromF to this function at ToIt.

AttributeList getAttributes() const

Return the attribute list for this Function.

void setAttributes(AttributeList Attrs)

Set the attribute list for this Function.

Argument * getArg(unsigned i) const

void copyAttributesFrom(const Function *Src)

copyAttributesFrom - copy all additional attributes (those not needed to create a Function) from the ...

LLVM_ABI void copyMetadata(const GlobalObject *Src, unsigned Offset)

Copy metadata from Src, adjusting offsets by Offset.

LLVM_ABI const Function * getFunction() const

Return the function this instruction belongs to.

const MachineFunction * getParent() const

Return the MachineFunction containing this basic block.

const DataLayout & getDataLayout() const

Return the DataLayout attached to the Module associated to this MF.

ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...

A Module instance is used to store all the information related to an LLVM module.

static PreservedAnalyses none()

Convenience factory function for the empty preserved set.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

void push_back(const T &Elt)

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.

Primary interface to the complete machine description for the target machine.

const STC & getSubtarget(const Function &F) const

This method returns a pointer to the specified type of TargetSubtargetInfo.

The instances of the Type class are immutable: once they are created, they are never changed.

LLVM_ABI void setName(const Twine &Name)

Change the name of the value.

LLVM_ABI void replaceAllUsesWith(Value *V)

Change all uses of this to point to a new Value.

iterator_range< user_iterator > users()

LLVM_ABI void takeName(Value *V)

Transfer the name from V to this value.

#define llvm_unreachable(msg)

Marks that the current location is not supposed to be reachable.

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

@ AMDGPU_KERNEL

Used for AMDGPU code object kernels.

LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)

Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.

initializer< Ty > init(const Ty &Val)

friend class Instruction

Iterator for Instructions in a `BasicBlock.

This is an optimization pass for GlobalISel generic memory operations.

decltype(auto) dyn_cast(const From &Val)

dyn_cast - Return the argument parameter cast to the specified type.

Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)

Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.

class LLVM_GSL_OWNER SmallVector

Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...

ModulePass * createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *)

uint64_t alignTo(uint64_t Size, Align A)

Returns a multiple of A needed to store Size bytes.

auto find_if(R &&Range, UnaryPredicate P)

Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.

AnalysisManager< Module > ModuleAnalysisManager

Convenience typedef for the Module analysis manager.

This struct is a compact representation of a valid (non-zero power of two) alignment.