LLVM: lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

13

14

31

32using namespace llvm;

33

34#define DEBUG_TYPE "amdgpu-perf-hint"

35

38 cl::desc("Function mem bound threshold in %"));

39

42 cl::desc("Kernel limit wave threshold in %"));

43

46 cl::desc("Indirect access memory instruction weight"));

47

50 cl::desc("Large stride memory access weight"));

51

54 cl::desc("Large stride memory access threshold"));

55

56STATISTIC(NumMemBound, "Number of functions marked as memory bound");

57STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

58

59namespace {

60

61struct AMDGPUPerfHint {

63

64public:

67 : FIM(FIM_), TLI(TLI_) {}

68

70

71private:

// Record describing one memory access: the accessed pointer value plus a
// base pointer and an integer offset. A default-constructed record is
// "empty" (null V, null Base, Offset 0).
72 struct MemAccessInfo {

// Pointer operand of the memory instruction this record was built from.
73 const Value *V = nullptr;

// Base pointer of the access. NOTE(review): the assignment is not visible in
// this listing; presumably Offset is relative to this base -- confirm.
74 const Value *Base = nullptr;

// Signed offset stored alongside Base.
75 int64_t Offset = 0;

76 MemAccessInfo() = default;

// Compares this access against Reference; defined out of line.
77 bool isLargeStride(MemAccessInfo &Reference) const;

// The printer below is compiled only when NDEBUG is not defined or
// LLVM_ENABLE_DUMP is defined.
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

// Returns a Printable that writes V, Base and Offset to a raw_ostream.
// The lambda captures `this`, so this object must outlive any use of the
// returned Printable. It dereferences V and Base, so both must be non-null
// when the Printable is streamed.
79 Printable print() const {

80 return Printable([this](raw_ostream &OS) {

81 OS << "Value: " << *V << '\n'

82 << "Base: " << *Base << " Offset: " << Offset << '\n';

83 });

84 }

85#endif

86 };

87

88 MemAccessInfo makeMemAccessInfo(Instruction *) const;

89

90 MemAccessInfo LastAccess;

91

93

94 const DataLayout *DL = nullptr;

95

96 const SITargetLowering *TLI;

97

98 AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);

99 static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);

100 static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

101

102 bool isIndirectAccess(const Instruction *Inst) const;

103

104

105

106

107

108

109

110

111 bool isLargeStride(const Instruction *Inst);

112

113 bool isGlobalAddr(const Value *V) const;

114 bool isLocalAddr(const Value *V) const;

115 bool isGlobalLoadUsedInBB(const Instruction &) const;

116};

117

118static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(

121 return {LI->getPointerOperand(), LI->getType()};

123 return {SI->getPointerOperand(), SI->getValueOperand()->getType()};

125 return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};

127 return {AI->getPointerOperand(), AI->getValOperand()->getType()};

130

131 return {nullptr, nullptr};

132}

133

134bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {

135 LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');

136 SmallPtrSet<const Value *, 32> WorkSet;

137 SmallPtrSet<const Value *, 32> Visited;

138 if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {

139 if (isGlobalAddr(MO))

141 }

142

143 while (!WorkSet.empty()) {

146 if (!Visited.insert(V).second)

147 continue;

149

151 const auto *M = LD->getPointerOperand();

152 if (isGlobalAddr(M)) {

154 return true;

155 }

156 continue;

157 }

158

160 const auto *P = GEP->getPointerOperand();

162 for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)

164 continue;

165 }

166

168 WorkSet.insert(U->getOperand(0));

169 continue;

170 }

171

173 WorkSet.insert(BO->getOperand(0));

174 WorkSet.insert(BO->getOperand(1));

175 continue;

176 }

177

179 WorkSet.insert(S->getFalseValue());

180 WorkSet.insert(S->getTrueValue());

181 continue;

182 }

183

185 WorkSet.insert(E->getVectorOperand());

186 continue;

187 }

188

190 }

191

193 return false;

194}

195

196

197bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {

199 if (!Ld)

200 return false;

201 if (!isGlobalAddr(Ld->getPointerOperand()))

202 return false;

203

204 for (const User *Usr : Ld->users()) {

206 if (UsrInst->getParent() == I.getParent())

207 return true;

208 }

209 }

210

211 return false;

212}

213

214AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {

215 AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

216

217 LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

218

219 for (auto &B : F) {

220 LastAccess = MemAccessInfo();

221 unsigned UsedGlobalLoadsInBB = 0;

222 for (auto &I : B) {

223 if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {

224 unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);

225

226

227 if (isGlobalLoadUsedInBB(I))

228 UsedGlobalLoadsInBB += Size;

229 if (isIndirectAccess(&I))

231 if (isLargeStride(&I))

235 continue;

236 }

239 if (!Callee || Callee->isDeclaration()) {

241 continue;

242 }

243 if (&F == Callee)

244 continue;

245

246 auto Loc = FIM.find(Callee);

247 if (Loc == FIM.end())

248 continue;

249

250 FI.MemInstCost += Loc->second.MemInstCost;

251 FI.InstCost += Loc->second.InstCost;

252 FI.IAMInstCost += Loc->second.IAMInstCost;

253 FI.LSMInstCost += Loc->second.LSMInstCost;

255 TargetLoweringBase::AddrMode AM;

260 GEP->getPointerAddressSpace()))

261

262 continue;

264 } else {

266 }

267 }

268

270 unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();

271 if (GlobalMemAccPercentage > 50) {

272 LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "

273 << B.getName() << " has " << GlobalMemAccPercentage

274 << "% global memory access\n");

276 }

277 }

278 }

279

280 return &FI;

281}

282

283bool AMDGPUPerfHint::runOnFunction(Function &F) {

284 const Module &M = *F.getParent();

285 DL = &M.getDataLayout();

286

287 if (F.hasFnAttribute("amdgpu-wave-limiter") &&

288 F.hasFnAttribute("amdgpu-memory-bound"))

289 return false;

290

291 const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

292

293 LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost

294 << '\n'

295 << " IAMInst cost: " << Info->IAMInstCost << '\n'

296 << " LSMInst cost: " << Info->LSMInstCost << '\n'

297 << " TotalInst cost: " << Info->InstCost << '\n');

298

300

301 if (isMemBound(*Info)) {

302 LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");

303 NumMemBound++;

304 F.addFnAttr("amdgpu-memory-bound", "true");

306 }

307

309 LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");

310 NumLimitWave++;

311 F.addFnAttr("amdgpu-wave-limiter", "true");

313 }

314

316}

317

318bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {

319

320

322 return true;

323

325}

326

327bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {

330}

331

332bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {

334 unsigned As = PT->getAddressSpace();

335

337 }

338 return false;

339}

340

341bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {

344 return false;

345}

346

// Decides whether the memory access performed by Inst is a large-stride
// access relative to the previously recorded access (LastAccess).
//
// Inst: the memory instruction to examine.
// Returns the result of MemAccessInfo::isLargeStride for the new access
// compared against LastAccess.
// Side effect: LastAccess is replaced by the new access when that access has
// a non-null Base.
347bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {

348 LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

349

// makeMemAccessInfo takes a non-const Instruction *, hence the const_cast.
350 MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));

// Compare first: the check must see the previous access, not the one that
// is about to be stored below.
351 bool IsLargeStride = MAI.isLargeStride(LastAccess);

// Only an access with a known Base becomes the new reference; otherwise the
// earlier LastAccess is kept.
352 if (MAI.Base)

353 LastAccess = std::move(MAI);

354

355 return IsLargeStride;

356}

357

358AMDGPUPerfHint::MemAccessInfo

359AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {

360 MemAccessInfo MAI;

361 const Value *MO = getMemoryInstrPtrAndType(Inst).first;

362

363 LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');

364

365 if (isLocalAddr(MO))

366 return MAI;

367

368 MAI.V = MO;

370 return MAI;

371}

372

373bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(

374 MemAccessInfo &Reference) const {

375

377 return false;

378

383 << print() << "<=>\n"

384 << Reference.print() << "Result:" << Result << '\n');

386}

387

388class AMDGPUPerfHintAnalysisLegacy : public CallGraphSCCPass {

389private:

390

391 AMDGPUPerfHintAnalysis Impl;

392

393public:

394 static char ID;

395

396 AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(ID) {}

397

398 bool runOnSCC(CallGraphSCC &SCC) override;

399

400 void getAnalysisUsage(AnalysisUsage &AU) const override {

402 }

403};

404

405}

406

408 auto FI = FIM.find(F);

409 if (FI == FIM.end())

410 return false;

411

412 return AMDGPUPerfHint::isMemBound(FI->second);

413}

414

416 auto FI = FIM.find(F);

417 if (FI == FIM.end())

418 return false;

419

420 return AMDGPUPerfHint::needLimitWave(FI->second);

421}

422

// Skip call-graph nodes that carry no function (null F) and functions that
// are only declarations. The null test must come first and be negated:
// the previous `F || F->isDeclaration()` skipped every real function and
// dereferenced F precisely when it was null.
428 if (!F || F->isDeclaration())

429 continue;

430

432 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());

433

434 if (Analyzer.runOnFunction(*F))

436 }

437

439}

440

444

446

449 if (SCC.size() != 1)

450 continue;

451 Function &F = SCC.begin()->getFunction();

452

453 if (F.isDeclaration())

454 continue;

455

457 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());

458 if (Analyzer.runOnFunction(F))

460 }

461 }

462

464}

465

466char AMDGPUPerfHintAnalysisLegacy::ID = 0;

468

470 "Analysis if a function is memory bound", true, true)

471

472bool AMDGPUPerfHintAnalysisLegacy::runOnSCC(CallGraphSCC &SCC) {

473 auto *TPC = getAnalysisIfAvailable();

474 if (!TPC)

475 return false;

476

478 return Impl.runOnSCC(TM, SCC);

479}

480

484

488

491 return PA;

492}

static cl::opt< unsigned > LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, cl::desc("Large stride memory access threshold"))

static cl::opt< unsigned > IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, cl::desc("Indirect access memory instruction weight"))

static cl::opt< unsigned > LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, cl::desc("Kernel limit wave threshold in %"))

static cl::opt< unsigned > LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, cl::desc("Large stride memory access weight"))

static cl::opt< unsigned > MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, cl::desc("Function mem bound threshold in %"))

Analyzes if a function is potentially memory bound and if a kernel may benefit from limiting numb...

The AMDGPU TargetMachine interface definition for hw codegen targets.

MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL

static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)

static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Analysis containing CSE Info

This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...

static bool runOnFunction(Function &F, bool PostInlining)

Implements a lazy call graph analysis and related passes for the new pass manager.

Machine Check Debug Module

#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)

void visit(MachineFunction &MF, MachineBasicBlock &Start, std::function< void(MachineBasicBlock *)> op)

This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

#define STATISTIC(VARNAME, DESC)

This file describes how to lower LLVM code to machine code.

Target-Independent Code Generator Pass Configuration Options pass.

bool isMemoryBound(const Function *F) const

Definition AMDGPUPerfHintAnalysis.cpp:407

bool needsWaveLimiter(const Function *F) const

Definition AMDGPUPerfHintAnalysis.cpp:415

bool run(const GCNTargetMachine &TM, LazyCallGraph &CG)

Definition AMDGPUPerfHintAnalysis.cpp:441

bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC)

Definition AMDGPUPerfHintAnalysis.cpp:423

ValueMap< const Function *, FuncInfo > FuncInfoMap

PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)

Get the result of an analysis pass for a given IR unit.

void setPreservesAll()

Set by analyses that do not transform their input at all.

A node in the call graph for a module.

CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.

An analysis pass which computes the call graph for a module.

A RefSCC of the call graph.

An SCC of the call graph.

A lazily constructed view of the call graph of a module.

LLVM_ABI void buildRefSCCs()

iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()

A Module instance is used to store all the information related to an LLVM module.

A set of analyses that are preserved following a run of a transformation pass.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

PreservedAnalyses & preserve()

Mark an analysis as preserved.

bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override

Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...

bool erase(PtrType Ptr)

Remove pointer from the set.

std::pair< iterator, bool > insert(PtrType Ptr)

Inserts Ptr if and only if there is no element in the container equal to Ptr.

const STC & getSubtarget(const Function &F) const

This method returns a pointer to the specified type of TargetSubtargetInfo.

static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)

iterator find(const KeyT &Val)

@ LOCAL_ADDRESS

Address space for local memory.

@ FLAT_ADDRESS

Address space for flat memory.

@ GLOBAL_ADDRESS

Address space for global memory (RAT0, VTX0).

LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)

unsigned ID

LLVM IR allows arbitrary numbers to be used as calling convention identifiers.

initializer< Ty > init(const Ty &Val)

friend class Instruction

Iterator for Instructions in a `BasicBlock`.

This is an optimization pass for GlobalISel generic memory operations.

FunctionAddr VTableAddr Value

Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)

decltype(auto) dyn_cast(const From &Val)

dyn_cast - Return the argument parameter cast to the specified type.

Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)

Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.

auto dyn_cast_or_null(const Y &Val)

LLVM_ABI raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

constexpr T divideCeil(U Numerator, V Denominator)

Returns the integer ceil(Numerator / Denominator).

AnalysisManager< Module > ModuleAnalysisManager

Convenience typedef for the Module analysis manager.

char & AMDGPUPerfHintAnalysisLegacyID

Definition AMDGPUPerfHintAnalysis.cpp:467

std::unique_ptr< AMDGPUPerfHintAnalysis > Impl

const GCNTargetMachine & TM

PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)

Definition AMDGPUPerfHintAnalysis.cpp:481

bool HasDenseGlobalMemAcc