LLVM: lib/Transforms/Scalar/LoopDataPrefetch.cpp Source File (original) (raw)

1

2

3

4

5

6

7

8

9

10

11

12

15

32

33#define DEBUG_TYPE "loop-data-prefetch"

34

35using namespace llvm;

36

37

38

41 cl::desc("Prefetch write addresses"));

42

45 cl::desc("Number of instructions to prefetch ahead"),

47

51

53 "max-prefetch-iters-ahead",

55

56STATISTIC(NumPrefetches, "Number of prefetches inserted");

57

58namespace {

59

60

61class LoopDataPrefetch {

62public:

66 : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}

67

68 bool run();

69

70private:

71 bool runOnLoop(Loop *L);

72

73

74

75 bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);

76

77 unsigned getMinPrefetchStride(unsigned NumMemAccesses,

78 unsigned NumStridedMemAccesses,

79 unsigned NumPrefetches,

80 bool HasCall) {

84 NumPrefetches, HasCall);

85 }

86

87 unsigned getPrefetchDistance() {

91 }

92

93 unsigned getMaxPrefetchIterationsAhead() {

97 }

98

99 bool doPrefetchWrites() {

103 }

104

111};

112

113

114class LoopDataPrefetchLegacyPass : public FunctionPass {

115public:

116 static char ID;

119 }

120

133 }

134

136 };

137}

138

139char LoopDataPrefetchLegacyPass::ID = 0;

141 "Loop Data Prefetch", false, false)

150

152 return new LoopDataPrefetchLegacyPass();

153}

154

155bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,

156 unsigned TargetMinStride) {

157

158 if (TargetMinStride <= 1)

159 return true;

160

161 const auto *ConstStride = dyn_cast(AR->getStepRecurrence(*SE));

162

163

164 if (!ConstStride)

165 return false;

166

167 unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());

168 return TargetMinStride <= AbsStride;

169}

170

180

181 LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);

182 bool Changed = LDP.run();

183

184 if (Changed) {

188 return PA;

189 }

190

192}

193

194bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {

195 if (skipFunction(F))

196 return false;

197

198 DominatorTree *DT = &getAnalysis().getDomTree();

199 LoopInfo *LI = &getAnalysis().getLoopInfo();

200 ScalarEvolution *SE = &getAnalysis().getSE();

202 &getAnalysis().getAssumptionCache(F);

204 &getAnalysis().getORE();

206 &getAnalysis().getTTI(F);

207

208 LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);

209 return LDP.run();

210}

211

212bool LoopDataPrefetch::run() {

213

214

215

217 LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize "

218 "for loop data prefetch.\n");

219 return false;

220 }

221

222 bool MadeChange = false;

223

224 for (Loop *I : *LI)

226 MadeChange |= runOnLoop(L);

227

228 return MadeChange;

229}

230

231

232

234

236

238

240

242

243

245 addInstruction(I);

246 };

247

248

249

250

251

253 int64_t PtrDiff = 0) {

254 if (!InsertPt) {

255 MemI = I;

256 InsertPt = I;

257 Writes = isa(I);

258 } else {

261 if (PrefBB != InsBB) {

263 if (DomBB != PrefBB)

265 }

266

267 if (isa(I) && PtrDiff == 0)

269 }

270 }

271};

272

273bool LoopDataPrefetch::runOnLoop(Loop *L) {

274 bool MadeChange = false;

275

276

277 if (L->isInnermost())

278 return MadeChange;

279

282

283

285 bool HasCall = false;

286 for (const auto BB : L->blocks()) {

287

288

289 for (auto &I : *BB) {

290 if (isa(&I) || isa(&I)) {

292 if (F->getIntrinsicID() == Intrinsic::prefetch)

293 return MadeChange;

295 HasCall = true;

296 } else {

297 HasCall = true;

298 }

299 }

300 }

301 Metrics.analyzeBasicBlock(BB, *TTI, EphValues);

302 }

303

304 if (Metrics.NumInsts.isValid())

305 return MadeChange;

306

307 unsigned LoopSize = *Metrics.NumInsts.getValue();

308 if (!LoopSize)

309 LoopSize = 1;

310

311 unsigned ItersAhead = getPrefetchDistance() / LoopSize;

312 if (!ItersAhead)

313 ItersAhead = 1;

314

315 if (ItersAhead > getMaxPrefetchIterationsAhead())

316 return MadeChange;

317

319 if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)

320 return MadeChange;

321

322 unsigned NumMemAccesses = 0;

323 unsigned NumStridedMemAccesses = 0;

325 for (const auto BB : L->blocks())

326 for (auto &I : *BB) {

329

330 if (LoadInst *LMemI = dyn_cast(&I)) {

331 MemI = LMemI;

332 PtrValue = LMemI->getPointerOperand();

333 } else if (StoreInst *SMemI = dyn_cast(&I)) {

334 if (!doPrefetchWrites()) continue;

335 MemI = SMemI;

336 PtrValue = SMemI->getPointerOperand();

337 } else continue;

338

341 continue;

342 NumMemAccesses++;

343 if (L->isLoopInvariant(PtrValue))

344 continue;

345

346 const SCEV *LSCEV = SE->getSCEV(PtrValue);

347 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV);

348 if (!LSCEVAddRec)

349 continue;

350 NumStridedMemAccesses++;

351

352

353

354

355 bool DupPref = false;

356 for (auto &Pref : Prefetches) {

357 const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);

359 dyn_cast(PtrDiff)) {

360 int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());

362 Pref.addInstruction(MemI, DT, PD);

363 DupPref = true;

364 break;

365 }

366 }

367 }

368 if (!DupPref)

369 Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));

370 }

371

372 unsigned TargetMinStride =

373 getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,

374 Prefetches.size(), HasCall);

375

377 << " iterations ahead (loop size: " << LoopSize << ") in "

378 << L->getHeader()->getParent()->getName() << ": " << *L);

380 << NumMemAccesses << " memory accesses, "

381 << NumStridedMemAccesses << " strided memory accesses, "

382 << Prefetches.size() << " potential prefetch(es), "

383 << "a minimum stride of " << TargetMinStride << ", "

384 << (HasCall ? "calls" : "no calls") << ".\n");

385

386 for (auto &P : Prefetches) {

387

388

389 if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))

390 continue;

391

392 BasicBlock *BB = P.InsertPt->getParent();

395 SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),

396 P.LSCEVAddRec->getStepRecurrence(*SE)));

397 if (!SCEVE.isSafeToExpand(NextLSCEV))

398 continue;

399

402 Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);

403

406 Builder.CreateIntrinsic(Intrinsic::prefetch, PrefPtrValue->getType(),

407 {PrefPtrValue, ConstantInt::get(I32, P.Writes),

408 ConstantInt::get(I32, 3),

409 ConstantInt::get(I32, 1)});

410 ++NumPrefetches;

412 << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1)

413 << ", SCEV: " << *P.LSCEVAddRec << "\n");

414 ORE->emit([&]() {

416 << "prefetched memory access";

417 });

418

419 MadeChange = true;

420 }

421

422 return MadeChange;

423}

This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.

SmallVector< uint32_t, 0 > Writes

static cl::opt< bool > PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), cl::desc("Prefetch write addresses"))

static cl::opt< unsigned > MinPrefetchStride("min-prefetch-stride", cl::desc("Min stride to add prefetches"), cl::Hidden)

static cl::opt< unsigned > PrefetchDistance("prefetch-distance", cl::desc("Number of instructions to prefetch ahead"), cl::Hidden)

static cl::opt< unsigned > MaxPrefetchIterationsAhead("max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden)

This file provides the interface for LLVM's Loop Data Prefetching Pass.

static const Function * getCalledFunction(const Value *V)

#define INITIALIZE_PASS_DEPENDENCY(depName)

#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)

#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)

This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...

#define STATISTIC(VARNAME, DESC)

This pass exposes codegen information to IR-level passes.

A container for analyses that lazily runs them and caches their results.

PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)

Get the result of an analysis pass for a given IR unit.

Represent the analysis usage information of a pass.

AnalysisUsage & addRequiredID(const void *ID)

AnalysisUsage & addPreservedID(const void *ID)

AnalysisUsage & addRequired()

AnalysisUsage & addPreserved()

Add the specified Pass class to the set of analyses preserved by this pass.

A function analysis which provides an AssumptionCache.

An immutable pass that tracks lazily created AssumptionCache objects.

A cache of @llvm.assume calls within a function.

LLVM Basic Block Representation.

const DataLayout & getDataLayout() const

Get the data layout of the module this basic block belongs to.

LLVMContext & getContext() const

Get the context in which this basic block lives.

const Instruction * getTerminator() const LLVM_READONLY

Returns the terminator instruction if the block is well formed or null if the block is not well forme...

Analysis pass which computes a DominatorTree.

Legacy analysis pass which computes a DominatorTree.

Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.

Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const

Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...

FunctionPass class - This class is used to implement most global optimizations.

virtual bool runOnFunction(Function &F)=0

runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.

This provides a uniform API for creating instructions and inserting them into a basic block: either a...

An instruction for reading from memory.

Analysis pass that exposes the LoopInfo for a function.

PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)

Run the pass over the function.

The legacy pass manager's analysis pass to compute loop information.

Represents a single loop in the control flow graph.

static PassRegistry * getPassRegistry()

getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...

virtual void getAnalysisUsage(AnalysisUsage &) const

getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...

static PointerType * get(Type *ElementType, unsigned AddressSpace)

This constructs a pointer to an object of the specified type in a numbered address space.

A set of analyses that are preserved following a run of a transformation pass.

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

void preserve()

Mark an analysis as preserved.

This node represents a polynomial recurrence on the trip count of the specified loop.

const SCEV * getStepRecurrence(ScalarEvolution &SE) const

Constructs and returns the recurrence indicating how much this expression steps by.

This class represents a constant integer value.

This class uses information about analyze scalars to rewrite expressions in canonical form.

This class represents an analyzed expression in the program.

Type * getType() const

Return the LLVM type of this SCEV expression.

Analysis pass that exposes the ScalarEvolution for a function.

The main scalar evolution driver.

const SCEV * getConstant(ConstantInt *V)

const SCEV * getSCEV(Value *V)

Return a SCEV expression for the full generality of the specified expression.

unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)

Returns the upper bound of the loop trip count as a normal unsigned value.

const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)

Return LHS-RHS.

const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)

Get a canonical multiply expression, or something simpler if possible.

const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)

Get a canonical add expression, or something simpler if possible.

SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

An instruction for storing to memory.

Analysis pass providing the TargetTransformInfo.

Wrapper pass for TargetTransformInfo.

This pass provides access to the codegen interfaces that are needed for IR-level transformations.

unsigned getPrefetchDistance() const

bool enableWritePrefetching() const

unsigned getMaxPrefetchIterationsAhead() const

unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const

Some HW prefetchers can handle accesses up to a certain constant stride.

bool shouldPrefetchAddressSpace(unsigned AS) const

unsigned getCacheLineSize() const

bool isLoweredToCall(const Function *F) const

Test whether calls to a function lower to actual program function calls.

The instances of the Type class are immutable: once they are created, they are never changed.

unsigned getPointerAddressSpace() const

Get the address space of this pointer or pointer vector type.

static IntegerType * getInt32Ty(LLVMContext &C)

LLVM Value Representation.

Type * getType() const

All values are typed, get the type of this value.

int getNumOccurrences() const

const ParentTy * getParent() const

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

@ PD

PD - Prefix code for packed double precision vector floating point operations performed in the SSE re...

initializer< Ty > init(const Ty &Val)

PointerTypeMap run(const Module &M)

Compute the PointerTypeMap for the module M.

This is an optimization pass for GlobalISel generic memory operations.

void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &)

FunctionPass * createLoopDataPrefetchPass()

raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

iterator_range< df_iterator< T > > depth_first(const T &G)

A record for a potential prefetch made during the initial scan of the loop.

void addInstruction(Instruction *I, DominatorTree *DT=nullptr, int64_t PtrDiff=0)

Add the instruction.

const SCEVAddRecExpr * LSCEVAddRec

The address formula for this prefetch as returned by ScalarEvolution.

Prefetch(const SCEVAddRecExpr *L, Instruction *I)

Constructor to create a new Prefetch for I.

Utility to calculate the size and a few similar metrics for a set of basic blocks.

static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)

Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).