MLIR: lib/Dialect/GPU/Transforms/KernelOutlining.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
14
30 #include
31
32 namespace mlir {
33 #define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONSPASS
34 #define GEN_PASS_DEF_GPUKERNELOUTLININGPASS
35 #include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
36 }
37
38 using namespace mlir;
39
40 template
43 for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
44 values.push_back(builder.create(loc, builder.getIndexType(), dim));
45 }
46
47
48
49
50
53 bool hasCluster = false) {
55 Block &firstBlock = launchOpBody.front();
58
59 createForAllDimensionsgpu::BlockIdOp(builder, loc, indexOps);
60 createForAllDimensionsgpu::ThreadIdOp(builder, loc, indexOps);
61 createForAllDimensionsgpu::GridDimOp(builder, loc, indexOps);
62 createForAllDimensionsgpu::BlockDimOp(builder, loc, indexOps);
63 if (hasCluster) {
64 createForAllDimensionsgpu::ClusterIdOp(builder, loc, indexOps);
65 createForAllDimensionsgpu::ClusterDimOp(builder, loc, indexOps);
66 }
67
68
69 for (const auto &indexOp : enumerate(indexOps))
70 map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
71 }
72
73
74
75
78 isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
79 }
80
81
82
83
84
85
86
87
88
89
90
91
97 if (beneficiaryOps.count(op))
98 return true;
99
100 if (!isSinkingBeneficiary(op))
101 return false;
102
104
105 if (availableValues.count(operand))
106 continue;
107
108
109 Operation *definingOp = operand.getDefiningOp();
111 beneficiaryOps, availableValues,
112 isSinkingBeneficiary)) &&
113 !existingDependencies.count(operand))
114 return false;
115 }
116
117 beneficiaryOps.insert(op);
119 availableValues.insert(result);
120 return true;
121 }
122
124 gpu::LaunchOp launchOp,
126 assert(isSinkingBeneficiary);
127 Region &launchOpBody = launchOp.getBody();
128
129
130
133
136 for (Value operand : sinkCandidates) {
137 Operation *operandOp = operand.getDefiningOp();
138 if (!operandOp)
139 continue;
141 isSinkingBeneficiary);
142 }
143
144
149
152 launchOp.getBody());
153 }
154 return success();
155 }
156
157
161 for (Value v : {dims.x, dims.y, dims.z}) {
162 APInt constValue;
164 return nullptr;
165
166
168 return nullptr;
169 constants.push_back(
171 }
173 }
174
175
176
177
179 StringRef kernelFnName,
181 Location loc = launchOp.getLoc();
182
183
184 OpBuilder builder(launchOp.getContext());
185 Region &launchOpBody = launchOp.getBody();
186
187
188
190
191
193 kernelOperandTypes.reserve(operands.size());
194 for (Value operand : operands) {
195 kernelOperandTypes.push_back(operand.getType());
196 }
197 FunctionType type =
198 FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
199 auto outlinedFunc = builder.creategpu::GPUFuncOp(
200 loc, kernelFnName, type,
203 outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
205
206
207
208
209 if (auto blockBounds =
211 outlinedFunc.setKnownBlockSizeAttr(blockBounds);
212 if (auto gridBounds =
214 outlinedFunc.setKnownGridSizeAttr(gridBounds);
215
217
218
219
220
221 Region &outlinedFuncBody = outlinedFunc.getBody();
223 launchOp.hasClusterSize());
224
225
226 for (const auto &[launchArg, funcArg] :
227 llvm::zip(launchOp.getWorkgroupAttributions(),
228 outlinedFunc.getWorkgroupAttributions()))
229 map.map(launchArg, funcArg);
230 for (const auto &[launchArg, funcArg] :
231 llvm::zip(launchOp.getPrivateAttributions(),
232 outlinedFunc.getPrivateAttributions()))
233 map.map(launchArg, funcArg);
234
235
236
237 Block &entryBlock = outlinedFuncBody.front();
238 for (const auto &operand : enumerate(operands))
239 map.map(operand.value(), entryBlock.getArgument(operand.index()));
240
241
242 launchOpBody.cloneInto(&outlinedFuncBody, map);
243
244
245 for (Block &block : launchOpBody) {
247 auto terminator = dyn_castgpu::TerminatorOp(clonedBlock->getTerminator());
248 if (!terminator)
249 continue;
251 replacer.creategpu::ReturnOp(terminator->getLoc());
252 terminator->erase();
253 }
254
255
256
257 Block *clonedLaunchOpEntry = map.lookup(&launchOpBody.front());
260 clonedLaunchOpEntry->erase();
261
262 return outlinedFunc;
263 }
264
266 StringRef kernelFnName,
269 inputOperandSet.insert_range(operands);
272 for (auto operand : operandSet) {
273 if (!inputOperandSet.count(operand))
274 operands.push_back(operand);
275 }
276 return funcOp;
277 }
278
279
280
281
283 gpu::GPUFuncOp kernelFunc,
286
287
288 Value asyncToken = launchOp.getAsyncToken();
289 std::optionalgpu::KernelDim3 clusterSize =
290 launchOp.getClusterSizeOperandValues();
291 auto launchFunc = builder.creategpu::LaunchFuncOp(
292 launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
293 launchOp.getBlockSizeOperandValues(),
294 launchOp.getDynamicSharedMemorySize(), operands,
295 asyncToken ? asyncToken.getType() : nullptr,
296 launchOp.getAsyncDependencies(), clusterSize);
298 launchOp.erase();
299 }
300
301 namespace {
302
303
304 class GpuLaunchSinkIndexComputationsPass
305 : public impl::GpuLaunchSinkIndexComputationsPassBase<
306 GpuLaunchSinkIndexComputationsPass> {
307 public:
308 void runOnOperation() override {
310 if (op->walk([](gpu::LaunchOp launch) {
311
312 if (failed(sinkOperationsIntoLaunchOp(launch,
313 isLikelyAnIndexComputation)))
314 return WalkResult::interrupt();
315
316 return WalkResult::advance();
317 }).wasInterrupted())
318 signalPassFailure();
319 }
320 };
321
322
323
324
325
326
327
328
329
330
331 class GpuKernelOutliningPass
332 : public impl::GpuKernelOutliningPassBase {
333 public:
334 using Base::Base;
335
336 LogicalResult initialize(MLIRContext *context) override {
337
338 if (!dataLayoutStr.empty()) {
340 if (!resultAttr)
341 return failure();
342
343 dataLayoutSpec = dyn_cast(resultAttr);
344 if (!dataLayoutSpec)
345 return failure();
346 }
347
348 return success();
349 }
350
351 void runOnOperation() override {
353 bool modified = false;
354 for (auto func : getOperation().getOps()) {
355
357 auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
359 std::string kernelFnName;
360 if (op.getKernelFunc()) {
361 kernelFnName = op.getKernelFunc()->getRootReference().str();
362 } else {
363 kernelFnName =
364 Twine(op->getParentOfType().getName(),
365 "_kernel")
366 .str();
367 }
368
369 gpu::GPUFuncOp outlinedFunc =
371
372
373
374
375 auto kernelModule = createKernelModule(op, outlinedFunc, symbolTable);
376 symbolTable.insert(kernelModule, insertPt);
377
378
380 modified = true;
382 });
383 if (funcWalkResult.wasInterrupted())
384 return signalPassFailure();
385 }
386
387
388
389 if (modified)
390 getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
392 }
393
394 private:
395
396 gpu::GPUModuleOp createKernelModule(gpu::LaunchOp gpuLaunchOp,
397 gpu::GPUFuncOp kernelFunc,
399
400
401
402
403 auto *context = getOperation().getContext();
405 std::string kernelModuleName;
406 gpu::GPUModuleOp kernelModule;
407 if (gpuLaunchOp.getKernelModule()) {
408 kernelModuleName =
409 gpuLaunchOp.getKernelModule()->getRootReference().str();
410 kernelModule =
411 parentSymbolTable.lookupgpu::GPUModuleOp(kernelModuleName);
412 } else {
413 kernelModuleName = kernelFunc.getName();
414 }
415
416
417 if (!kernelModule) {
418
419 kernelModule = builder.creategpu::GPUModuleOp(kernelFunc.getLoc(),
420 kernelModuleName);
421 }
422
423
424
425 if (dataLayoutSpec)
426 kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);
427
429 symbolTable.insert(kernelFunc);
430
432 while (!symbolDefWorklist.empty()) {
433 if (std::optionalSymbolTable::UseRange symbolUses =
436 StringRef symbolName =
437 cast(symbolUse.getSymbolRef()).getValue();
438 if (symbolTable.lookup(symbolName))
439 continue;
440
442 parentSymbolTable.lookup(symbolName)->clone();
443 symbolDefWorklist.push_back(symbolDefClone);
444 symbolTable.insert(symbolDefClone);
445 }
446 }
447 }
448
449 return kernelModule;
450 }
451
452 DataLayoutSpecInterface dataLayoutSpec;
453 };
454
455 }
static MLIRContext * getContext(OpFoldResult val)
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims)
Return the provided KernelDim3 as an array of i32 constants if possible.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp, StringRef kernelFnName, SetVector< Value > &operands)
Outline the gpu.launch operation body into a kernel function.
static bool isLikelyAnIndexComputation(Operation *op)
Identifies operations that are beneficial to sink into kernels.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp, gpu::GPUFuncOp kernelFunc, ValueRange operands)
Replace gpu.launch operations with an gpu.launch_func operation launching kernelFunc.
static void createForAllDimensions(OpBuilder &builder, Location loc, SmallVectorImpl< Value > &values)
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody, Region &launchOpBody, IRMapping &map, bool hasCluster=false)
Adds operations generating block/thread ids and grid/block dimensions at the beginning of the launchFuncOpBody region.
static bool extractBeneficiaryOps(Operation *op, const SetVector< Value > &existingDependencies, SetVector< Operation * > &beneficiaryOps, llvm::SmallPtrSetImpl< Value > &availableValues, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
For a given operation op, computes whether it is beneficial to sink the operation into the kernel.
static Value max(ImplicitLocOpBuilder &builder, Value value, Value bound)
Attributes are known-constant values of operations.
MLIRContext * getContext() const
Return the context this attribute belongs to.
Block represents an ordered list of Operations.
OpListType::iterator iterator
BlockArgument getArgument(unsigned i)
void erase()
Unlink this Block from its parent region and delete it.
Operation * getTerminator()
Get the terminator operation of this block.
OpListType & getOperations()
This is a utility class for mapping one set of IR entities to another.
auto lookup(T from) const
Lookup a mapped value within the map.
void map(Value from, Value to)
Inserts a new mapping for 'from' to 'to'.
This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around...
MLIRContext is the top-level object for a collection of MLIR operations.
This class helps build Operations.
Operation * clone(Operation &op, IRMapping &mapper)
Creates a deep copy of the specified operation, remapping any operands that use values outside of the...
void setInsertionPointToStart(Block *block)
Sets the insertion point to the start of the specified block.
Operation * create(const OperationState &state)
Creates an operation given the fields represented as an OperationState.
Operation is the basic unit of execution within MLIR.
Operation * clone(IRMapping &mapper, CloneOptions options=CloneOptions::all())
Create a deep copy of this operation, remapping any operands that use values outside of the operation...
std::enable_if_t< llvm::function_traits< std::decay_t< FnT > >::num_args==1, RetT > walk(FnT &&callback)
Walk the operation by calling the callback for each nested operation (including this one),...
void setAttr(StringAttr name, Attribute value)
If the an attribute exists with the specified name, change it to the new value.
OperationName getName()
The name of an operation is the key identifier for it.
operand_range getOperands()
Returns an iterator on the underlying Value's.
void replaceAllUsesWith(ValuesT &&values)
Replace all uses of results of this operation with the provided 'values'.
result_range getResults()
void erase()
Remove this operation from its parent block and delete it.
This class contains a list of basic blocks and a link to the parent operation it is attached to.
void cloneInto(Region *dest, IRMapping &mapper)
Clone the internal blocks from this region into dest.
This class represents a specific symbol use.
This class allows for representing and managing the symbol table used by operations with the 'SymbolT...
Operation * lookup(StringRef name) const
Look up a symbol with the specified name, returning null if no such name exists.
static std::optional< UseRange > getSymbolUses(Operation *from)
Get an iterator range for all of the uses, for any symbol, that are nested within the given operation...
This class provides an abstraction over the various different ranges of value types.
This class provides an abstraction over the different types of ranges over Values.
This class represents an instance of an SSA value in the MLIR system, representing a computable value...
MLIRContext * getContext() const
Utility to get the associated MLIRContext that this value is defined in.
Type getType() const
Return the type of this value.
static WalkResult advance()
static DenseArrayAttrImpl get(MLIRContext *context, ArrayRef< int32_t > content)
Builder from ArrayRef.
constexpr void enumerate(std::tuple< Tys... > &tuple, CallbackT &&callback)
Include the generated interface declarations.
bool matchPattern(Value value, const Pattern &pattern)
Entry point for matching a pattern over a Value.
detail::constant_int_value_binder m_ConstantInt(IntegerAttr::ValueType *bind_value)
Matches a constant holding a scalar/vector/tensor integer (splat) and writes the integer value to bin...
void replaceAllUsesInRegionWith(Value orig, Value replacement, Region ®ion)
Replace all uses of orig within the given region with replacement.
Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context, Type type={}, size_t *numRead=nullptr, bool isKnownNullTerminated=false)
This parses a single MLIR attribute to an MLIR context if it was valid.
void getUsedValuesDefinedAbove(Region ®ion, Region &limit, SetVector< Value > &values)
Fill values with a list of values defined at the ancestors of the limit region and used within region...
LogicalResult sinkOperationsIntoLaunchOp(gpu::LaunchOp launchOp, llvm::function_ref< bool(Operation *)> isSinkingBeneficiary)
Sink operations into the launchOp to reduce the number of values that are used within the region of t...
auto get(MLIRContext *context, Ts &&...params)
Helper method that injects context only if needed, this helps unify some of the attribute constructio...
detail::constant_op_matcher m_Constant()
Matches a constant foldable operation.
gpu::GPUFuncOp outlineKernelFunc(gpu::LaunchOp launchOp, StringRef kernelFnName, SmallVectorImpl< Value > &operands)
Get a gpu.func created from outlining the region of a gpu.launch op with the given kernelFnName.
Utility class for the GPU dialect to represent triples of Values accessible through ....