MLIR: lib/Dialect/GPU/TransformOps/Utils.cpp Source File


#include "mlir/Dialect/GPU/TransformOps/Utils.h"

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InterleavedRange.h"

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

/// Return a flattened thread id for the workgroup with given sizes.
template <typename ThreadOrBlockIdOp>
static Value buildLinearId(RewriterBase &rewriter, Location loc,
                           ArrayRef<OpFoldResult> originalBasisOfr) {
  LLVM_DEBUG(DBGS() << "----buildLinearId with originalBasisOfr: "
                    << llvm::interleaved(originalBasisOfr) << "\n");
  assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
  IndexType indexType = rewriter.getIndexType();
  AffineExpr tx, ty, tz, bdx, bdy;
  bindDims(rewriter.getContext(), tx, ty, tz);
  bindSymbols(rewriter.getContext(), bdx, bdy);
  SmallVector<OpFoldResult> vals{
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
          .getResult(),
      rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
          .getResult(),
      originalBasisOfr[0], originalBasisOfr[1]};
  OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
      rewriter, loc, tx + ty * bdx + tz * bdx * bdy, vals);
  return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
}
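
// Worked example (illustrative sizes, not taken from a specific target): with
// originalBasisOfr = {8, 4, 2}, the thread at (tx, ty, tz) = (3, 2, 1) gets
// the flattened id 3 + 2 * 8 + 1 * 8 * 4 = 51.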

/// Create a linear id builder that takes the `originalBasisOfr` and decomposes
/// it in the basis of `forallMappingSizes`. The linear id builder returns an
/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    SmallVector<OpFoldResult> originalBasisOfr =
        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
    Value linearId =
        buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
    // "row-major" order.
    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    OpFoldResult scaledLinearId = affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {linearId});
    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
    SmallVector<Value> ids;
    // Reverse back to be in [0 .. n] order.
    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
      ids.push_back(
          affine::makeComposedAffineApply(rewriter, loc, e, {scaledLinearId}));
    }

    LLVM_DEBUG(DBGS() << "--delinearization basis: "
                      << llvm::interleaved(reverseBasisSizes) << "\n";
               DBGS() << "--delinearization strides: "
                      << llvm::interleaved(strides) << "\n";
               DBGS() << "--delinearization exprs: "
                      << llvm::interleaved(delinearizingExprs) << "\n";
               DBGS() << "--ids: " << llvm::interleaved(ids) << "\n");

    // Return n-D ids for indexing and 1-D size + id for predicate generation.
    return IdBuilderResult{
        /*mappingIdOps=*/ids,
        /*availableMappingSizes=*/
        SmallVector<int64_t>{computeProduct(originalBasis)},
        // `forallMappingSizes` iterate in the scaled basis; scale them back
        // into the original basis to get tight activeMappingSizes for
        // predication.
        /*activeMappingSizes=*/
        SmallVector<int64_t>{computeProduct(forallMappingSizes) * multiplicity},
        /*activeIdOps=*/SmallVector<Value>{linearId}};
  };

  return res;
}
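
// Worked example (illustrative values): for forallMappingSizes = {2, 3, 4} and
// multiplicity = 1, reverseBasisSizes = {4, 3, 2} and strides = {6, 2, 1}. A
// linear id of 17 delinearizes to {2, 2, 1} in the reversed basis, i.e.
// ids = {1, 2, 2} for forall dimensions {0, 1, 2}; indeed 1 + 2 * 2 + 2 * 6 == 17.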

/// Create a simple 3-D id builder that takes the `originalBasisOfr`. The 3-D
/// id builder returns a 3-D vector of ids for indexing and 3-D sizes + ids for
/// predicate generation.
template <typename ThreadOrBlockIdOp>
static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
  auto res = [multiplicity](RewriterBase &rewriter, Location loc,
                            ArrayRef<int64_t> forallMappingSizes,
                            ArrayRef<int64_t> originalBasis) {
    IndexType indexType = rewriter.getIndexType();
    SmallVector<Value> ids{
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
        rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
    // In the 3-D mapping case, scale the first dimension by the multiplicity.
    SmallVector<Value> scaledIds = ids;
    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
    scaledIds[0] = cast<Value>(affine::makeComposedFoldedAffineApply(
        rewriter, loc, d0.floorDiv(multiplicity), {scaledIds[0]}));
    // In the 3-D mapping case, unscale the first dimension by the
    // multiplicity.
    SmallVector<int64_t> forallMappingSizeInOriginalBasis =
        llvm::to_vector(forallMappingSizes);
    forallMappingSizeInOriginalBasis[0] *= multiplicity;
    return IdBuilderResult{
        /*mappingIdOps=*/scaledIds,
        /*availableMappingSizes=*/llvm::to_vector(originalBasis),
        // `forallMappingSizes` iterate in the scaled basis; scale them back
        // into the original basis to get tight activeMappingSizes for
        // predication.
        /*activeMappingSizes=*/forallMappingSizeInOriginalBasis,
        /*activeIdOps=*/ids};
  };
  return res;
}
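
// Illustrative note: with multiplicity = 32 (the warp-sized case below), a
// thread with id x = 70 produces scaledIds[0] = 70 floordiv 32 = 2, and a
// forall mapping size of 4 along that dimension is reported as 4 * 32 = 128
// in the original thread basis for predication.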

namespace mlir {
namespace transform {
namespace gpu {

GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
                           const MappingIdBuilderFnType &fn)
    : mappingAttributes(), idBuilder() {
  if (useLinearMapping) {
    for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
                  e = getMaxEnumValForMappingId();
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  } else {
    for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
                  e = static_cast<uint64_t>(MappingId::DimZ);
         d <= e; ++d)
      mappingAttributes.push_back(fn(ctx, symbolizeMappingId(d).value()));
  }
}
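
// Illustrative note: with useLinearMapping the attributes run from
// MappingId::LinearDim0 up to the maximum enum value (e.g. #gpu.thread<linear_dim_0>,
// #gpu.thread<linear_dim_1>, ...), otherwise they are exactly DimX, DimY, DimZ
// (e.g. #gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>); which attribute kind
// is built depends on the callback `fn` supplied by the concrete builders below.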

GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUBlockMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<BlockIdOp>(1)
                  : common3DIdBuilderFn<BlockIdOp>(1);
}

GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                             bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpgroupMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize)
                  : common3DIdBuilderFn<ThreadIdOp>(
                        /*multiplicity=*/kNumWarpsPerGroup * warpSize);
}

GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
                                   bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping,
                   [](MLIRContext *ctx, MappingId id) {
                     return GPUWarpMappingAttr::get(ctx, id);
                   }),
      warpSize(warpSize) {
  idBuilder =
      useLinearMapping
          ? commonLinearIdBuilderFn<ThreadIdOp>(warpSize)
          : common3DIdBuilderFn<ThreadIdOp>(warpSize);
}

GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
    : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
        return GPUThreadMappingAttr::get(ctx, id);
      }) {
  idBuilder = useLinearMapping
                  ? commonLinearIdBuilderFn<ThreadIdOp>(1)
                  : common3DIdBuilderFn<ThreadIdOp>(1);
}
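
// Illustrative use (a sketch; the mapping transform normally selects one of
// the builders above from the scf.forall mapping attribute and invokes its
// idBuilder):
//   GpuThreadIdBuilder builder(ctx, /*useLinearMapping=*/true);
//   IdBuilderResult r = builder.idBuilder(
//       rewriter, loc, /*forallMappingSizes=*/{2, 3, 4},
//       /*originalBasis=*/{128, 1, 1});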

DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                           std::optional<int64_t> gridDimX,
                                           std::optional<int64_t> gridDimY,
                                           std::optional<int64_t> gridDimZ,
                                           std::optional<int64_t> blockDimX,
                                           std::optional<int64_t> blockDimY,
                                           std::optional<int64_t> blockDimZ) {

  // TODO: pass a configuration object to set the limits properly.

  if ((blockDimX.value_or(1) * blockDimY.value_or(1) * blockDimZ.value_or(1)) >
          kMaxTotalBlockdim ||
      (gridDimX.value_or(1) * gridDimY.value_or(1) * gridDimZ.value_or(1)) >
          kMaxTotalGriddim ||
      blockDimX.value_or(1) > kMaxBlockdimx ||
      blockDimY.value_or(1) > kMaxBlockdimy ||
      blockDimZ.value_or(1) > kMaxBlockdimz ||
      gridDimY.value_or(1) > kMaxGriddimy ||
      gridDimZ.value_or(1) > kMaxGriddimz ||
      gridDimX.value_or(1) > kMaxGriddimx) {
    return transformOp.emitSilenceableError()
           << "Trying to launch a GPU kernel with grid_dims = ("
           << gridDimX.value_or(1) << ", " << gridDimY.value_or(1) << ", "
           << gridDimZ.value_or(1) << ") block_dims = ("
           << blockDimX.value_or(1) << ", " << blockDimY.value_or(1) << ", "
           << blockDimZ.value_or(1) << "). It is larger than the limits.";
  }
  return DiagnosedSilenceableFailure::success();
}
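
// Illustrative note: assuming kMaxTotalBlockdim models the usual CUDA-style
// limit of 1024 threads per block (the k* constants are defined outside this
// file), a request such as block_dims = (1024, 2, 1) exceeds the total and is
// reported through the silenceable error above.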

DiagnosedSilenceableFailure createGpuLaunch(
    RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
    LaunchOp &launchOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  auto createConst = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(loc, dim);
  };
  OpBuilder::InsertionGuard guard(rewriter);
  Value one = createConst(1);
  Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
  Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
  Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
  Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
  Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
  Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
  launchOp = rewriter.create<LaunchOp>(loc, gridSizeX, gridSizeY, gridSizeZ,
                                       blkSizeX, blkSizeY, blkSizeZ);
  rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
  rewriter.create<TerminatorOp>(loc);
  return DiagnosedSilenceableFailure::success();
}
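
// Illustrative call (a sketch): create a 16x1x1 grid of 128-thread blocks;
// unspecified dimensions default to constant 1.
//   LaunchOp launchOp;
//   DiagnosedSilenceableFailure status = createGpuLaunch(
//       rewriter, loc, transformOp, launchOp, /*gridDimX=*/16,
//       /*gridDimY=*/std::nullopt, /*gridDimZ=*/std::nullopt,
//       /*blockDimX=*/128);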

DiagnosedSilenceableFailure alterGpuLaunch(
    RewriterBase &rewriter, LaunchOp gpuLaunch,
    TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
    std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
    std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
    std::optional<int64_t> blockDimZ) {
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
                     blockDimY, blockDimZ);
  if (!diag.succeeded())
    return diag;

  KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPointAfterValue(currentBlockdim.x);
  auto createConstValue = [&](int dim) {
    return rewriter.create<arith::ConstantIndexOp>(currentBlockdim.x.getLoc(),
                                                   dim);
  };

  if (gridDimX.has_value())
    gpuLaunch.getGridSizeXMutable().assign(createConstValue(gridDimX.value()));
  if (gridDimY.has_value())
    gpuLaunch.getGridSizeYMutable().assign(createConstValue(gridDimY.value()));
  if (gridDimZ.has_value())
    gpuLaunch.getGridSizeZMutable().assign(createConstValue(gridDimZ.value()));
  if (blockDimX.has_value())
    gpuLaunch.getBlockSizeXMutable().assign(
        createConstValue(blockDimX.value()));
  if (blockDimY.has_value())
    gpuLaunch.getBlockSizeYMutable().assign(
        createConstValue(blockDimY.value()));
  if (blockDimZ.has_value())
    gpuLaunch.getBlockSizeZMutable().assign(
        createConstValue(blockDimZ.value()));
  return DiagnosedSilenceableFailure::success();
}

} // namespace gpu
} // namespace transform
} // namespace mlir

Referenced symbols:

static GpuIdBuilderFnType commonLinearIdBuilderFn(int64_t multiplicity=1)

Create a linear id builder that takes the originalBasisOfr and decomposes it in the basis of forallMappingSizes.

static Value buildLinearId(RewriterBase &rewriter, Location loc, ArrayRef< OpFoldResult > originalBasisOfr)

Return a flattened thread id for the workgroup with given sizes.

static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity=1)

Create a simple 3-D id builder that takes the originalBasisOfr. The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes + ids for predicate generation.


constexpr int kMaxGriddimz

constexpr int kMaxTotalBlockdim

constexpr int kMaxGriddimy

constexpr int kMaxBlockdimx

constexpr int kMaxBlockdimz

constexpr int kMaxGriddimx

constexpr int kMaxBlockdimy

constexpr int kMaxTotalGriddim

Base type for affine expression.

AffineExpr floorDiv(uint64_t v) const

MLIRContext * getContext() const

The result of a transform IR operation application.

static DiagnosedSilenceableFailure success()

Constructs a DiagnosedSilenceableFailure in the success state.

This class defines the main interface for locations in MLIR and acts as a non-nullable wrapper around a LocationAttr.

MLIRContext is the top-level object for a collection of MLIR operations.

RAII guard to reset the insertion point of the builder when destroyed.

void setInsertionPointToEnd(Block *block)

Sets the insertion point to the end of the specified block.

void setInsertionPointAfterValue(Value val)

Sets the insertion point to the node after the specified value.

Operation * create(const OperationState &state)

Creates an operation given the fields represented as an OperationState.

This class represents a single result from folding an operation.

This class coordinates the application of a rewrite on a set of IR, providing a way for clients to track mutations and create new operations.

This class represents an instance of an SSA value in the MLIR system, representing a computable value that has a type and a set of users.

Location getLoc() const

Return the location of this value.

Specialization of arith.constant op that returns an integer of index type.

AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)

Returns a composed AffineApplyOp by composing map and operands with other AffineApplyOps supplying those operands.

OpFoldResult makeComposedFoldedAffineApply(OpBuilder &b, Location loc, AffineMap map, ArrayRef< OpFoldResult > operands)

Constructs an AffineApplyOp that applies map to operands after composing the map with the maps of any other AffineApplyOp supplying the operands, then immediately attempts to fold it.

DiagnosedSilenceableFailure alterGpuLaunch(RewriterBase &rewriter, mlir::gpu::LaunchOp gpuLaunch, TransformOpInterface transformOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)

Alter kernel configuration of the given kernel.

DiagnosedSilenceableFailure createGpuLaunch(RewriterBase &rewriter, Location loc, TransformOpInterface transformOp, mlir::gpu::LaunchOp &launchOp, std::optional< int64_t > gridDimX=std::nullopt, std::optional< int64_t > gridDimY=std::nullopt, std::optional< int64_t > gridDimZ=std::nullopt, std::optional< int64_t > blockDimX=std::nullopt, std::optional< int64_t > blockDimY=std::nullopt, std::optional< int64_t > blockDimZ=std::nullopt)

Create an empty-body gpu::LaunchOp using the provided kernel settings and put a terminator within.

DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp, std::optional< int64_t > gridDimX, std::optional< int64_t > gridDimY, std::optional< int64_t > gridDimZ, std::optional< int64_t > blockDimX, std::optional< int64_t > blockDimY, std::optional< int64_t > blockDimZ)

Determine if the size of the kernel configuration is supported by the GPU architecture being used.

std::function< IdBuilderResult(RewriterBase &, Location, ArrayRef< int64_t >, ArrayRef< int64_t >)> GpuIdBuilderFnType

Common gpu id builder type, allows the configuration of lowering for various mapping schemes.

Include the generated interface declarations.

OpFoldResult getAsIndexOpFoldResult(MLIRContext *ctx, int64_t val)

Convert int64_t to integer attributes of index type and return them as OpFoldResult.

void bindDims(MLIRContext *ctx, AffineExprTy &...exprs)

Bind a list of AffineExpr references to DimExpr at positions: [0 .. sizeof...(exprs)].

SmallVector< int64_t > computeStrides(ArrayRef< int64_t > sizes)

SmallVector< int64_t > delinearize(int64_t linearIndex, ArrayRef< int64_t > strides)

Given the strides together with a linear index in the dimension space, return the vector-space offsets in each dimension.
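
A minimal sketch of how computeStrides and delinearize compose (illustrative values; uses the overloads declared in mlir/Dialect/Utils/IndexingUtils.h):

  #include "mlir/Dialect/Utils/IndexingUtils.h"
  #include "llvm/ADT/SmallVector.h"

  llvm::SmallVector<int64_t> delinearizeExample() {
    // Row-major strides for sizes {4, 3, 2} are {6, 2, 1}.
    llvm::SmallVector<int64_t> strides = mlir::computeStrides({4, 3, 2});
    // Decomposing the linear index 17 against those strides gives {2, 2, 1},
    // since 2 * 6 + 2 * 2 + 1 * 1 == 17.
    return mlir::delinearize(17, strides);
  }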

int64_t computeProduct(ArrayRef< int64_t > basis)

Self-explicit.

void bindSymbols(MLIRContext *ctx, AffineExprTy &...exprs)

Bind a list of AffineExpr references to SymbolExpr at positions: [0 .. sizeof...(exprs)].
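
A small sketch of how bindDims and bindSymbols build the linearization expression used by buildLinearId above (standalone, with an assumed MLIRContext pointer):

  #include "mlir/IR/AffineExpr.h"
  #include "mlir/IR/MLIRContext.h"

  mlir::AffineExpr buildLinearExpr(mlir::MLIRContext *ctx) {
    mlir::AffineExpr tx, ty, tz, bdx, bdy;
    mlir::bindDims(ctx, tx, ty, tz);   // binds d0, d1, d2
    mlir::bindSymbols(ctx, bdx, bdy);  // binds s0, s1
    // Same linearization as in buildLinearId: tx + ty * bdx + tz * bdx * bdy.
    return tx + ty * bdx + tz * bdx * bdy;
  }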

Value getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc, OpFoldResult ofr)

Converts an OpFoldResult to a Value.

auto get(MLIRContext *context, Ts &&...params)

Helper method that injects context only if needed; this helps unify some of the attribute construction methods.

AffineExpr getAffineDimExpr(unsigned position, MLIRContext *context)

These free functions allow clients of the API to not use classes in detail.

Utility class for the GPU dialect to represent triples of Values accessible through .x, .y, and .z.

GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping=false)

Helper struct for configuring the rewrite of mapped scf.forall ops to various gpu id configurations.

SmallVector< DeviceMappingAttrInterface > mappingAttributes

The mapping attributes targeted by this generator.

GpuIdBuilderFnType idBuilder

The constructor that builds the concrete IR for mapping ids.

std::function< DeviceMappingAttrInterface(MLIRContext *, mlir::gpu::MappingId)> MappingIdBuilderFnType

GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping=false)

GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize, bool useLinearMapping=false)

static constexpr int64_t kNumWarpsPerGroup

In the future this may be configured by the transformation.
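
For example, if kNumWarpsPerGroup were 4 and warpSize were 32, the warpgroup builders above would use a multiplicity of 4 * 32 = 128 threads per warpgroup id; the concrete constant is defined in the corresponding header, not in this listing.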

GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize, bool useLinearMapping=false)

Helper type for functions that generate ids for the mapping of a scf.forall.