MLIR: lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp Source File

//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

// The include list below is a reconstruction of the headers this file clearly
// depends on; the extraction dropped the original list.
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"

#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;


// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
  if (indexBitwidth > intWidth) {
    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
  }
  return value;
}


/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}
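// For instance, an argument of type memref<16x16xf32> with the default
// identity layout is compatible with the bare-pointer convention, while a
// dynamically shaped memref such as memref<?xf32> is not, since its sizes
// would need the full memref descriptor.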


static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                       const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

97 "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"

98 "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"

99 "32-v32:"

100 "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"

101 "64-S32-A5-G1-ni:7:8:9";

102
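// An informal reading of the layout string (the authoritative definition
// lives in the AMDGPU LLVM backend): p1/p4 are 64-bit global pointers, p3
// (LDS) and p5 (private/scratch) are 32-bit, "A5" places allocas in address
// space 5, "G1" makes address space 1 the default for globals, and "ni:7:8:9"
// marks the buffer-descriptor address spaces as non-integral.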

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0) : (i32, i32) -> i32
    // followed by:
    //             %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo) : (i32, i32) -> i32

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});

    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};
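// With an all-ones mask (-1), mbcnt.lo counts the lanes below the current one
// within the low 32 lanes and mbcnt.hi adds those in the high 32, so the pair
// yields the current lane index on both wave32 and wave64 hardware.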


struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          32, isBeforeGfx10 ? 64 : 32,
          op.getUpperBoundAttr().getInt() + 1);
    }
    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
        op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};
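// Wavefronts are always 64 lanes wide before gfx10, while gfx10+ supports
// both wave32 and wave64; that is why the constant-range lower bound above is
// 64 for older chipsets and 32 otherwise.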


struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to see if the src lane is participating.
  /// If not, the dstLane is the lane itself.
  ///
  ///  Shuffle with DS Bpermute:
  ///   let shflMode = [xor, up, down, idx]
  ///   let width = 64, step = [1, 2, 4, 8, 16, ... , width].
  ///   1. curLaneId = using mbcnt.lo + mbcnt.hi
  ///   2. widthOrZeroIfOutside = (curLaneId + width) & -width
  ///   3. dstLane = shflMode(curLaneId, step)
  ///   4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  ///   5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  ///   6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  ///   7. bpermute(dwordAlignedDstLane, shfl_value).
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;

    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
                                                       dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};
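// ds_bpermute addresses lanes by byte offset rather than lane index, hence
// the shift left by 2 (lane * 4) before the swizzle. Values wider than 32
// bits are decomposed into i32 chunks, permuted chunk by chunk, and
// recomposed afterwards.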


/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"


// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }
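  // Note that the constructor arguments only take effect when the
  // corresponding pass option was not already set on the command line:
  // getNumOccurrences() == 0 means the option kept its default value.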


  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }


  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    // Insert the amdgcn data layout if the module does not already carry one.
    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }

    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }


    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }


    /// Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);


    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }
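    // The walk interrupts (and the pass fails) as soon as one gpu.func takes
    // a memref argument that cannot be passed as a single pointer, e.g. a
    // dynamically shaped memref<?xf32>.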


    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

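    // The greedy rewrite above runs to a fixpoint, so gpu.shuffle ops that
    // can be promoted to specialized AMDGPU intrinsics are rewritten before
    // the one-shot dialect conversion below sees them.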

    LLVMTypeConverter converter(ctx, options);
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

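    // These integers are the AMDGPU LLVM address spaces: 1 = global, 3 = LDS
    // (workgroup-shared), 5 = private/scratch, matching the p1/p3/p5 entries
    // in amdgcnDataLayout above.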


    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but does not
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }


    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};
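// Illustrative example of the attribute rewrite above: a kernel annotated
// with rocdl.reqd_work_group_size = [64, 2, 1] gets flatSize = 64 * 2 * 1 =
// 128, so rocdl.flat_work_group_size becomes the string "128,128".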


} // namespace


void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();

  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

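// Marking the LLVM math ops illegal forces them to be rewritten into ROCm
// device-library (OCML) calls during conversion; the f32 exception for
// exp/log reflects that native instructions cover those cases (see the
// "legal for f32 type" comment above).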

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::ThreadIdOp, ROCDL::ThreadIdXOp, ROCDL::ThreadIdYOp,
      ROCDL::ThreadIdZOp>>(converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockDimOp, ROCDL::BlockDimXOp, ROCDL::BlockDimYOp,
      ROCDL::BlockDimZOp>>(converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }

  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}


std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
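// Typical usage from C++ is to nest this pass on gpu.module ops, e.g.:
//   pm.addNestedPass<gpu::GPUModuleOp>(
//       createLowerGpuOpsToROCDLOpsPass(/*chipset=*/"gfx90a"));
// The equivalent mlir-opt invocation goes through the tablegen-registered
// pass; its command-line name and option spellings are defined in Passes.td
// (e.g. --convert-gpu-to-rocdl="chipset=gfx90a index-bitwidth=32").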
