AMDGPUPostLegalizerCombiner.cpp Source File (original) (raw)

28#include "llvm/IR/IntrinsicsAMDGPU.h"

31#define GET_GICOMBINER_DEPS

32#include "AMDGPUGenPreLegalizeGICombiner.inc"

33#undef GET_GICOMBINER_DEPS

35#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

37using namespace llvm;

40namespace {

41#define GET_GICOMBINER_TYPES

42#include "AMDGPUGenPostLegalizeGICombiner.inc"

43#undef GET_GICOMBINER_TYPES

45class AMDGPUPostLegalizerCombinerImpl : public Combiner {

46protected:

47 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;

53public:

54 AMDGPUPostLegalizerCombinerImpl(

57 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,

61 static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

64 bool tryCombineAll(MachineInstr &I) const override;

66 struct FMinFMaxLegacyInfo {

70 };

74 FMinFMaxLegacyInfo &Info) const;

75 void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,

76 const FMinFMaxLegacyInfo &Info) const;

81 bool

90 struct CvtF32UByteMatchInfo {

92 unsigned ShiftOffset;

93 };

96 CvtF32UByteMatchInfo &MatchInfo) const;

98 const CvtF32UByteMatchInfo &MatchInfo) const;

101

102

103

104 bool matchCombineSignExtendInReg(

105 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

106 void applyCombineSignExtendInReg(

107 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;

108

109

110

111

112

113

114 bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;

115

116private:

117#define GET_GICOMBINER_CLASS_MEMBERS

118#define AMDGPUSubtarget GCNSubtarget

119#include "AMDGPUGenPostLegalizeGICombiner.inc"

120#undef GET_GICOMBINER_CLASS_MEMBERS

121#undef AMDGPUSubtarget

122};

123

124#define GET_GICOMBINER_IMPL

125#define AMDGPUSubtarget GCNSubtarget

126#include "AMDGPUGenPostLegalizeGICombiner.inc"

127#undef AMDGPUSubtarget

128#undef GET_GICOMBINER_IMPL

129

130AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(

133 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,

135 : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),

136 TII(*STI.getInstrInfo()),

137 Helper(Observer, B, false, &VT, MDT, LI, STI),

139#include "AMDGPUGenPostLegalizeGICombiner.inc"

141{

142}

143

144bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {

145 if (tryCombineAllImpl(MI))

146 return true;

147

148 switch (MI.getOpcode()) {

149 case TargetOpcode::G_SHL:

150 case TargetOpcode::G_LSHR:

151 case TargetOpcode::G_ASHR:

152

153

154

156 }

157

158 return false;

159}

160

161bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(

162 MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {

164 return false;

165

170 Register True = MI.getOperand(2).getReg();

171 Register False = MI.getOperand(3).getReg();

172

173

174

175 if ((Info.LHS != True || Info.RHS != False) &&

176 (Info.LHS != False || Info.RHS != True))

177 return false;

178

179

180

181

182 if (Info.LHS != True)

184

185

187}

188

189void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(

190 MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {

192 : AMDGPU::G_AMDGPU_FMIN_LEGACY;

196

197

198

200 }

201

202 B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());

203

204 MI.eraseFromParent();

205}

206

207bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(

208 MachineInstr &MI) const {

209 Register DstReg = MI.getOperand(0).getReg();

210

211

212

213

214

215 LLT Ty = MRI.getType(DstReg);

217 Register SrcReg = MI.getOperand(1).getReg();

218 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();

219 assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);

222 }

223

224 return false;

225}

226

227void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(

228 MachineInstr &MI) const {

230

231 Register DstReg = MI.getOperand(0).getReg();

232 Register SrcReg = MI.getOperand(1).getReg();

233 LLT Ty = MRI.getType(DstReg);

234 LLT SrcTy = MRI.getType(SrcReg);

235 if (SrcTy != S32)

236 SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

237

238 if (Ty == S32) {

239 B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},

240 MI.getFlags());

241 } else {

242 auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},

243 MI.getFlags());

244 B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());

245 }

246

247 MI.eraseFromParent();

248}

249

250bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(

251 MachineInstr &MI,

252 std::function<void(MachineIRBuilder &)> &MatchInfo) const {

253 auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {

255 return nullptr;

256

258 if (GI->is(Intrinsic::amdgcn_rcp))

259 return MRI.getVRegDef(MI.getOperand(2).getReg());

260 }

261 return nullptr;

262 };

263

264 auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {

266 return nullptr;

267 MachineInstr *SqrtSrcMI = nullptr;

268 auto Match =

270 (void)Match;

271 return SqrtSrcMI;

272 };

273

274 MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;

275

276 if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {

277 MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {

278 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})

280 .setMIFlags(MI.getFlags());

281 };

282 return true;

283 }

284

285

286 if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {

287 MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {

288 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})

290 .setMIFlags(MI.getFlags());

291 };

292 return true;

293 }

294 return false;

295}

296

297bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(

298 MachineInstr &MI) const {

299 Register Sqrt = MI.getOperand(2).getReg();

300 return MRI.hasOneNonDBGUse(Sqrt);

301}

302

303void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(

304 MachineInstr &MI, const Register &X) const {

305 Register Dst = MI.getOperand(0).getReg();

307 LLT DstTy = MRI.getType(Dst);

308 uint32_t Flags = MI.getFlags();

309 Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})

310 .addUse(X)

311 .setMIFlags(Flags)

312 .getReg(0);

313 B.buildFMul(Dst, RSQ, Y, Flags);

314 MI.eraseFromParent();

315}

316

317bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(

318 MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {

319 Register SrcReg = MI.getOperand(1).getReg();

320

321

323

325 int64_t ShiftAmt;

328 const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

329

330 unsigned ShiftOffset = 8 * Offset;

331 if (IsShr)

332 ShiftOffset += ShiftAmt;

333 else

334 ShiftOffset -= ShiftAmt;

335

336 MatchInfo.CvtVal = Src0;

337 MatchInfo.ShiftOffset = ShiftOffset;

338 return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;

339 }

340

341

342 return false;

343}

344

345void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(

346 MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {

347 unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

348

350 Register CvtSrc = MatchInfo.CvtVal;

351 LLT SrcTy = MRI.getType(MatchInfo.CvtVal);

352 if (SrcTy != S32) {

354 CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);

355 }

356

357 assert(MI.getOpcode() != NewOpc);

358 B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());

359 MI.eraseFromParent();

360}

361

362bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(

364 const SITargetLowering *TLI = static_cast<const SITargetLowering *>(

365 MF.getSubtarget().getTargetLowering());

366 Reg = MI.getOperand(1).getReg();

368}

369

370

371

372

373

374

375

376bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(

377 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {

378 Register LoadReg = MI.getOperand(1).getReg();

379 if ( MRI .hasOneNonDBGUse(LoadReg))

380 return false;

381

382

383

384 MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);

385 int64_t Width = MI.getOperand(2).getImm();

387 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:

388 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};

389 return Width == 8;

390 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:

391 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};

392 return Width == 16;

393 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:

394 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};

395 return Width == 8;

396 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:

397 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};

398 return Width == 16;

399 }

400 return false;

401}

402

403

404

405void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(

406 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {

407 auto [LoadMI, NewOpcode] = MatchData;

409

410

411 Register SignExtendInsnDst = MI.getOperand(0).getReg();

413

414 MI.eraseFromParent();

415}

416

417bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(

418 MachineInstr &MI, unsigned &NewOpcode) const {

419 Register Src0 = MI.getOperand(1).getReg();

420 Register Src1 = MI.getOperand(2).getReg();

422 return false;

423

424 if (VT->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&

425 VT->getKnownBits(Src0).countMinLeadingZeros() >= 32) {

426 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;

427 return true;

428 }

429

430 if (VT->computeNumSignBits(Src1) >= 33 &&

431 VT->computeNumSignBits(Src0) >= 33) {

432 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;

433 return true;

434 }

435 return false;

436}

437

438

439

440

441class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {

442public:

443 static char ID;

444

445 AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

446

447 StringRef getPassName() const override {

448 return "AMDGPUPostLegalizerCombiner";

449 }

450

451 bool runOnMachineFunction(MachineFunction &MF) override;

452

453 void getAnalysisUsage(AnalysisUsage &AU) const override;

454

455private:

456 bool IsOptNone;

457 AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;

458};

459}

460

461void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {

465 AU.addRequired();

466 AU.addPreserved();

467 if (!IsOptNone) {

468 AU.addRequired();

469 AU.addPreserved();

470 }

472}

473

474AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)

475 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {

476 if (!RuleConfig.parseCommandLineOption())

478}

479

480bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {

482 return false;

483 auto *TPC = &getAnalysis();

485 bool EnableOpt =

487

491

493 &getAnalysis().get(MF);

495 IsOptNone ? nullptr

496 : &getAnalysis().getDomTree();

497

498 CombinerInfo CInfo( false, true,

499 LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

500

501 CInfo.MaxIterations = 1;

503

504 CInfo.EnableFullDCE = false;

505 AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, nullptr,

506 RuleConfig, ST, MDT, LI);

507 return Impl.combineMachineInstrs();

508}

509

510char AMDGPUPostLegalizerCombiner::ID = 0;

512 "Combine AMDGPU machine instrs after legalization", false,

513 false)

517 "Combine AMDGPU machine instrs after legalization", false,

519

521 return new AMDGPUPostLegalizerCombiner(IsOptNone);

522}

unsigned const MachineRegisterInfo * MRI

assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

#define GET_GICOMBINER_CONSTRUCTOR_INITS

const TargetInstrInfo & TII

This contains common combine transformations that may be used in a combine pass.

This file declares the targeting of the MachineLegalizer class for AMDGPU.

Provides AMDGPU specific target descriptions.

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Analysis containing CSE Info

This contains common combine transformations that may be used in a combine pass, or by the target else...

Option class for Targets to specify which operations are combined how and when.

This contains the base class for all Combiners generated by TableGen.

AMD GCN specific subclass of TargetSubtarget.

Provides analysis for querying information about KnownBits during GISel passes.

Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...

Contains matchers for matching SSA Machine Instructions.

Promote Memory to Register

#define INITIALIZE_PASS_DEPENDENCY(depName)

#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)

#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)

static StringRef getName(Value *V)

static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")

static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")

Target-Independent Code Generator Pass Configuration Options pass.

static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)

Constructs an APInt value that has the top hiBitsSet bits set.

AnalysisUsage & addRequired()

AnalysisUsage & addPreserved()

Add the specified Pass class to the set of analyses preserved by this pass.

LLVM_ABI void setPreservesCFG()

This function should be called by the pass, iff they do not:

Predicate

This enumeration lists the possible predicates for CmpInst subclasses.

@ FCMP_OGT

0 0 1 0 True if ordered and greater than

Predicate getSwappedPredicate() const

For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.

Predicate getInversePredicate() const

For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...

Predicate getUnorderedPredicate() const

GISelValueTracking * getValueTracking() const

bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount) const

FunctionPass class - This class is used to implement most global optimizations.

To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelValueTrackingInfoAnal...

bool maskedValueIsZero(Register Val, const APInt &Mask)

constexpr bool isScalar() const

static constexpr LLT scalar(unsigned SizeInBits)

Get a low-level scalar or aggregate "bag of bits".

constexpr TypeSize getSizeInBits() const

Returns the total size of the type. Must only be called on sized types.

const MCInstrDesc & get(unsigned Opcode) const

Return the machine instruction descriptor that corresponds to the specified instruction opcode.

DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...

void getAnalysisUsage(AnalysisUsage &AU) const override

getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.

const TargetSubtargetInfo & getSubtarget() const

getSubtarget - Return the subtarget for which this machine code is being compiled.

Function & getFunction()

Return the LLVM function that this machine code represents.

const MachineFunctionProperties & getProperties() const

Get the function properties.

const TargetMachine & getTarget() const

getTarget - Return the target machine this machine code is compiled with

Helper class to build MachineInstr.

Representation of each machine instruction.

unsigned getOpcode() const

Returns the opcode of this MachineInstr.

LLVM_ABI void setDesc(const MCInstrDesc &TID)

Replace the instruction descriptor (thus opcode) of the current instruction with a new one.

const MachineOperand & getOperand(unsigned i) const

LLVM_ABI void setReg(Register Reg)

Change the register this operand corresponds to.

getReg - Returns the register number.

Wrapper class representing virtual and physical registers.

bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const

CodeGenOptLevel getOptLevel() const

Returns the optimization level: None, Less, Default, or Aggressive.

Target-Independent Code Generator Pass Configuration Options.

constexpr std::underlying_type_t< E > Mask()

Get a bitmask with 1s in all places up to the high-order bit of E's largest value.

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

operand_type_match m_Reg()

UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)

ConstantMatch< APInt > m_ICst(APInt &Cst)

UnaryOp_match< SrcTy, TargetOpcode::G_FSQRT > m_GFSqrt(const SrcTy &Src)

bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)

BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)

bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)

BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)

Predicate getPredicate(unsigned Condition, unsigned Hint)

Return predicate consisting of specified condition and hint bits.

This is an optimization pass for GlobalISel generic memory operations.

decltype(auto) dyn_cast(const From &Val)

dyn_cast - Return the argument parameter cast to the specified type.

LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)

LLVM_ABI void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)

Modify analysis usage so it preserves passes required for the SelectionDAG fallback.

FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)

Definition AMDGPUPostLegalizerCombiner.cpp:520

void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)

Implement std::swap in terms of BitVector swap.

@ SinglePass

Enables Observer-based DCE and additional heuristics that retry combining defined and used instructio...