LLVM: lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp Source File (original) (raw)
1
2
3
4
5
6
7
8
9
10
11
12
13
28#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31#define GET_GICOMBINER_DEPS
32#include "AMDGPUGenPreLegalizeGICombiner.inc"
33#undef GET_GICOMBINER_DEPS
34
35#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
36
37using namespace llvm;
39
40namespace {
41#define GET_GICOMBINER_TYPES
42#include "AMDGPUGenPostLegalizeGICombiner.inc"
43#undef GET_GICOMBINER_TYPES
44
// Combiner implementation class for this file. It derives from Combiner and
// pulls further members in from AMDGPUGenPostLegalizeGICombiner.inc under
// GET_GICOMBINER_CLASS_MEMBERS (see the private section at the bottom).
// NOTE(review): the embedded original line numbers skip in several places
// in this listing (e.g. 47 -> 50, 54 -> 57, 66 -> 70, 77 -> 80), so a number
// of member declarations are not visible here.
45class AMDGPUPostLegalizerCombinerImpl : public Combiner {
46protected:
  // Rule configuration, held by const reference (not owned by this object).
47 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
50
52
53public:
  // Constructor declaration; only the RuleConfig parameter line is visible.
54 AMDGPUPostLegalizerCombinerImpl(
57 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
60
  // Returns the fixed name string of this combiner implementation.
61 static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }
62
  // Overrides a virtual member of the base class (declared `override`).
64 bool tryCombineAll(MachineInstr &I) const override;
65
  // Match data for the FMin/FMax-legacy combine. The fields (original lines
  // 67-69) are not visible in this listing.
66 struct FMinFMaxLegacyInfo {
70 };
71
72
74 FMinFMaxLegacyInfo &Info) const;
75 void applySelectFCmpToFMinFMaxLegacy(MachineInstr &MI,
76 const FMinFMaxLegacyInfo &Info) const;
77
80
81 bool
84
87
88
89
  // Match data handed from matchCvtF32UByteN to applyCvtF32UByteN.
  // ShiftOffset is a bit offset: the apply step divides it by 8 to select
  // the new opcode. The CvtVal field (original line 91) is not visible here.
90 struct CvtF32UByteMatchInfo {
92 unsigned ShiftOffset;
93 };
94
96 CvtF32UByteMatchInfo &MatchInfo) const;
98 const CvtF32UByteMatchInfo &MatchInfo) const;
99
101
102
103
  // Match/apply pair whose match data is a (MachineInstr *, unsigned) pair.
104 bool matchCombineSignExtendInReg(
105 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
106 void applyCombineSignExtendInReg(
107 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
108
109
110
111
112
113
  // Reports the replacement opcode through the NewOpcode out-parameter.
114 bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
115
116private:
  // While the generated file is included, AMDGPUSubtarget is #defined to
  // GCNSubtarget; both helper macros are undefined again right afterwards.
117#define GET_GICOMBINER_CLASS_MEMBERS
118#define AMDGPUSubtarget GCNSubtarget
119#include "AMDGPUGenPostLegalizeGICombiner.inc"
120#undef GET_GICOMBINER_CLASS_MEMBERS
121#undef AMDGPUSubtarget
122};
123
124#define GET_GICOMBINER_IMPL
125#define AMDGPUSubtarget GCNSubtarget
126#include "AMDGPUGenPostLegalizeGICombiner.inc"
127#undef AMDGPUSubtarget
128#undef GET_GICOMBINER_IMPL
129
// Constructor definition. The visible initializers forward MF, CInfo, TPC,
// &VT and CSEInfo to the Combiner base, bind RuleConfig and STI, take TII
// from STI.getInstrInfo(), and construct Helper. The generated .inc file is
// included inside the initializer list and the constructor body is empty.
// NOTE(review): original lines 131-132, 134, 138 and 140 are absent from
// this listing, so the full parameter list and the macro that guards the
// #include below are not visible.
130AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
133 const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
135 : Combiner(MF, CInfo, TPC, &VT, CSEInfo), RuleConfig(RuleConfig), STI(STI),
136 TII(*STI.getInstrInfo()),
137 Helper(Observer, B, false, &VT, MDT, LI, STI),
139#include "AMDGPUGenPostLegalizeGICombiner.inc"
141{
142}
143
// Tries to combine a single instruction. tryCombineAllImpl(MI) is tried
// first and a true result is returned immediately; otherwise the opcode is
// inspected for the three shift opcodes below. Returns false when control
// reaches the end of the function.
144bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
145 if (tryCombineAllImpl(MI))
146 return true;
147
148 switch (MI.getOpcode()) {
149 case TargetOpcode::G_SHL:
150 case TargetOpcode::G_LSHR:
151 case TargetOpcode::G_ASHR:
152
153
154
  // NOTE(review): original line 155, the statement executed for these shift
  // cases, is absent from this listing.
156 }
157
158 return false;
159}
160
// Match step that relates the fcmp operands recorded in Info (LHS/RHS) to
// the values in MI's operands 2 and 3 (named True and False here).
// NOTE(review): original lines 163, 166-169, 183 and 186 are absent from
// this listing, so the first guard condition, the code that fills Info, the
// body of the last `if`, and the final return are not visible.
161bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
162 MachineInstr &MI, MachineInstr &FCmp, FMinFMaxLegacyInfo &Info) const {
164 return false;
165
170 Register True = MI.getOperand(2).getReg();
171 Register False = MI.getOperand(3).getReg();
172
173
174
  // Reject unless (LHS, RHS) is exactly (True, False) or (False, True).
175 if ((Info.LHS != True || Info.RHS != False) &&
176 (Info.LHS != False || Info.RHS != True))
177 return false;
178
179
180
181
  // Reached only for the two accepted pairings, so LHS != True here means
  // the compare operands are in the opposite order to the select operands.
182 if (Info.LHS != True)
184
185
187}
188
// Apply step: builds one instruction with opcode Opc that defines MI's
// operand 0 from the inputs {X, Y}, carrying over MI's flags, and then
// erases MI.
// NOTE(review): original lines 191, 193-195 and 199 are absent from this
// listing, so the definitions of Opc, X and Y are not visible; only the
// G_AMDGPU_FMIN_LEGACY alternative of a conditional expression remains.
189void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinFMaxLegacy(
190 MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
192 : AMDGPU::G_AMDGPU_FMIN_LEGACY;
196
197
198
200 }
201
202 B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
203
  // The original instruction is removed after its replacement is built.
204 MI.eraseFromParent();
205}
206
// Match step for the uchar-to-float combine. Reads the destination register
// (operand 0) and its type, then the source register (operand 1) and its
// size in bits. Returns false when control falls out of the block below.
// NOTE(review): original lines 216, 220 and 221 are absent from this
// listing, so the condition that opens the block closed at line 222 and the
// positive return are not visible.
207bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
208 MachineInstr &MI) const {
209 Register DstReg = MI.getOperand(0).getReg();
210
211
212
213
214
215 LLT Ty = MRI.getType(DstReg);
217 Register SrcReg = MI.getOperand(1).getReg();
218 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
  // Only 16-, 32- and 64-bit sources are expected at this point.
219 assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
222 }
223
224 return false;
225}
226
// Apply step: replaces MI with G_AMDGPU_CVT_F32_UBYTE0 of its source
// (operand 1), adding an FPTrunc when the destination type is not S32.
// MI's flags are propagated to every instruction built here, and MI is
// erased at the end.
// NOTE(review): original line 229 is absent from this listing; S32 is used
// below but its definition is not visible.
227void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
228 MachineInstr &MI) const {
230
231 Register DstReg = MI.getOperand(0).getReg();
232 Register SrcReg = MI.getOperand(1).getReg();
233 LLT Ty = MRI.getType(DstReg);
234 LLT SrcTy = MRI.getType(SrcReg);
  // Bring the source to S32 first (any-extend or truncate as needed).
235 if (SrcTy != S32)
236 SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
237
238 if (Ty == S32) {
  // Destination is already S32: convert straight into DstReg.
239 B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
240 MI.getFlags());
241 } else {
  // Otherwise convert into a fresh S32 value and FPTrunc it into DstReg.
242 auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
243 MI.getFlags());
244 B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
245 }
246
247 MI.eraseFromParent();
248}
249
// Match step that looks for an rcp/sqrt pair in either nesting order. On
// success it stores into MatchInfo a callback that builds an amdgcn_rsq
// intrinsic defining MI's operand 0 with MI's flags, and returns true;
// otherwise it returns false.
// NOTE(review): original lines 254, 257, 265, 269, 279 and 289 are absent
// from this listing. The guards at the top of both lambdas, the sqrt match
// expression, and the line between buildIntrinsic(...) and .setMIFlags(...)
// in both callbacks (presumably the source-operand use) are not visible.
// NOTE(review): both callbacks capture MI by reference, so MatchInfo must
// only be invoked while MI is still alive.
250bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
251 MachineInstr &MI,
252 std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  // Returns the instruction defining operand 2's register when GI is the
  // amdgcn_rcp intrinsic; nullptr otherwise.
253 auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
255 return nullptr;
256
258 if (GI->is(Intrinsic::amdgcn_rcp))
259 return MRI.getVRegDef(MI.getOperand(2).getReg());
260 }
261 return nullptr;
262 };
263
  // Returns SqrtSrcMI, which is nullptr unless the (not visible) match
  // expression bound it. The match result itself is deliberately ignored.
264 auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
266 return nullptr;
267 MachineInstr *SqrtSrcMI = nullptr;
268 auto Match =
270 (void)Match;
271 return SqrtSrcMI;
272 };
273
274 MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
275
  // First ordering: MI passes getRcpSrc and its source passes getSqrtSrc.
276 if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
277 MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
278 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
280 .setMIFlags(MI.getFlags());
281 };
282 return true;
283 }
284
285
  // Second ordering: MI passes getSqrtSrc and its source passes getRcpSrc.
286 if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
287 MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
288 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
290 .setMIFlags(MI.getFlags());
291 };
292 return true;
293 }
294 return false;
295}
296
// Match step: succeeds only when the register in MI's operand 2 (named Sqrt
// here) has exactly one non-debug use according to MRI.
297bool AMDGPUPostLegalizerCombinerImpl::matchFDivSqrtToRsqF16(
298 MachineInstr &MI) const {
299 Register Sqrt = MI.getOperand(2).getReg();
300 return MRI.hasOneNonDBGUse(Sqrt);
301}
302
// Apply step: builds an amdgcn_rsq intrinsic of X with the destination's
// type, multiplies the result by Y into MI's destination register, and
// erases MI. MI's flags are copied onto both new instructions.
// NOTE(review): original line 306 is absent from this listing; Y is used
// below but its definition is not visible.
303void AMDGPUPostLegalizerCombinerImpl::applyFDivSqrtToRsqF16(
304 MachineInstr &MI, const Register &X) const {
305 Register Dst = MI.getOperand(0).getReg();
307 LLT DstTy = MRI.getType(Dst);
308 uint32_t Flags = MI.getFlags();
309 Register RSQ = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {DstTy})
310 .addUse(X)
311 .setMIFlags(Flags)
312 .getReg(0);
313 B.buildFMul(Dst, RSQ, Y, Flags);
314 MI.eraseFromParent();
315}
316
// Match step that folds a constant shift on the source of a
// G_AMDGPU_CVT_F32_UBYTEn instruction into the byte index. The current byte
// index is the distance of MI's opcode from G_AMDGPU_CVT_F32_UBYTE0; it is
// scaled to bits and then adjusted by ShiftAmt.
// NOTE(review): original lines 322, 324, 326 and 327 are absent from this
// listing, so the definitions of Src0 and IsShr and the condition that
// opens the block closed at line 339 are not visible.
// NOTE(review): MatchInfo is written before the final range check, so it
// is modified even when this function returns false from that check.
317bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
318 MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
319 Register SrcReg = MI.getOperand(1).getReg();
320
321
323
325 int64_t ShiftAmt;
328 const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
329
  // Convert the byte index to a bit offset, then add or subtract ShiftAmt
  // depending on IsShr.
330 unsigned ShiftOffset = 8 * Offset;
331 if (IsShr)
332 ShiftOffset += ShiftAmt;
333 else
334 ShiftOffset -= ShiftAmt;
335
336 MatchInfo.CvtVal = Src0;
337 MatchInfo.ShiftOffset = ShiftOffset;
  // Accept only whole-byte offsets in the range [8, 32).
338 return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
339 }
340
341
342 return false;
343}
344
// Apply step: rebuilds MI using the opcode ShiftOffset / 8 positions after
// G_AMDGPU_CVT_F32_UBYTE0, with MatchInfo.CvtVal as its source (any-extended
// to S32 when its type differs from S32), then erases MI.
// NOTE(review): original lines 349 and 353 are absent from this listing;
// S32 is used below but its definition is not visible.
345void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
346 MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
347 unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
348
350 Register CvtSrc = MatchInfo.CvtVal;
351 LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
352 if (SrcTy != S32) {
354 CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
355 }
356
  // The new opcode is expected to differ from MI's current opcode.
357 assert(MI.getOpcode() != NewOpc);
358 B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
359 MI.eraseFromParent();
360}
361
// Match step for removing an fcanonicalize. The visible code obtains the
// SITargetLowering from MF's subtarget via static_cast and stores MI's
// operand 1 register into Reg.
// NOTE(review): original lines 363 and 367 are absent from this listing,
// so the rest of the signature (including the declaration of Reg) and the
// return statement are not visible.
362bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
364 const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
365 MF.getSubtarget().getTargetLowering());
366 Reg = MI.getOperand(1).getReg();
368}
369
370
371
372
373
374
375
// Match step pairing MI (whose operand 2 is an immediate width) with the
// buffer load that defines its operand 1. Each unsigned buffer-load opcode
// below maps to its signed counterpart, and the match succeeds only when
// the width equals the load size: 8 for the *_UBYTE opcodes, 16 for the
// *_USHORT opcodes. Any other defining opcode yields false.
// NOTE(review): MatchData is assigned before the width comparison, so it is
// modified even when a case returns false.
// NOTE(review): original line 386 (presumably the switch header for the
// case labels below) is absent from this listing.
376bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
377 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
378 Register LoadReg = MI.getOperand(1).getReg();
  // NOTE(review): the text before `.hasOneNonDBGUse` on the next line was
  // lost in this listing; the receiver object (and any leading operator) is
  // not visible.
379 if (.hasOneNonDBGUse(LoadReg))
380 return false;
381
382
383
384 MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
385 int64_t Width = MI.getOperand(2).getImm();
387 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
388 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
389 return Width == 8;
390 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
391 MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
392 return Width == 16;
393 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
394 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
395 return Width == 8;
396 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
397 MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
398 return Width == 16;
399 }
400 return false;
401}
402
403
404
// Apply step: unpacks MatchData into the load instruction and the new
// opcode, reads MI's destination register (operand 0), and erases MI.
// NOTE(review): original lines 408 and 412 are absent from this listing,
// so the statements that use NewOpcode and SignExtendInsnDst are not
// visible.
405void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
406 MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
407 auto [LoadMI, NewOpcode] = MatchData;
409
410
411 Register SignExtendInsnDst = MI.getOperand(0).getReg();
413
414 MI.eraseFromParent();
415}
416
// Match step that picks a narrower multiply opcode for MI based on value
// tracking of its two sources (operands 1 and 2). NewOpcode is written only
// on the two successful paths; the function returns false otherwise.
// NOTE(review): original line 421 is absent from this listing, so the
// condition guarding the first `return false` is not visible.
417bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
418 MachineInstr &MI, unsigned &NewOpcode) const {
419 Register Src0 = MI.getOperand(1).getReg();
420 Register Src1 = MI.getOperand(2).getReg();
422 return false;
423
  // Both sources have at least 32 known leading zero bits: use the
  // unsigned form. This check takes priority over the signed one below.
424 if (VT->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
425 VT->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
426 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
427 return true;
428 }
429
  // Both sources have at least 33 sign bits: use the signed form.
430 if (VT->computeNumSignBits(Src1) >= 33 &&
431 VT->computeNumSignBits(Src0) >= 33) {
432 NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
433 return true;
434 }
435 return false;
436}
437
438
439
440
// MachineFunctionPass wrapper for the post-legalizer combiner. It stores
// the IsOptNone flag given at construction and owns a RuleConfig instance.
441class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
442public:
  // Pass identification member; defined later in the file.
443 static char ID;
444
  // IsOptNone defaults to false.
445 AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
446
  // Returns the fixed pass name string.
447 StringRef getPassName() const override {
448 return "AMDGPUPostLegalizerCombiner";
449 }
450
451 bool runOnMachineFunction(MachineFunction &MF) override;
452
453 void getAnalysisUsage(AnalysisUsage &AU) const override;
454
455private:
456 bool IsOptNone;
457 AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
458};
459}
460
// Declares the analyses this pass requires and preserves. One
// required/preserved pair is registered unconditionally and a second pair
// only when IsOptNone is false.
// NOTE(review): the calls below carry no template arguments in this
// listing; the analysis type names appear to have been lost in extraction.
// Original lines 462-464 and 471 are also absent.
461void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
465 AU.addRequired();
466 AU.addPreserved();
467 if (!IsOptNone) {
468 AU.addRequired();
469 AU.addPreserved();
470 }
472}
473
// Constructor: passes ID to the MachineFunctionPass base, records
// IsOptNone, and asks RuleConfig to parse its command-line option.
// NOTE(review): original line 477, the statement executed when
// parseCommandLineOption() returns false, is absent from this listing.
474AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
475 : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
476 if (!RuleConfig.parseCommandLineOption())
478}
479
// Runs the combiner over MF: gathers analyses, fills in a CombinerInfo,
// constructs AMDGPUPostLegalizerCombinerImpl and returns the result of its
// combineMachineInstrs() call.
// NOTE(review): original lines 481, 484, 486, 488-490, 492, 494 and 502 are
// absent from this listing, so the early-exit condition and the
// declarations of F, ST, LI, VT and MDT are not visible. The getAnalysis()
// calls also carry no template arguments here.
480bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
482 return false;
483 auto *TPC = &getAnalysis();
485 bool EnableOpt =
487
491
493 &getAnalysis().get(MF);
495 IsOptNone ? nullptr
496 : &getAnalysis().getDomTree();
497
  // The two leading boolean arguments are unlabeled in this listing.
498 CombinerInfo CInfo( false, true,
499 LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
500
  // Limit the combiner to a single iteration.
501 CInfo.MaxIterations = 1;
503
  // Full dead-code elimination is switched off for this combiner.
504 CInfo.EnableFullDCE = false;
  // nullptr is passed in the fifth argument position.
505 AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *VT, nullptr,
506 RuleConfig, ST, MDT, LI);
507 return Impl.combineMachineInstrs();
508}
509
// Definition of the pass's static ID member, initialized to 0.
510char AMDGPUPostLegalizerCombiner::ID = 0;
// NOTE(review): the four lines below are argument fragments of the pass
// registration; the lines naming the macros (original 511, 514-516, 518)
// are absent from this listing.
512 "Combine AMDGPU machine instrs after legalization", false,
513 false)
517 "Combine AMDGPU machine instrs after legalization", false,
519
// Tail of a factory function that returns a newly allocated
// AMDGPUPostLegalizerCombiner built with IsOptNone. Its header (original
// line 520) is absent from this listing; the page index further down names
// it createAMDGPUPostLegalizeCombiner(bool IsOptNone).
521 return new AMDGPUPostLegalizerCombiner(IsOptNone);
522}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
#define GET_GICOMBINER_CONSTRUCTOR_INITS
const TargetInstrInfo & TII
This contains common combine transformations that may be used in a combine pass.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This contains common combine transformations that may be used in a combine pass, or by the target else...
Option class for Targets to specify which operations are combined how and when.
This contains the base class for all Combiners generated by TableGen.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
Promote Memory to Register
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static StringRef getName(Value *V)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Target-Independent Code Generator Pass Configuration Options pass.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getUnorderedPredicate() const
GISelValueTracking * getValueTracking() const
bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount) const
FunctionPass class - This class is used to implement most global optimizations.
To use KnownBitsInfo analysis in a pass, KnownBitsInfo &Info = getAnalysis<GISelValueTrackingInfoAnal...
bool maskedValueIsZero(Register Val, const APInt &Mask)
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineFunctionProperties & getProperties() const
Get the function properties.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
Wrapper class representing virtual and physical registers.
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
operand_type_match m_Reg()
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
UnaryOp_match< SrcTy, TargetOpcode::G_FSQRT > m_GFSqrt(const SrcTy &Src)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_SHL, false > m_GShl(const LHS &L, const RHS &R)
bind_ty< MachineInstr * > m_MInstr(MachineInstr *&MI)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Predicate getPredicate(unsigned Condition, unsigned Hint)
Return predicate consisting of specified condition and hint bits.
This is an optimization pass for GlobalISel generic memory operations.
decltype(auto) dyn_cast(const From &Val)
dyn_cast - Return the argument parameter cast to the specified type.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
LLVM_ABI void getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU)
Modify analysis usage so it preserves passes required for the SelectionDAG fallback.
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
Definition AMDGPUPostLegalizerCombiner.cpp:520
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ SinglePass
Enables Observer-based DCE and additional heuristics that retry combining defined and used instructio...