LLVM: lib/Target/AMDGPU/SIFoldOperands.cpp Source File
23#define DEBUG_TYPE "si-fold-operands"
24using namespace llvm;
25
26namespace {
27
28
29
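// Track a foldable definition: either a machine operand (register or global),
// an immediate, or a frame index, together with the register class and
// subregister index of the def being folded.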
30struct FoldableDef {
31 union {
34 int FrameIndexToFold;
35 };
36
37
39
40
42
43
44 unsigned DefSubReg = AMDGPU::NoSubRegister;
45
46
48
49 FoldableDef() = delete;
51 unsigned DefSubReg = AMDGPU::NoSubRegister)
52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {
53
54 if (FoldOp.isImm()) {
55 ImmToFold = FoldOp.getImm();
56 } else if (FoldOp.isFI()) {
57 FrameIndexToFold = FoldOp.getIndex();
58 } else {
60 OpToFold = &FoldOp;
61 }
62
64 }
65
67 unsigned DefSubReg = AMDGPU::NoSubRegister)
68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),
70
71
73 FoldableDef Copy(*this);
74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);
75 return Copy;
76 }
77
79
82 return OpToFold->getReg();
83 }
84
85 unsigned getSubReg() const {
87 return OpToFold->getSubReg();
88 }
89
91
92 bool isFI() const {
94 }
95
96 int getFI() const {
98 return FrameIndexToFold;
99 }
100
102
103
104
105
106 std::optional<int64_t> getEffectiveImmVal() const {
109 }
110
111
112
114 unsigned OpIdx) const {
115 switch (Kind) {
117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();
118 if (!ImmToFold)
119 return false;
120
121
122
124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
125 }
127 if (DefSubReg != AMDGPU::NoSubRegister)
128 return false;
130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);
131 }
132 default:
133
134
135 if (DefSubReg != AMDGPU::NoSubRegister)
136 return false;
137 return TII.isOperandLegal(MI, OpIdx, OpToFold);
138 }
139
141 }
142};
143
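// A single fold candidate: the use instruction and operand number the
// FoldableDef will be substituted into, plus whether the use had to be
// commuted and an optional 32-bit shrink opcode.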
144struct FoldCandidate {
146 FoldableDef Def;
147 int ShrinkOpcode;
148 unsigned UseOpNo;
149 bool Commuted;
150
151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,
152 bool Commuted = false, int ShrinkOp = -1)
153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
154 Commuted(Commuted) {}
155
156 bool isFI() const { return Def.isFI(); }
157
158 int getFI() const {
160 return Def.FrameIndexToFold;
161 }
162
163 bool isImm() const { return Def.isImm(); }
164
165 bool isReg() const { return Def.isReg(); }
166
168
169 bool isGlobal() const { return Def.isGlobal(); }
170
171 bool needsShrink() const { return ShrinkOpcode != -1; }
172};
173
174class SIFoldOperandsImpl {
175public:
182
184 const FoldableDef &OpToFold) const;
185
186
187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {
188 switch (Opc) {
189 case AMDGPU::S_ADD_I32: {
190 if (ST->hasAddNoCarry())
191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;
192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
193 }
194 case AMDGPU::S_OR_B32:
195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;
196 case AMDGPU::S_AND_B32:
197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;
198 case AMDGPU::S_MUL_I32:
199 return AMDGPU::V_MUL_LO_U32_e64;
200 default:
201 return AMDGPU::INSTRUCTION_LIST_END;
202 }
203 }
204
205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
207
209
210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,
211 int64_t ImmVal) const;
212
213
214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
215 int64_t ImmVal) const;
216
219 const FoldableDef &OpToFold) const;
222
225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;
226
228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
230
231 std::pair<int64_t, const TargetRegisterClass *>
233
235 int64_t SplatVal,
237
238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,
239 unsigned UseOpIdx,
241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
244
245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;
250
251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
254
257
258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
263
265
266public:
267 SIFoldOperandsImpl() = default;
268
270};
271
272class SIFoldOperandsLegacy : public MachineFunctionPass {
273public:
274 static char ID;
275
277
278 bool runOnMachineFunction(MachineFunction &MF) override {
279 if (skipFunction(MF.getFunction()))
280 return false;
281 return SIFoldOperandsImpl().run(MF);
282 }
283
284 StringRef getPassName() const override { return "SI Fold Operands"; }
285
286 void getAnalysisUsage(AnalysisUsage &AU) const override {
287 AU.setPreservesCFG();
288 MachineFunctionPass::getAnalysisUsage(AU);
289 }
290
293 }
294};
295
296}
297
299 false)
300
301char SIFoldOperandsLegacy::ID = 0;
302
304
305static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
306 const TargetRegisterInfo &TRI,
307 const MachineOperand &MO) {
308 const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
309 if (const TargetRegisterClass *SubRC =
310 TRI.getSubRegisterClass(RC, MO.getSubReg()))
311 RC = SubRC;
312 return RC;
313}
314
315
316static unsigned macToMad(unsigned Opc) {
317 switch (Opc) {
318 case AMDGPU::V_MAC_F32_e64:
319 return AMDGPU::V_MAD_F32_e64;
320 case AMDGPU::V_MAC_F16_e64:
321 return AMDGPU::V_MAD_F16_e64;
322 case AMDGPU::V_FMAC_F32_e64:
323 return AMDGPU::V_FMA_F32_e64;
324 case AMDGPU::V_FMAC_F16_e64:
325 return AMDGPU::V_FMA_F16_gfx9_e64;
326 case AMDGPU::V_FMAC_F16_t16_e64:
327 return AMDGPU::V_FMA_F16_gfx9_t16_e64;
328 case AMDGPU::V_FMAC_F16_fake16_e64:
329 return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
330 case AMDGPU::V_FMAC_LEGACY_F32_e64:
331 return AMDGPU::V_FMA_LEGACY_F32_e64;
332 case AMDGPU::V_FMAC_F64_e64:
333 return AMDGPU::V_FMA_F64_e64;
334 }
335 return AMDGPU::INSTRUCTION_LIST_END;
336}
337
338
339
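// Return true if folding a frame index into this use operand is likely legal
// and profitable: scalar or VALU adds with an immediate on the other source,
// or the address operand of MUBUF / flat-scratch accesses.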
340bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
341 const FoldableDef &OpToFold) const {
342 if (!OpToFold.isFI())
343 return false;
344
345 const unsigned Opc = UseMI.getOpcode();
346 switch (Opc) {
347 case AMDGPU::S_ADD_I32:
348 case AMDGPU::S_ADD_U32:
349 case AMDGPU::V_ADD_U32_e32:
350 case AMDGPU::V_ADD_CO_U32_e32:
351
352
353
354 return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
355 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
356 case AMDGPU::V_ADD_U32_e64:
357 case AMDGPU::V_ADD_CO_U32_e64:
358 return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
359 MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
360 default:
361 break;
362 }
363
365 return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
366 if (!TII->isFLATScratch(UseMI))
367 return false;
368
369 int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
370 if (OpNo == SIdx)
371 return true;
372
373 int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
374 return OpNo == VIdx && SIdx == -1;
375}
376
377
378
379
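// Rewrite "%vgpr = COPY (S_ADD_I32 x, frameindex)" into a VALU add so the
// frame index can be materialized directly in a VGPR and the scalar add
// (with its dead SCC def) can be erased.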
380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(
382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&
383 MRI->hasOneNonDBGUse(SrcReg)) {
384 MachineInstr *Def = MRI->getVRegDef(SrcReg);
385 if (!Def || Def->getNumOperands() != 4)
386 return false;
387
388 MachineOperand *Src0 = &Def->getOperand(1);
389 MachineOperand *Src1 = &Def->getOperand(2);
390
391
392
393
394 if (!Src0->isFI() && !Src1->isFI())
395 return false;
396
397 if (Src0->isFI())
399
400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);
401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);
402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||
403 !Def->getOperand(3).isDead())
404 return false;
405
406 MachineBasicBlock *MBB = Def->getParent();
408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {
409 MachineInstrBuilder Add =
411
412 if (Add->getDesc().getNumDefs() == 2) {
413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());
415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());
416 }
417
418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());
420 Add.addImm(0);
421
422 Def->eraseFromParent();
423 MI.eraseFromParent();
424 return true;
425 }
426
427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);
428
432
434 .add(*Src0)
435 .add(*Src1)
438 Def->eraseFromParent();
439 MI.eraseFromParent();
440 return true;
441 }
442 }
443
444 return false;
445}
446
448 return new SIFoldOperandsLegacy();
449}
450
451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,
452 unsigned UseOpNo,
453 int64_t ImmVal) const {
454 const uint64_t TSFlags = MI->getDesc().TSFlags;
455
459 return false;
460
462 int OpNo = MI->getOperandNo(&Old);
463
464 unsigned Opcode = MI->getOpcode();
465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
466 switch (OpType) {
467 default:
468 return false;
476
477
479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))
480 return false;
481 break;
482 }
483
484 return true;
485}
486
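// Try to fold a 32-bit packed immediate into an operand that carries op_sel
// modifiers, adjusting the op_sel bits or negating packed add/sub when that
// turns the value into an inline constant.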
487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,
488 int64_t ImmVal) const {
489 MachineOperand &Old = MI->getOperand(UseOpNo);
490 unsigned Opcode = MI->getOpcode();
491 int OpNo = MI->getOperandNo(&Old);
492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
493
494
495
496
499 return true;
500 }
501
502
503
504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;
505 unsigned SrcIdx = ~0;
506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
507 ModName = AMDGPU::OpName::src0_modifiers;
508 SrcIdx = 0;
509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
510 ModName = AMDGPU::OpName::src1_modifiers;
511 SrcIdx = 1;
512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
513 ModName = AMDGPU::OpName::src2_modifiers;
514 SrcIdx = 2;
515 }
516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);
517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);
518 MachineOperand &Mod = MI->getOperand(ModIdx);
519 unsigned ModVal = Mod.getImm();
520
521 uint16_t ImmLo =
522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
523 uint16_t ImmHi =
524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
527
528
529
530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {
534 return true;
535 }
536
537
538
539 uint16_t Lo = static_cast<uint16_t>(Imm);
540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
543 Mod.setImm(NewModVal);
545 return true;
546 }
547
548 if (static_cast<int16_t>(Lo) < 0) {
549 int32_t SExt = static_cast<int16_t>(Lo);
551 Mod.setImm(NewModVal);
553 return true;
554 }
555 }
556
557
562 return true;
563 }
564 }
565 } else {
566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
570 return true;
571 }
572 }
573
574 return false;
575 };
576
577 if (tryFoldToInline(Imm))
578 return true;
579
580
581
582
583
584
585
586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
589 unsigned ClampIdx =
590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;
592
593 if (!Clamp) {
594 uint16_t NegLo = -static_cast<uint16_t>(Imm);
595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;
597
598 if (tryFoldToInline(NegImm)) {
599 unsigned NegOpcode =
600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
601 MI->setDesc(TII->get(NegOpcode));
602 return true;
603 }
604 }
605 }
606
607 return false;
608}
609
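// Apply a previously collected FoldCandidate: rewrite the use operand in
// place as an immediate, frame index, global address or register, shrinking
// the instruction to its 32-bit encoding when required.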
610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
611 MachineInstr *MI = Fold.UseMI;
612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
614
615 std::optional<int64_t> ImmVal;
616 if (Fold.isImm())
617 ImmVal = Fold.Def.getEffectiveImmVal();
618
619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {
620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))
621 return true;
622
623
624
626 int OpNo = MI->getOperandNo(&Old);
627 if (!TII->isOperandLegal(*MI, OpNo, &New))
628 return false;
630 return true;
631 }
632
633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
634 MachineBasicBlock *MBB = MI->getParent();
637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
638 return false;
639 }
640
641 int Op32 = Fold.ShrinkOpcode;
642 MachineOperand &Dst0 = MI->getOperand(0);
643 MachineOperand &Dst1 = MI->getOperand(1);
645
646 bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());
647
648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);
650
651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);
652
653 if (HaveNonDbgCarryUse) {
657 }
658
659
660
661
662
663
664
665 Dst0.setReg(NewReg0);
666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
667 MI->removeOperand(I);
668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));
669
670 if (Fold.Commuted)
672 return true;
673 }
674
675 assert(!Fold.needsShrink() && "not handled");
676
677 if (ImmVal) {
680 if (NewMFMAOpc == -1)
681 return false;
682 MI->setDesc(TII->get(NewMFMAOpc));
683 MI->untieRegOperand(0);
684 const MCInstrDesc &MCID = MI->getDesc();
685 for (unsigned I = 0; I < MI->getNumDefs(); ++I)
687 MI->getOperand(I).setIsEarlyClobber(true);
688 }
689
690
692 int OpNo = MI->getOperandNo(&Old);
693 if (!TII->isOperandLegal(*MI, OpNo, &New))
694 return false;
695
697 return true;
698 }
699
700 if (Fold.isGlobal()) {
701 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),
702 Fold.Def.OpToFold->getOffset(),
703 Fold.Def.OpToFold->getTargetFlags());
704 return true;
705 }
706
707 if (Fold.isFI()) {
709 return true;
710 }
711
712 MachineOperand *New = Fold.Def.OpToFold;
713
714
715 if (const TargetRegisterClass *OpRC =
717 const TargetRegisterClass *NewRC =
718 TRI->getRegClassForReg(*MRI, New->getReg());
719
720 const TargetRegisterClass *ConstrainRC = OpRC;
721 if (New->getSubReg()) {
722 ConstrainRC =
723 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());
724
725 if (!ConstrainRC)
726 return false;
727 }
728
729 if (New->getReg().isVirtual() &&
730 !MRI->constrainRegClass(New->getReg(), ConstrainRC)) {
732 << TRI->getRegClassName(ConstrainRC) << '\n');
733 return false;
734 }
735 }
736
737
738
739 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
740 Old.setSubReg(AMDGPU::NoSubRegister);
741 if (New->getReg().isPhysical()) {
743 } else {
746 }
747 return true;
748}
749
750static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
751 FoldCandidate &&Entry) {
752
753 for (FoldCandidate &Fold : FoldList)
754 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
755 return;
756 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
757 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);
759}
760
763 const FoldableDef &FoldOp,
764 bool Commuted = false, int ShrinkOp = -1) {
766 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
767}
768
769
770
771
772static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand(
773 const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) {
774 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())
775 return false;
776 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];
778}
779
780
781
782
783static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(
784 const FoldableDef &OpToFold) {
785 assert(OpToFold.isImm() && "Expected immediate operand");
786 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();
790}
791
792bool SIFoldOperandsImpl::tryAddToFoldList(
793 SmallVectorImpl<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo,
794 const FoldableDef &OpToFold) const {
795 const unsigned Opc = MI->getOpcode();
796
797 auto tryToFoldAsFMAAKorMK = [&]() {
798 if (!OpToFold.isImm())
799 return false;
800
801 const bool TryAK = OpNo == 3;
802 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
803 MI->setDesc(TII->get(NewOpc));
804
805
806 bool FoldAsFMAAKorMK =
807 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
808 if (FoldAsFMAAKorMK) {
809
810 MI->untieRegOperand(3);
811
812 if (OpNo == 1) {
813 MachineOperand &Op1 = MI->getOperand(1);
814 MachineOperand &Op2 = MI->getOperand(2);
816
817 if (Op2.isImm()) {
820 } else {
823 }
824 }
825 return true;
826 }
828 return false;
829 };
830
831 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);
832 if (!IsLegal && OpToFold.isImm()) {
833 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())
834 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);
835 }
836
837 if (!IsLegal) {
838
840 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
841
842
843 MI->setDesc(TII->get(NewOpc));
846 if (AddOpSel)
848 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
849 if (FoldAsMAD) {
850 MI->untieRegOperand(OpNo);
851 return true;
852 }
853 if (AddOpSel)
854 MI->removeOperand(MI->getNumExplicitOperands() - 1);
856 }
857
858
859
860 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
861 if (tryToFoldAsFMAAKorMK())
862 return true;
863 }
864
865
866 if (OpToFold.isImm()) {
867 unsigned ImmOpc = 0;
868 if (Opc == AMDGPU::S_SETREG_B32)
869 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
870 else if (Opc == AMDGPU::S_SETREG_B32_mode)
871 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
872 if (ImmOpc) {
873 MI->setDesc(TII->get(ImmOpc));
875 return true;
876 }
877 }
878
879
880
883 if (!CanCommute)
884 return false;
885
886 MachineOperand &Op = MI->getOperand(OpNo);
887 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);
888
889
890
891
892
893 if (!Op.isReg() || !CommutedOp.isReg())
894 return false;
895
896
897
898 if (Op.isReg() && CommutedOp.isReg() &&
899 (Op.getReg() == CommutedOp.getReg() &&
900 Op.getSubReg() == CommutedOp.getSubReg()))
901 return false;
902
904 return false;
905
906 int Op32 = -1;
907 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {
908 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
909 Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||
910 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {
912 return false;
913 }
914
915
916
917 MachineOperand &OtherOp = MI->getOperand(OpNo);
918 if (!OtherOp.isReg() ||
920 return false;
921
922 assert(MI->getOperand(1).isDef());
923
924
925 unsigned MaybeCommutedOpc = MI->getOpcode();
927 }
928
930 Op32);
931 return true;
932 }
933
934
935
936
937
938
939 if (Opc == AMDGPU::S_FMAC_F32 &&
940 (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
941 if (tryToFoldAsFMAAKorMK())
942 return true;
943 }
944
945
946
947 if (OpToFold.isImm() &&
950 return false;
951
952 appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
953 return true;
954}
955
956bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,
957 const MachineOperand &UseMO) const {
958
959 return !TII->isSDWA(MI);
960}
961
967 SubDef && TII.isFoldableCopy(*SubDef);
968 SubDef = MRI.getVRegDef(Sub->getReg())) {
969 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);
971
972 if (SrcOp.isImm())
975 break;
977
978 if (SrcOp.getSubReg())
979 break;
980 }
981
982 return Sub;
983}
984
985const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
986 MachineInstr &RegSeq,
987 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {
988
990
991 const TargetRegisterClass *RC = nullptr;
992
994 MachineOperand &SrcOp = RegSeq.getOperand(I);
996
997
998 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);
999 if (!RC)
1000 RC = OpRC;
1001 else if (!TRI->getCommonSubClass(RC, OpRC))
1002 return nullptr;
1003
1005
1006 Defs.emplace_back(&SrcOp, SubRegIdx);
1007 continue;
1008 }
1009
1011 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {
1012 Defs.emplace_back(DefSrc, SubRegIdx);
1013 continue;
1014 }
1015
1016 Defs.emplace_back(&SrcOp, SubRegIdx);
1017 }
1018
1019 return RC;
1020}
1021
1022
1023
1024
1025const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(
1026 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
1028 MachineInstr *Def = MRI->getVRegDef(UseReg);
1029 if (!Def || !Def->isRegSequence())
1030 return nullptr;
1031
1032 return getRegSeqInit(*Def, Defs);
1033}
1034
1035std::pair<int64_t, const TargetRegisterClass *>
1036SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {
1038 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);
1039 if (!SrcRC)
1040 return {};
1041
1042 bool TryToMatchSplat64 = false;
1043
1044 int64_t Imm;
1045 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
1046 const MachineOperand *Op = Defs[I].first;
1047 if (!Op->isImm())
1048 return {};
1049
1050 int64_t SubImm = Op->getImm();
1051 if (I == 0) {
1052 Imm = SubImm;
1053 continue;
1054 }
1055
1056 if (Imm != SubImm) {
1057 if (I == 1 && (E & 1) == 0) {
1058
1059
1060 TryToMatchSplat64 = true;
1061 break;
1062 }
1063
1064 return {};
1065 }
1066 }
1067
1068 if (!TryToMatchSplat64)
1069 return {Defs[0].first->getImm(), SrcRC};
1070
1071
1072
1073 int64_t SplatVal64;
1074 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {
1075 const MachineOperand *Op0 = Defs[I].first;
1076 const MachineOperand *Op1 = Defs[I + 1].first;
1077
1079 return {};
1080
1081 unsigned SubReg0 = Defs[I].second;
1082 unsigned SubReg1 = Defs[I + 1].second;
1083
1084
1085
1086 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=
1087 TRI->getChannelFromSubReg(SubReg1))
1088 return {};
1089
1091 if (I == 0)
1092 SplatVal64 = MergedVal;
1093 else if (SplatVal64 != MergedVal)
1094 return {};
1095 }
1096
1097 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(
1099
1100 return {SplatVal64, RC64};
1101}
1102
1103bool SIFoldOperandsImpl::tryFoldRegSeqSplat(
1104 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,
1105 const TargetRegisterClass *SplatRC) const {
1107 if (UseOpIdx >= Desc.getNumOperands())
1108 return false;
1109
1110
1112 return false;
1113
1115 if (RCID == -1)
1116 return false;
1117
1118 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);
1119
1120
1121
1122
1123 if (SplatVal != 0 && SplatVal != -1) {
1124
1125
1126
1127 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
1128 switch (OpTy) {
1133 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);
1134 break;
1138 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);
1139 break;
1140 default:
1141 return false;
1142 }
1143
1144 if (!TRI->getCommonSubClass(OpRC, SplatRC))
1145 return false;
1146 }
1147
1149 if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
1150 return false;
1151
1152 return true;
1153}
1154
1155bool SIFoldOperandsImpl::tryToFoldACImm(
1156 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
1157 SmallVectorImpl<FoldCandidate> &FoldList) const {
1159 if (UseOpIdx >= Desc.getNumOperands())
1160 return false;
1161
1162
1164 return false;
1165
1166 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {
1169 return false;
1171 return true;
1172 }
1173
1174 return false;
1175}
1176
1177void SIFoldOperandsImpl::foldOperand(
1178 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,
1179 SmallVectorImpl<FoldCandidate> &FoldList,
1180 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
1181 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
1182
1183 if (!isUseSafeToFold(*UseMI, *UseOp))
1184 return;
1185
1186
1187 if (UseOp->isReg() && OpToFold.isReg()) {
1189 return;
1190
1191 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
1192 (UseOp->getSubReg() != AMDGPU::lo16 ||
1193 !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
1194 return;
1195 }
1196
1197
1198
1199
1203
1204 int64_t SplatVal;
1205 const TargetRegisterClass *SplatRC;
1206 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);
1207
1208
1211 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {
1212 MachineOperand *RSUse = UsesToProcess[I];
1213 MachineInstr *RSUseMI = RSUse->getParent();
1214 unsigned OpNo = RSUseMI->getOperandNo(RSUse);
1215
1216 if (SplatRC) {
1217 if (RSUseMI->isCopy()) {
1221 continue;
1222 }
1223 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {
1224 FoldableDef SplatDef(SplatVal, SplatRC);
1226 continue;
1227 }
1228 }
1229
1230
1231 if (RSUse->getSubReg() != RegSeqDstSubReg)
1232 continue;
1233
1234
1235
1236 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
1237 CopiesToReplace);
1238 }
1239
1240 return;
1241 }
1242
1243 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
1244 return;
1245
1246 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
1247
1248
1249
1251 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
1253 return;
1254
1255
1256
1257 MachineOperand &SOff =
1258 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
1259 if (!SOff.isImm() || SOff.getImm() != 0)
1260 return;
1261 }
1262
1264 if (TII->isFLATScratch(*UseMI) &&
1268 unsigned CPol =
1269 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();
1272 return;
1273
1275 }
1276
1277
1278
1280
1281 return;
1282 }
1283
1284 bool FoldingImmLike =
1285 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1286
1287 if (FoldingImmLike && UseMI->isCopy()) {
1292
1293 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
1294
1295
1296
1297
1299 return;
1300
1301 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
1302
1303
1304 for (unsigned MovOp :
1305 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,
1306 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,
1307 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,
1308 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {
1309 const MCInstrDesc &MovDesc = TII->get(MovOp);
1310 const TargetRegisterClass *MovDstRC =
1312
1313
1314
1315
1317 continue;
1318
1319 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;
1320
1322 if (RegClassID != -1) {
1323 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);
1324
1325 if (UseSubReg)
1326 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);
1327
1328
1329
1330 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&
1331 (!OpToFold.isImm() ||
1332 !TII->isImmOperandLegal(MovDesc, SrcIdx,
1333 *OpToFold.getEffectiveImmVal())))
1334 break;
1335
1336 if (!MRI->constrainRegClass(SrcReg, MovSrcRC))
1337 break;
1338
1339
1340
1341 } else {
1342
1343
1344
1345
1346 if (!OpToFold.isImm() ||
1347 !TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))
1348 break;
1349 }
1350
1353 while (ImpOpI != ImpOpE) {
1355 ImpOpI++;
1357 }
1359
1360 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
1362 MachineOperand NewSrcOp(SrcOp);
1363 MachineFunction *MF = UseMI->getMF();
1368 UseOpIdx = SrcIdx;
1370 }
1372 break;
1373 }
1374
1375
1377 return;
1378
1379 } else {
1380 if (UseMI->isCopy() && OpToFold.isReg() &&
1384 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "
1389 unsigned SubRegIdx = OpToFold.getSubReg();
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
1406
1407
1408 if (SubRegIdx > AMDGPU::sub1) {
1409 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
1410 M |= M.getLane(M.getHighestLane() - 1);
1411 SmallVector<unsigned, 4> Indexes;
1412 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
1413 Indexes);
1414 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
1415 SubRegIdx = Indexes[0];
1416
1417 } else if (TII->getOpSize(*UseMI, 1) == 4)
1418 SubRegIdx = 0;
1419 else
1420 SubRegIdx = AMDGPU::sub0;
1421 }
1425 OpToFold.OpToFold->setIsKill(false);
1426
1427
1429 if (foldCopyToAGPRRegSequence(UseMI))
1430 return;
1431 }
1432
1434 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
1435 (UseOpc == AMDGPU::V_READLANE_B32 &&
1436 (int)UseOpIdx ==
1437 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
1438
1439
1440
1441
1442 if (FoldingImmLike) {
1445 *OpToFold.DefMI, *UseMI))
1446 return;
1447
1449
1450 if (OpToFold.isImm()) {
1452 *OpToFold.getEffectiveImmVal());
1453 } else if (OpToFold.isFI())
1455 else {
1456 assert(OpToFold.isGlobal());
1458 OpToFold.OpToFold->getOffset(),
1459 OpToFold.OpToFold->getTargetFlags());
1460 }
1462 return;
1463 }
1464
1465 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
1468 *OpToFold.DefMI, *UseMI))
1469 return;
1470
1471
1472
1473
1474
1480 return;
1481 }
1482 }
1483
1484 const MCInstrDesc &UseDesc = UseMI->getDesc();
1485
1486
1487
1489 UseDesc.operands()[UseOpIdx].RegClass == -1)
1490 return;
1491 }
1492
1493
1494
1495
1496
1497 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);
1498}
1499
1500static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
1501 uint32_t LHS, uint32_t RHS) {
1502 switch (Opcode) {
1503 case AMDGPU::V_AND_B32_e64:
1504 case AMDGPU::V_AND_B32_e32:
1505 case AMDGPU::S_AND_B32:
1506 Result = LHS & RHS;
1507 return true;
1508 case AMDGPU::V_OR_B32_e64:
1509 case AMDGPU::V_OR_B32_e32:
1510 case AMDGPU::S_OR_B32:
1511 Result = LHS | RHS;
1512 return true;
1513 case AMDGPU::V_XOR_B32_e64:
1514 case AMDGPU::V_XOR_B32_e32:
1515 case AMDGPU::S_XOR_B32:
1516 Result = LHS ^ RHS;
1517 return true;
1518 case AMDGPU::S_XNOR_B32:
1519 Result = ~(LHS ^ RHS);
1520 return true;
1521 case AMDGPU::S_NAND_B32:
1522 Result = ~(LHS & RHS);
1523 return true;
1524 case AMDGPU::S_NOR_B32:
1525 Result = ~(LHS | RHS);
1526 return true;
1527 case AMDGPU::S_ANDN2_B32:
1528 Result = LHS & ~RHS;
1529 return true;
1530 case AMDGPU::S_ORN2_B32:
1531 Result = LHS | ~RHS;
1532 return true;
1533 case AMDGPU::V_LSHL_B32_e64:
1534 case AMDGPU::V_LSHL_B32_e32:
1535 case AMDGPU::S_LSHL_B32:
1536
1537 Result = LHS << (RHS & 31);
1538 return true;
1539 case AMDGPU::V_LSHLREV_B32_e64:
1540 case AMDGPU::V_LSHLREV_B32_e32:
1541 Result = RHS << (LHS & 31);
1542 return true;
1543 case AMDGPU::V_LSHR_B32_e64:
1544 case AMDGPU::V_LSHR_B32_e32:
1545 case AMDGPU::S_LSHR_B32:
1546 Result = LHS >> (RHS & 31);
1547 return true;
1548 case AMDGPU::V_LSHRREV_B32_e64:
1549 case AMDGPU::V_LSHRREV_B32_e32:
1550 Result = RHS >> (LHS & 31);
1551 return true;
1552 case AMDGPU::V_ASHR_I32_e64:
1553 case AMDGPU::V_ASHR_I32_e32:
1554 case AMDGPU::S_ASHR_I32:
1555 Result = static_cast<int32_t>(LHS) >> (RHS & 31);
1556 return true;
1557 case AMDGPU::V_ASHRREV_I32_e64:
1558 case AMDGPU::V_ASHRREV_I32_e32:
1559 Result = static_cast<int32_t>(RHS) >> (LHS & 31);
1560 return true;
1561 default:
1562 return false;
1563 }
1564}
1565
1567 return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1568}
1569
1570std::optional<int64_t>
1571SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
1572 if (Op.isImm())
1573 return Op.getImm();
1574
1575 if (!Op.isReg() || !Op.getReg().isVirtual())
1576 return std::nullopt;
1577
1578 const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
1579 if (Def && Def->isMoveImmediate()) {
1580 const MachineOperand &ImmSrc = Def->getOperand(1);
1581 if (ImmSrc.isImm())
1582 return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
1583 }
1584
1585 return std::nullopt;
1586}
1587
1588
1589
1590
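// Try to constant-fold simple unary/binary ALU instructions whose sources are
// (materialized) immediates, or simplify identities such as or x, 0 /
// and x, -1 / xor x, 0 into copies.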
1591bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
1592 if (!MI->allImplicitDefsAreDead())
1593 return false;
1594
1595 unsigned Opc = MI->getOpcode();
1596
1597 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1598 if (Src0Idx == -1)
1599 return false;
1600
1601 MachineOperand *Src0 = &MI->getOperand(Src0Idx);
1602 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1603
1604 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
1605 Opc == AMDGPU::S_NOT_B32) &&
1606 Src0Imm) {
1607 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);
1608 TII->mutateAndCleanupImplicit(
1610 return true;
1611 }
1612
1613 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1614 if (Src1Idx == -1)
1615 return false;
1616
1617 MachineOperand *Src1 = &MI->getOperand(Src1Idx);
1618 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1619
1620 if (!Src0Imm && !Src1Imm)
1621 return false;
1622
1623
1624
1625
1626 if (Src0Imm && Src1Imm) {
1627 int32_t NewImm;
1628 if (!evalBinaryInstruction(Opc, NewImm, *Src0Imm, *Src1Imm))
1629 return false;
1630
1631 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());
1632
1633
1634
1635 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
1636 MI->removeOperand(Src1Idx);
1638 return true;
1639 }
1640
1641 if (!MI->isCommutable())
1642 return false;
1643
1644 if (Src0Imm && !Src1Imm) {
1648 }
1649
1650 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);
1651 if (Opc == AMDGPU::V_OR_B32_e64 ||
1652 Opc == AMDGPU::V_OR_B32_e32 ||
1653 Opc == AMDGPU::S_OR_B32) {
1654 if (Src1Val == 0) {
1655
1656 MI->removeOperand(Src1Idx);
1657 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1658 } else if (Src1Val == -1) {
1659
1660 MI->removeOperand(Src1Idx);
1661 TII->mutateAndCleanupImplicit(
1663 } else
1664 return false;
1665
1666 return true;
1667 }
1668
1669 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
1670 Opc == AMDGPU::S_AND_B32) {
1671 if (Src1Val == 0) {
1672
1673 MI->removeOperand(Src0Idx);
1674 TII->mutateAndCleanupImplicit(
1676 } else if (Src1Val == -1) {
1677
1678 MI->removeOperand(Src1Idx);
1679 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1680 } else
1681 return false;
1682
1683 return true;
1684 }
1685
1686 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
1687 Opc == AMDGPU::S_XOR_B32) {
1688 if (Src1Val == 0) {
1689
1690 MI->removeOperand(Src1Idx);
1691 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));
1692 return true;
1693 }
1694 }
1695
1696 return false;
1697}
1698
1699
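// Fold a V_CNDMASK whose two value sources are identical into a plain copy.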
1700bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
1701 unsigned Opc = MI.getOpcode();
1702 if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
1703 Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
1704 return false;
1705
1706 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1707 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1709 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
1710 if (!Src1Imm)
1711 return false;
1712
1713 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
1714 if (!Src0Imm || *Src0Imm != *Src1Imm)
1715 return false;
1716 }
1717
1718 int Src1ModIdx =
1719 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
1720 int Src0ModIdx =
1721 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
1722 if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
1723 (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
1724 return false;
1725
1727 auto &NewDesc =
1729 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1730 if (Src2Idx != -1)
1731 MI.removeOperand(Src2Idx);
1732 MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
1733 if (Src1ModIdx != -1)
1734 MI.removeOperand(Src1ModIdx);
1735 if (Src0ModIdx != -1)
1736 MI.removeOperand(Src0ModIdx);
1737 TII->mutateAndCleanupImplicit(MI, NewDesc);
1739 return true;
1740}
1741
1742bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
1743 if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
1744 MI.getOpcode() != AMDGPU::V_AND_B32_e32)
1745 return false;
1746
1747 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
1748 if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
1749 return false;
1750
1751 Register Src1 = MI.getOperand(2).getReg();
1752 MachineInstr *SrcDef = MRI->getVRegDef(Src1);
1753 if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
1754 return false;
1755
1756 Register Dst = MI.getOperand(0).getReg();
1757 MRI->replaceRegWith(Dst, Src1);
1758 if (!MI.getOperand(2).isKill())
1759 MRI->clearKillFlags(Src1);
1760 MI.eraseFromParent();
1761 return true;
1762}
1763
1764bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
1765 const FoldableDef &OpToFold) const {
1766
1767
1768
1769 SmallVector<MachineInstr *, 4> CopiesToReplace;
1771 MachineOperand &Dst = MI.getOperand(0);
1773
1774 if (OpToFold.isImm()) {
1775 for (auto &UseMI :
1777
1778
1779
1780
1781
1782
1783
1784
1785 if (tryConstantFoldOp(&UseMI)) {
1788 }
1789 }
1790 }
1791
1794 for (auto *U : UsesToProcess) {
1795 MachineInstr *UseMI = U->getParent();
1796
1797 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());
1799 CopiesToReplace);
1800 }
1801
1802 if (CopiesToReplace.empty() && FoldList.empty())
1803 return false;
1804
1805 MachineFunction *MF = MI.getMF();
1806
1807 for (MachineInstr *Copy : CopiesToReplace)
1808 Copy->addImplicitDefUseOperands(*MF);
1809
1810 SetVector<MachineInstr *> ConstantFoldCandidates;
1811 for (FoldCandidate &Fold : FoldList) {
1812 assert(!Fold.isReg() || Fold.Def.OpToFold);
1813 if (Fold.isReg() && Fold.getReg().isVirtual()) {
1815 const MachineInstr *DefMI = Fold.Def.DefMI;
1818 continue;
1819 }
1821
1822 if (Fold.isReg()) {
1823 assert(Fold.Def.OpToFold && Fold.isReg());
1824
1825
1826
1827 MRI->clearKillFlags(Fold.getReg());
1828 }
1829 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1830 << static_cast<int>(Fold.UseOpNo) << " of "
1831 << *Fold.UseMI);
1832
1833 if (Fold.isImm())
1834 ConstantFoldCandidates.insert(Fold.UseMI);
1835
1836 } else if (Fold.Commuted) {
1837
1839 }
1840 }
1841
1842 for (MachineInstr *MI : ConstantFoldCandidates) {
1843 if (tryConstantFoldOp(MI)) {
1846 }
1847 }
1848 return true;
1849}
1850
1851
1852
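// Rewrite a COPY of a REG_SEQUENCE into an AGPR class as a new REG_SEQUENCE,
// writing foldable immediates with V_ACCVGPR_WRITE_B32 and caching VGPR
// copies of repeated inputs.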
1853bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
1854
1855
1856
1857
1858 const TargetRegisterClass *DefRC =
1860 if (!TRI->isAGPRClass(DefRC))
1861 return false;
1862
1864 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
1866 return false;
1867
1869 MachineBasicBlock &MBB = *CopyMI->getParent();
1870
1871 MachineInstrBuilder B(*MBB.getParent(), CopyMI);
1872 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
1873
1874 const TargetRegisterClass *UseRC =
1876
1877
1879
1880 unsigned NumRegSeqOperands = RegSeq->getNumOperands();
1881 unsigned NumFoldable = 0;
1882
1883 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
1884 MachineOperand &RegOp = RegSeq->getOperand(I);
1886
1888
1890 continue;
1891 }
1892
1896
1897 if (Lookup->isImm()) {
1898
1899 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
1900 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
1901 if (DestSuperRC &&
1903 ++NumFoldable;
1905 continue;
1906 }
1907 }
1908
1909 const TargetRegisterClass *InputRC =
1911 : MRI->getRegClass(RegOp.getReg());
1912
1913
1914
1915
1916
1917
1918
1919 const TargetRegisterClass *MatchRC =
1920 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
1921 if (!MatchRC) {
1922 ++NumFoldable;
1924 continue;
1925 }
1926
1928 }
1929
1930
1931 if (NumFoldable == 0)
1932 return false;
1933
1934 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
1937
1938 for (auto [Def, DestSubIdx] : NewDefs) {
1939 if (!Def->isReg()) {
1940
1941
1942 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
1943 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
1944 .add(*Def);
1946 } else {
1947 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
1948 Def->setIsKill(false);
1949
1950 Register &VGPRCopy = VGPRCopies[Src];
1951 if (!VGPRCopy) {
1952 const TargetRegisterClass *VGPRUseSubRC =
1953 TRI->getSubRegisterClass(UseRC, DestSubIdx);
1954
1955
1956
1957
1958
1959
1960
1961
1962 const TargetRegisterClass *SubRC =
1963 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
1965
1966 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
1969 } else {
1970
1971 B.add(*Def);
1972 }
1973 } else {
1974 B.addReg(VGPRCopy);
1975 }
1976 }
1977
1978 B.addImm(DestSubIdx);
1979 }
1980
1982 return true;
1983}
1984
1985bool SIFoldOperandsImpl::tryFoldFoldableCopy(
1986 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1987 Register DstReg = MI.getOperand(0).getReg();
1988
1989
1990 if (DstReg == AMDGPU::M0) {
1991 MachineOperand &NewM0Val = MI.getOperand(1);
1992 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1993 MI.eraseFromParent();
1994 return true;
1995 }
1996
1997
1999 ? nullptr
2000 : &NewM0Val;
2001 return false;
2002 }
2003
2004 MachineOperand *OpToFoldPtr;
2005 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {
2006
2007 if (TII->hasAnyModifiersSet(MI))
2008 return false;
2009 OpToFoldPtr = &MI.getOperand(2);
2010 } else
2011 OpToFoldPtr = &MI.getOperand(1);
2012 MachineOperand &OpToFold = *OpToFoldPtr;
2013 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
2014
2015
2016 if (!FoldingImm && !OpToFold.isReg())
2017 return false;
2018
2019
2021 ->isConstantPhysReg(OpToFold.getReg()))
2022 return false;
2023
2024
2025
2026
2027
2028
2029
2031 return false;
2032
2033 const TargetRegisterClass *DstRC =
2034 MRI->getRegClass(MI.getOperand(0).getReg());
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
2052 if (DstRC == &AMDGPU::SReg_32RegClass &&
2053 DstRC == MRI->getRegClass(OpToFold.getReg())) {
2056 }
2057 }
2058
2059
2060
2061 if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
2062 if (foldCopyToAGPRRegSequence(&MI))
2063 return true;
2064 }
2065
2066 FoldableDef Def(OpToFold, DstRC);
2067 bool Changed = foldInstOperand(MI, Def);
2068
2069
2070
2071
2072
2073
2074 auto *InstToErase = &MI;
2075 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2076 auto &SrcOp = InstToErase->getOperand(1);
2078 InstToErase->eraseFromParent();
2080 InstToErase = nullptr;
2082 break;
2083 InstToErase = MRI->getVRegDef(SrcReg);
2084 if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
2085 break;
2086 }
2087
2088 if (InstToErase && InstToErase->isRegSequence() &&
2089 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
2090 InstToErase->eraseFromParent();
2092 }
2093
2095 return true;
2096
2097
2098
2099
2100 return OpToFold.isReg() &&
2101 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);
2102}
2103
2104
2105
2106const MachineOperand *
2107SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
2108 unsigned Op = MI.getOpcode();
2109 switch (Op) {
2110 case AMDGPU::V_MAX_F32_e64:
2111 case AMDGPU::V_MAX_F16_e64:
2112 case AMDGPU::V_MAX_F16_t16_e64:
2113 case AMDGPU::V_MAX_F16_fake16_e64:
2114 case AMDGPU::V_MAX_F64_e64:
2115 case AMDGPU::V_MAX_NUM_F64_e64:
2116 case AMDGPU::V_PK_MAX_F16:
2117 case AMDGPU::V_MAX_BF16_PSEUDO_e64:
2118 case AMDGPU::V_PK_MAX_NUM_BF16: {
2119 if (MI.mayRaiseFPException())
2120 return nullptr;
2121
2122 if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
2123 return nullptr;
2124
2125
2126 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2127 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2128 if (!Src0->isReg() || !Src1->isReg() ||
2129 Src0->getReg() != Src1->getReg() ||
2130 Src0->getSubReg() != Src1->getSubReg() ||
2131 Src0->getSubReg() != AMDGPU::NoSubRegister)
2132 return nullptr;
2133
2134
2135 if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2136 return nullptr;
2137
2138 unsigned Src0Mods
2139 = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
2140 unsigned Src1Mods
2141 = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
2142
2143
2144
2145 unsigned UnsetMods =
2146 (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
2148 : 0u;
2149 if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
2150 return nullptr;
2151 return Src0;
2152 }
2153 default:
2154 return nullptr;
2155 }
2156}
2157
2158
2159bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
2160 const MachineOperand *ClampSrc = isClamp(MI);
2161 if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
2162 return false;
2163
2165 return false;
2166
2167
2169 MachineInstr *Def =
2170 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());
2171
2172
2173 if (TII->getClampMask(*Def) != TII->getClampMask(MI))
2174 return false;
2175
2176 if (Def->mayRaiseFPException())
2177 return false;
2178
2179 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
2180 if (!DefClamp)
2181 return false;
2182
2183 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
2184
2185
2186 DefClamp->setImm(1);
2187
2188 Register DefReg = Def->getOperand(0).getReg();
2189 Register MIDstReg = MI.getOperand(0).getReg();
2190 if (TRI->isSGPRReg(*MRI, DefReg)) {
2191
2192
2194 MIDstReg)
2196 } else {
2197 MRI->replaceRegWith(MIDstReg, DefReg);
2198 }
2199 MI.eraseFromParent();
2200
2201
2202
2203
2205 Def->eraseFromParent();
2206
2207 return true;
2208}
2209
2210static int getOModValue(unsigned Opc, int64_t Val) {
2211 switch (Opc) {
2212 case AMDGPU::V_MUL_F64_e64:
2213 case AMDGPU::V_MUL_F64_pseudo_e64: {
2214 switch (Val) {
2215 case 0x3fe0000000000000: // 0.5
2216 return SIOutMods::DIV2;
2217 case 0x4000000000000000: // 2.0
2218 return SIOutMods::MUL2;
2219 case 0x4010000000000000: // 4.0
2220 return SIOutMods::MUL4;
2221 default:
2222 return SIOutMods::NONE;
2223 }
2224 }
2225 case AMDGPU::V_MUL_F32_e64: {
2226 switch (static_cast<uint32_t>(Val)) {
2227 case 0x3f000000: // 0.5
2228 return SIOutMods::DIV2;
2229 case 0x40000000: // 2.0
2230 return SIOutMods::MUL2;
2231 case 0x40800000: // 4.0
2232 return SIOutMods::MUL4;
2233 default:
2234 return SIOutMods::NONE;
2235 }
2236 }
2237 case AMDGPU::V_MUL_F16_e64:
2238 case AMDGPU::V_MUL_F16_t16_e64:
2239 case AMDGPU::V_MUL_F16_fake16_e64: {
2240 switch (static_cast<uint16_t>(Val)) {
2241 case 0x3800: // 0.5
2242 return SIOutMods::DIV2;
2243 case 0x4000: // 2.0
2244 return SIOutMods::MUL2;
2245 case 0x4400: // 4.0
2246 return SIOutMods::MUL4;
2247 default:
2248 return SIOutMods::NONE;
2249 }
2250 }
2251 default:
2252 llvm_unreachable("invalid mul opcode");
2253 }
2254}
2255
2256
2257
2258
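// If MI is a multiply or add that can be expressed as an output modifier
// (omod) on its other operand, return that operand and the omod value.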
2259std::pair<const MachineOperand *, int>
2260SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {
2261 unsigned Op = MI.getOpcode();
2262 switch (Op) {
2263 case AMDGPU::V_MUL_F64_e64:
2264 case AMDGPU::V_MUL_F64_pseudo_e64:
2265 case AMDGPU::V_MUL_F32_e64:
2266 case AMDGPU::V_MUL_F16_t16_e64:
2267 case AMDGPU::V_MUL_F16_fake16_e64:
2268 case AMDGPU::V_MUL_F16_e64: {
2269
2270 if ((Op == AMDGPU::V_MUL_F32_e64 &&
2272 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
2273 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
2274 Op == AMDGPU::V_MUL_F16_fake16_e64) &&
2277 MI.mayRaiseFPException())
2279
2280 const MachineOperand *RegOp = nullptr;
2281 const MachineOperand *ImmOp = nullptr;
2282 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2283 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2284 if (Src0->isImm()) {
2285 ImmOp = Src0;
2286 RegOp = Src1;
2287 } else if (Src1->isImm()) {
2288 ImmOp = Src1;
2289 RegOp = Src0;
2290 } else
2292
2295 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2296 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2297 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
2298 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
2300
2301 return std::pair(RegOp, OMod);
2302 }
2303 case AMDGPU::V_ADD_F64_e64:
2304 case AMDGPU::V_ADD_F64_pseudo_e64:
2305 case AMDGPU::V_ADD_F32_e64:
2306 case AMDGPU::V_ADD_F16_e64:
2307 case AMDGPU::V_ADD_F16_t16_e64:
2308 case AMDGPU::V_ADD_F16_fake16_e64: {
2309
2310 if ((Op == AMDGPU::V_ADD_F32_e64 &&
2312 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
2313 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
2314 Op == AMDGPU::V_ADD_F16_fake16_e64) &&
2317
2318
2319 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2320 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
2321
2324 !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
2325 !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
2326 !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
2327 !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
2329
2331 }
2332 default:
2334 }
2335}
2336
2337
2338bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {
2339 const MachineOperand *RegOp;
2340 int OMod;
2341 std::tie(RegOp, OMod) = isOMod(MI);
2343 RegOp->getSubReg() != AMDGPU::NoSubRegister ||
2344 !MRI->hasOneNonDBGUser(RegOp->getReg()))
2345 return false;
2346
2347 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
2348 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
2350 return false;
2351
2352 if (Def->mayRaiseFPException())
2353 return false;
2354
2355
2356
2357 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
2358 return false;
2359
2360 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
2361
2362 DefOMod->setImm(OMod);
2363 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
2364
2365
2366 MRI->clearKillFlags(Def->getOperand(0).getReg());
2367 MI.eraseFromParent();
2368
2369
2370
2371
2373 Def->eraseFromParent();
2374
2375 return true;
2376}
2377
2378
2379
2380bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {
2382 auto Reg = MI.getOperand(0).getReg();
2383
2385 !MRI->hasOneNonDBGUse(Reg))
2386 return false;
2387
2389 if (!getRegSeqInit(Defs, Reg))
2390 return false;
2391
2392 for (auto &[Op, SubIdx] : Defs) {
2393 if (!Op->isReg())
2394 return false;
2395 if (TRI->isAGPR(*MRI, Op->getReg()))
2396 continue;
2397
2398 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
2400 return false;
2402 return false;
2403 }
2404
2405 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
2406 MachineInstr *UseMI = Op->getParent();
2409 if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
2410 return false;
2411 Op = &*MRI->use_nodbg_begin(Reg);
2413 }
2414
2415 if (Op->getSubReg())
2416 return false;
2417
2419 const MCInstrDesc &InstDesc = UseMI->getDesc();
2421 if (!OpRC || !TRI->isVectorSuperClass(OpRC))
2422 return false;
2423
2424 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
2425 auto Dst = MRI->createVirtualRegister(NewDstRC);
2426 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
2427 TII->get(AMDGPU::REG_SEQUENCE), Dst);
2428
2429 for (auto &[Def, SubIdx] : Defs) {
2430 Def->setIsKill(false);
2431 if (TRI->isAGPR(*MRI, Def->getReg())) {
2432 RS.add(*Def);
2433 } else {
2434 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
2437 }
2438 RS.addImm(SubIdx);
2439 }
2440
2441 Op->setReg(Dst);
2444 RS->eraseFromParent();
2445 return false;
2446 }
2447
2449
2450
2451
2452 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
2453 MI.eraseFromParent();
2454 return true;
2455}
2456
2457
2458
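// Checks whether Copy is an AGPR -> VGPR copy, possibly looking through one
// intervening VGPR -> VGPR copy; on success returns the AGPR source register
// and subregister in OutReg / OutSubReg.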
2461 Register &OutReg, unsigned &OutSubReg) {
2462 assert(Copy.isCopy());
2463
2467 return false;
2468
2469
2470
2471 if (TRI.isAGPR(MRI, CopySrcReg)) {
2472 OutReg = CopySrcReg;
2473 OutSubReg = CopySrc.getSubReg();
2474 return true;
2475 }
2476
2477
2478
2479
2480 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
2481 if (!CopySrcDef || !CopySrcDef->isCopy())
2482 return false;
2483
2486 if (!OtherCopySrcReg.isVirtual() ||
2488 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
2489 !TRI.isAGPR(MRI, OtherCopySrcReg))
2490 return false;
2491
2492 OutReg = OtherCopySrcReg;
2493 OutSubReg = CopySrc.getSubReg();
2494 return true;
2495}
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
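// Try to turn a VGPR PHI whose incoming values are copies from AGPRs into an
// AGPR PHI: AGPR copies (or V_ACCVGPR_WRITE for immediates) are inserted in
// the predecessors and a single copy back to the original VGPR follows the PHI.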
2526bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {
2528
2529 Register PhiOut = PHI.getOperand(0).getReg();
2530 if (!TRI->isVGPR(*MRI, PhiOut))
2531 return false;
2532
2533
2534
2535 const TargetRegisterClass *ARC = nullptr;
2536 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2537 MachineOperand &MO = PHI.getOperand(K);
2538 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
2539 if (!Copy || !Copy->isCopy())
2540 continue;
2541
2543 unsigned AGPRRegMask = AMDGPU::NoSubRegister;
2545 continue;
2546
2547 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
2548 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
2549 CopyInRC = SubRC;
2550
2552 return false;
2553 ARC = CopyInRC;
2554 }
2555
2556 if (!ARC)
2557 return false;
2558
2559 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
2560
2561
2563 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
2564 MachineOperand &MO = PHI.getOperand(K);
2566
2568 MachineBasicBlock *InsertMBB = nullptr;
2569
2570
2571 unsigned CopyOpc = AMDGPU::COPY;
2572 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
2573
2574
2575
2576 if (Def->isCopy()) {
2578 unsigned AGPRSubReg = AMDGPU::NoSubRegister;
2582 continue;
2583 }
2584
2585
2586
2587
2588
2589
2590
2591 MachineOperand &CopyIn = Def->getOperand(1);
2594 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2595 }
2596
2597 InsertMBB = Def->getParent();
2599 } else {
2600 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
2602 }
2603
2604 Register NewReg = MRI->createVirtualRegister(ARC);
2605 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2606 TII->get(CopyOpc), NewReg)
2609
2610 (void)MI;
2612 }
2613
2614
2615 Register NewReg = MRI->createVirtualRegister(ARC);
2616 PHI.getOperand(0).setReg(NewReg);
2617
2618
2619
2622 TII->get(AMDGPU::COPY), PhiOut)
2624
2626 return true;
2627}
2628
2629
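// Attempt to convert a VGPR load into an AGPR load when every user of the
// loaded value (through copies and reg_sequences) ends up in an AGPR.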
2630bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {
2633 return false;
2634
2635 MachineOperand &Def = MI.getOperand(0);
2636 if (!Def.isDef())
2637 return false;
2638
2640
2642 return false;
2643
2647
2648 if (Users.empty())
2649 return false;
2650
2651
2652 while (!Users.empty()) {
2653 const MachineInstr *I = Users.pop_back_val();
2654 if (!I->isCopy() && !I->isRegSequence())
2655 return false;
2656 Register DstReg = I->getOperand(0).getReg();
2657
2659 return false;
2660 if (TRI->isAGPR(*MRI, DstReg))
2661 continue;
2662 MoveRegs.push_back(DstReg);
2663 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2664 Users.push_back(&U);
2665 }
2666
2667 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2668 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2669 if (!TII->isOperandLegal(MI, 0, &Def)) {
2670 MRI->setRegClass(DefReg, RC);
2671 return false;
2672 }
2673
2674 while (!MoveRegs.empty()) {
2675 Register Reg = MoveRegs.pop_back_val();
2676 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2677 }
2678
2680
2681 return true;
2682}
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
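// When several AGPR PHIs in this block use the same (register, subregister)
// input, cache that value once through a VGPR (V_ACCVGPR_READ followed by a
// copy back to an AGPR) so each PHI reads the cached AGPR instead.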
2716bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2717
2718
2720 return false;
2721
2722
2723 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2724 RegToMO;
2725
2727 if (!MI.isPHI())
2728 break;
2729
2730 if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2731 continue;
2732
2733 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2734 MachineOperand &PhiMO = MI.getOperand(K);
2736 continue;
2737 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2738 }
2739 }
2740
2741
2742
2744 for (const auto &[Entry, MOs] : RegToMO) {
2745 if (MOs.size() == 1)
2746 continue;
2747
2749 MachineInstr *Def = MRI->getVRegDef(Reg);
2750 MachineBasicBlock *DefMBB = Def->getParent();
2751
2752
2753
2754 const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2756 MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2757 MachineInstr *VGPRCopy =
2758 BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2759 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2761
2762
2763 Register TempAGPR = MRI->createVirtualRegister(ARC);
2765 TII->get(AMDGPU::COPY), TempAGPR)
2767
2768 LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2769 for (MachineOperand *MO : MOs) {
2770 MO->setReg(TempAGPR);
2771 MO->setSubReg(AMDGPU::NoSubRegister);
2772 LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
2773 }
2774
2776 }
2777
2779}
2780
2781bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2782 this->MF = &MF;
2787 MFI = MF.getInfo<SIMachineFunctionInfo>();
2788
2789
2790
2791
2792
2795
2798 MachineOperand *CurrentKnownM0Val = nullptr;
2801
2802 if (tryFoldZeroHighBits(MI)) {
2804 continue;
2805 }
2806
2807 if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2809 continue;
2810 }
2811
2812 if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2814 continue;
2815 }
2816
2817 if (MI.mayLoad() && tryFoldLoad(MI)) {
2819 continue;
2820 }
2821
2822 if (TII->isFoldableCopy(MI)) {
2823 Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2824 continue;
2825 }
2826
2827
2828 if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2829 CurrentKnownM0Val = nullptr;
2830
2831
2832
2834 !tryFoldOMod(MI))
2836 }
2837
2838 Changed |= tryOptimizeAGPRPhis(*MBB);
2839 }
2840
2842}
2843
2844PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2845 MachineFunctionAnalysisManager &MFAM) {
2846 MFPropsModifier _(*this, MF);
2847
2848 bool Changed = SIFoldOperandsImpl().run(MF);
2849 if (!Changed) {
2850 return PreservedAnalyses::all();
2851 }
2852 auto PA = getMachineFunctionPassPreservedAnalyses();
2853 PA.preserveSet<CFGAnalyses>();
2854 return PA;
2855}
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
unsigned getNumOperands() const
Retuns the total number of operands.
LLVM_ABI void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
LLVM_ABI unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
mop_range implicit_operands()
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
bool isRegSequence() const
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
LLVM_ABI unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
LLVM_ABI void substVirtReg(Register Reg, unsigned SubIdx, const TargetRegisterInfo &)
substVirtReg - Substitute the current register with the virtual subregister Reg:SubReg.
LLVM_ABI void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
LLVM_ABI void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
LLVM_ABI void substPhysReg(MCRegister Reg, const TargetRegisterInfo &)
substPhysReg - Substitute the current register with the physical register Reg, taking any existing Su...
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
LLVM_ABI bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_GlobalAddress
Address of a global value.
@ MO_FrameIndex
Abstract Stack Frame Index.
@ MO_Register
Register operand.
static MachineOperand CreateFI(int Idx)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Definition SIFoldOperands.cpp:2844
static std::optional< int64_t > extractSubregFromImm(int64_t ImmVal, unsigned SubRegIndex)
Return the extracted immediate value in a subregister use from a constant materialized in a super reg...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
SIModeRegisterDefaults getMode() const
bool insert(const value_type &X)
Insert a new element into the SetVector.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
StringRef - Represent a constant reference to a string, i.e.
virtual const TargetRegisterClass * getRegClass(const MCInstrDesc &MCID, unsigned OpNum) const
Given a machine instruction descriptor, returns the register class constraint for OpNum,...
int16_t getOpRegClassID(const MCOperandInfo &OpInfo) const
virtual bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const
Returns true iff the routine could find two commutable operands in the given machine instruction.
const TargetRegisterInfo & getRegisterInfo() const
MachineInstr * commuteInstruction(MachineInstr &MI, bool NewMI=false, unsigned OpIdx1=CommuteAnyOperandIndex, unsigned OpIdx2=CommuteAnyOperandIndex) const
This method commutes the operands of the given machine instruction MI.
virtual MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const
This method must be implemented by targets that set the M_CONVERTIBLE_TO_3_ADDR flag.
static const unsigned CommuteAnyOperandIndex
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
constexpr bool isSISrcOperand(const MCOperandInfo &OpInfo)
Is this an AMDGPU specific source operand?
@ OPERAND_REG_INLINE_C_FP64
@ OPERAND_REG_INLINE_C_V2BF16
@ OPERAND_REG_IMM_V2INT16
@ OPERAND_REG_INLINE_C_INT64
@ OPERAND_REG_IMM_NOINLINE_V2FP16
@ OPERAND_REG_INLINE_C_V2FP16
@ OPERAND_REG_INLINE_AC_INT32
Operands with an AccVGPR register or inline constant.
@ OPERAND_REG_INLINE_AC_FP32
@ OPERAND_REG_INLINE_C_FP32
@ OPERAND_REG_INLINE_C_INT32
@ OPERAND_REG_INLINE_C_V2INT16
@ OPERAND_REG_INLINE_AC_FP64
bool supportsScaleOffset(const MCInstrInfo &MII, unsigned Opcode)
LLVM_READONLY int getFlatScratchInstSSfromSV(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Kill
The last use of a register.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
FunctionPass * createSIFoldOperandsLegacyPass()
Definition SIFoldOperands.cpp:447
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
@ Sub
Subtraction of integers.
DWARFExpression::Operation Op
char & SIFoldOperandsLegacyID
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
iterator_range< df_iterator< T > > depth_first(const T &G)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
constexpr uint64_t Make_64(uint32_t High, uint32_t Low)
Make a 64-bit integer from a high / low pair of 32-bit integers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.