LLVM: lib/Target/AMDGPU/SIFoldOperands.cpp Source File

#define DEBUG_TYPE "si-fold-operands"

using namespace llvm;

25

26namespace {

27

28

29

30struct FoldableDef {

31 union {

34 int FrameIndexToFold;

35 };

36

37

39

40

42

43

44 unsigned DefSubReg = AMDGPU::NoSubRegister;

45

46

48

49 FoldableDef() = delete;

51 unsigned DefSubReg = AMDGPU::NoSubRegister)

52 : DefRC(DefRC), DefSubReg(DefSubReg), Kind(FoldOp.getType()) {

53

54 if (FoldOp.isImm()) {

55 ImmToFold = FoldOp.getImm();

56 } else if (FoldOp.isFI()) {

57 FrameIndexToFold = FoldOp.getIndex();

58 } else {

60 OpToFold = &FoldOp;

61 }

62

64 }

65

67 unsigned DefSubReg = AMDGPU::NoSubRegister)

68 : ImmToFold(FoldImm), DefRC(DefRC), DefSubReg(DefSubReg),

70

71

73 FoldableDef Copy(*this);

74 Copy.DefSubReg = TRI.composeSubRegIndices(DefSubReg, SubReg);

75 return Copy;

76 }

77

79

82 return OpToFold->getReg();

83 }

84

85 unsigned getSubReg() const {

87 return OpToFold->getSubReg();

88 }

89

91

92 bool isFI() const {

94 }

95

96 int getFI() const {

98 return FrameIndexToFold;

99 }

100

102

103

104

105

106 std::optional<int64_t> getEffectiveImmVal() const {

109 }

110

111

112

114 unsigned OpIdx) const {

115 switch (Kind) {

117 std::optional<int64_t> ImmToFold = getEffectiveImmVal();

118 if (!ImmToFold)

119 return false;

120

121

122

124 return TII.isOperandLegal(MI, OpIdx, &TmpOp);

125 }

127 if (DefSubReg != AMDGPU::NoSubRegister)

128 return false;

130 return TII.isOperandLegal(MI, OpIdx, &TmpOp);

131 }

132 default:

133

134

135 if (DefSubReg != AMDGPU::NoSubRegister)

136 return false;

137 return TII.isOperandLegal(MI, OpIdx, OpToFold);

138 }

139

141 }

142};

143

144struct FoldCandidate {

146 FoldableDef Def;

147 int ShrinkOpcode;

148 unsigned UseOpNo;

149 bool Commuted;

150

151 FoldCandidate(MachineInstr *MI, unsigned OpNo, FoldableDef Def,

152 bool Commuted = false, int ShrinkOp = -1)

153 : UseMI(MI), Def(Def), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),

154 Commuted(Commuted) {}

155

156 bool isFI() const { return Def.isFI(); }

157

158 int getFI() const {

160 return Def.FrameIndexToFold;

161 }

162

163 bool isImm() const { return Def.isImm(); }

164

165 bool isReg() const { return Def.isReg(); }

166

168

169 bool isGlobal() const { return Def.isGlobal(); }

170

171 bool needsShrink() const { return ShrinkOpcode != -1; }

172};

173

174class SIFoldOperandsImpl {

175public:

182

184 const FoldableDef &OpToFold) const;

185

186

187 unsigned convertToVALUOp(unsigned Opc, bool UseVOP3 = false) const {

188 switch (Opc) {

189 case AMDGPU::S_ADD_I32: {

190 if (ST->hasAddNoCarry())

191 return UseVOP3 ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_U32_e32;

192 return UseVOP3 ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;

193 }

194 case AMDGPU::S_OR_B32:

195 return UseVOP3 ? AMDGPU::V_OR_B32_e64 : AMDGPU::V_OR_B32_e32;

196 case AMDGPU::S_AND_B32:

197 return UseVOP3 ? AMDGPU::V_AND_B32_e64 : AMDGPU::V_AND_B32_e32;

198 case AMDGPU::S_MUL_I32:

199 return AMDGPU::V_MUL_LO_U32_e64;

200 default:

201 return AMDGPU::INSTRUCTION_LIST_END;

202 }

203 }

204

205 bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,

207

209

210 bool canUseImmWithOpSel(const MachineInstr *MI, unsigned UseOpNo,

211 int64_t ImmVal) const;

212

213

214 bool tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,

215 int64_t ImmVal) const;

216

219 const FoldableDef &OpToFold) const;

222

225 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const;

226

228 getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,

230

231 std::pair<int64_t, const TargetRegisterClass *>

233

235 int64_t SplatVal,

237

238 bool tryToFoldACImm(const FoldableDef &OpToFold, MachineInstr *UseMI,

239 unsigned UseOpIdx,

241 void foldOperand(FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,

244

245 std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;

249 bool foldInstOperand(MachineInstr &MI, const FoldableDef &OpToFold) const;

250

251 bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;

254

257

258 std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;

263

265

266public:

267 SIFoldOperandsImpl() = default;

268

270};

271

273public:

274 static char ID;

275

277

278 bool runOnMachineFunction(MachineFunction &MF) override {

280 return false;

281 return SIFoldOperandsImpl().run(MF);

282 }

283

284 StringRef getPassName() const override { return "SI Fold Operands"; }

285

286 void getAnalysisUsage(AnalysisUsage &AU) const override {

289 }

290

293 }

294};

295

296}

297

299 false)

300

301char SIFoldOperandsLegacy::ID = 0;

302

304

310 TRI.getSubRegisterClass(RC, MO.getSubReg()))

311 RC = SubRC;

312 return RC;

313}

314

315
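// Map a MAC/FMAC opcode to its corresponding three-address MAD/FMA opcode,
// or INSTRUCTION_LIST_END if there is no equivalent.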

static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_t16_e64;
  case AMDGPU::V_FMAC_F16_fake16_e64:
    return AMDGPU::V_FMA_F16_gfx9_fake16_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}

337

338

339
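// Check whether a frame index can reasonably be folded into use operand OpNo
// of UseMI: adds whose other operand is an immediate, MUBUF vaddr, or FLAT
// scratch saddr/vaddr.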

bool SIFoldOperandsImpl::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                           const FoldableDef &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  switch (Opc) {
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U32:
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return UseMI.getOperand(OpNo == 1 ? 2 : 1).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    return UseMI.getOperand(OpNo == 2 ? 3 : 2).isImm() &&
           MRI->hasOneNonDBGUse(UseMI.getOperand(OpNo).getReg());
  default:
    break;
  }

  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

376

377

378

379

380bool SIFoldOperandsImpl::foldCopyToVGPROfScalarAddOfFrameIndex(

382 if (TRI->isVGPR(*MRI, DstReg) && TRI->isSGPRReg(*MRI, SrcReg) &&

383 MRI->hasOneNonDBGUse(SrcReg)) {

384 MachineInstr *Def = MRI->getVRegDef(SrcReg);

385 if (!Def || Def->getNumOperands() != 4)

386 return false;

387

388 MachineOperand *Src0 = &Def->getOperand(1);

389 MachineOperand *Src1 = &Def->getOperand(2);

390

391

392

393

394 if (!Src0->isFI() && !Src1->isFI())

395 return false;

396

397 if (Src0->isFI())

399

400 const bool UseVOP3 = !Src0->isImm() || TII->isInlineConstant(*Src0);

401 unsigned NewOp = convertToVALUOp(Def->getOpcode(), UseVOP3);

402 if (NewOp == AMDGPU::INSTRUCTION_LIST_END ||

403 Def->getOperand(3).isDead())

404 return false;

405

406 MachineBasicBlock *MBB = Def->getParent();

408 if (NewOp != AMDGPU::V_ADD_CO_U32_e32) {

409 MachineInstrBuilder Add =

411

412 if (Add->getDesc().getNumDefs() == 2) {

413 Register CarryOutReg = MRI->createVirtualRegister(TRI->getBoolRC());

415 MRI->setRegAllocationHint(CarryOutReg, 0, TRI->getVCC());

416 }

417

418 Add.add(*Src0).add(*Src1).setMIFlags(Def->getFlags());

420 Add.addImm(0);

421

422 Def->eraseFromParent();

423 MI.eraseFromParent();

424 return true;

425 }

426

427 assert(NewOp == AMDGPU::V_ADD_CO_U32_e32);

428

432

434 .add(*Src0)

435 .add(*Src1)

438 Def->eraseFromParent();

439 MI.eraseFromParent();

440 return true;

441 }

442 }

443

444 return false;

445}

446

448 return new SIFoldOperandsLegacy();

449}

450

451bool SIFoldOperandsImpl::canUseImmWithOpSel(const MachineInstr *MI,

452 unsigned UseOpNo,

453 int64_t ImmVal) const {

454 const uint64_t TSFlags = MI->getDesc().TSFlags;

455

459 return false;

460

462 int OpNo = MI->getOperandNo(&Old);

463

464 unsigned Opcode = MI->getOpcode();

465 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

466 switch (OpType) {

467 default:

468 return false;

476

477

479 static_cast<uint16_t>(ImmVal) != static_cast<uint16_t>(ImmVal >> 16))

480 return false;

481 break;

482 }

483

484 return true;

485}

486

487bool SIFoldOperandsImpl::tryFoldImmWithOpSel(MachineInstr *MI, unsigned UseOpNo,

488 int64_t ImmVal) const {

489 MachineOperand &Old = MI->getOperand(UseOpNo);

490 unsigned Opcode = MI->getOpcode();

491 int OpNo = MI->getOperandNo(&Old);

492 uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

493

494

495

496

499 return true;

500 }

501

502

503

504 AMDGPU::OpName ModName = AMDGPU::OpName::NUM_OPERAND_NAMES;

505 unsigned SrcIdx = ~0;

506 if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {

507 ModName = AMDGPU::OpName::src0_modifiers;

508 SrcIdx = 0;

509 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {

510 ModName = AMDGPU::OpName::src1_modifiers;

511 SrcIdx = 1;

512 } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {

513 ModName = AMDGPU::OpName::src2_modifiers;

514 SrcIdx = 2;

515 }

516 assert(ModName != AMDGPU::OpName::NUM_OPERAND_NAMES);

517 int ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModName);

518 MachineOperand &Mod = MI->getOperand(ModIdx);

519 unsigned ModVal = Mod.getImm();

520

521 uint16_t ImmLo =

522 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));

523 uint16_t ImmHi =

524 static_cast<uint16_t>(ImmVal >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));

525 uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;

527

528

529

530 auto tryFoldToInline = [&](uint32_t Imm) -> bool {

534 return true;

535 }

536

537

538

539 uint16_t Lo = static_cast<uint16_t>(Imm);

540 uint16_t Hi = static_cast<uint16_t>(Imm >> 16);

541 if (Lo == Hi) {

543 Mod.setImm(NewModVal);

545 return true;

546 }

547

548 if (static_cast<int16_t>(Lo) < 0) {

549 int32_t SExt = static_cast<int16_t>(Lo);

551 Mod.setImm(NewModVal);

553 return true;

554 }

555 }

556

557

562 return true;

563 }

564 }

565 } else {

566 uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;

570 return true;

571 }

572 }

573

574 return false;

575 };

576

577 if (tryFoldToInline(Imm))

578 return true;

579

580

581

582

583

584

585

586 bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;

587 bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;

588 if (SrcIdx == 1 && (IsUAdd || IsUSub)) {

589 unsigned ClampIdx =

590 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);

591 bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

592

593 if (!Clamp) {

594 uint16_t NegLo = -static_cast<uint16_t>(Imm);

595 uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);

596 uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

597

598 if (tryFoldToInline(NegImm)) {

599 unsigned NegOpcode =

600 IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;

601 MI->setDesc(TII->get(NegOpcode));

602 return true;

603 }

604 }

605 }

606

607 return false;

608}

609

610bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {

611 MachineInstr *MI = Fold.UseMI;

612 MachineOperand &Old = MI->getOperand(Fold.UseOpNo);

614

615 std::optional<int64_t> ImmVal;

616 if (Fold.isImm())

617 ImmVal = Fold.Def.getEffectiveImmVal();

618

619 if (ImmVal && canUseImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal)) {

620 if (tryFoldImmWithOpSel(Fold.UseMI, Fold.UseOpNo, *ImmVal))

621 return true;

622

623

624

626 int OpNo = MI->getOperandNo(&Old);

627 if (TII->isOperandLegal(*MI, OpNo, &New))

628 return false;

630 return true;

631 }

632

633 if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {

634 MachineBasicBlock *MBB = MI->getParent();

637 LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");

638 return false;

639 }

640

641 int Op32 = Fold.ShrinkOpcode;

642 MachineOperand &Dst0 = MI->getOperand(0);

643 MachineOperand &Dst1 = MI->getOperand(1);

645

  bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

647

648 const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());

649 Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

650

651 MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

652

653 if (HaveNonDbgCarryUse) {

657 }

658

659

660

661

662

663

664

665 Dst0.setReg(NewReg0);

666 for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)

667 MI->removeOperand(I);

668 MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

669

670 if (Fold.Commuted)

672 return true;

673 }

674

675 assert(!Fold.needsShrink() && "not handled");

676

677 if (ImmVal) {

680 if (NewMFMAOpc == -1)

681 return false;

682 MI->setDesc(TII->get(NewMFMAOpc));

683 MI->untieRegOperand(0);

684 const MCInstrDesc &MCID = MI->getDesc();

685 for (unsigned I = 0; I < MI->getNumDefs(); ++I)

687 MI->getOperand(I).setIsEarlyClobber(true);

688 }

689

690

692 int OpNo = MI->getOperandNo(&Old);

693 if (TII->isOperandLegal(*MI, OpNo, &New))

694 return false;

695

697 return true;

698 }

699

700 if (Fold.isGlobal()) {

701 Old.ChangeToGA(Fold.Def.OpToFold->getGlobal(),

702 Fold.Def.OpToFold->getOffset(),

703 Fold.Def.OpToFold->getTargetFlags());

704 return true;

705 }

706

707 if (Fold.isFI()) {

709 return true;

710 }

711

712 MachineOperand *New = Fold.Def.OpToFold;

713

714

715 if (const TargetRegisterClass *OpRC =

717 const TargetRegisterClass *NewRC =

718 TRI->getRegClassForReg(*MRI, New->getReg());

719

720 const TargetRegisterClass *ConstrainRC = OpRC;

721 if (New->getSubReg()) {

722 ConstrainRC =

723 TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg());

724

725 if (!ConstrainRC)

726 return false;

727 }

728

729 if (New->getReg().isVirtual() &&

730 MRI->constrainRegClass(New->getReg(), ConstrainRC)) {

732 << TRI->getRegClassName(ConstrainRC) << '\n');

733 return false;

734 }

735 }

736

737

738

739 if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))

740 Old.setSubReg(AMDGPU::NoSubRegister);

741 if (New->getReg().isPhysical()) {

743 } else {

746 }

747 return true;

748}

749

751 FoldCandidate &&Entry) {

752

753 for (FoldCandidate &Fold : FoldList)

754 if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)

755 return;

756 LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")

757 << " operand " << Entry.UseOpNo << "\n " << *Entry.UseMI);

759}

760

763 const FoldableDef &FoldOp,

764 bool Commuted = false, int ShrinkOp = -1) {

766 FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));

767}

768

769

770

771

774 if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput())

775 return false;

776 const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo];

778}

779

780

781

782

784 const FoldableDef &OpToFold) {

785 assert(OpToFold.isImm() && "Expected immediate operand");

786 uint64_t ImmVal = OpToFold.getEffectiveImmVal().value();

789 return Lo == Hi;

790}

791

792bool SIFoldOperandsImpl::tryAddToFoldList(

793 SmallVectorImpl &FoldList, MachineInstr *MI, unsigned OpNo,

794 const FoldableDef &OpToFold) const {

795 const unsigned Opc = MI->getOpcode();

796

797 auto tryToFoldAsFMAAKorMK = [&]() {

798 if (!OpToFold.isImm())

799 return false;

800

801 const bool TryAK = OpNo == 3;

802 const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;

803 MI->setDesc(TII->get(NewOpc));

804

805

806 bool FoldAsFMAAKorMK =

807 tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);

808 if (FoldAsFMAAKorMK) {

809

810 MI->untieRegOperand(3);

811

812 if (OpNo == 1) {

813 MachineOperand &Op1 = MI->getOperand(1);

814 MachineOperand &Op2 = MI->getOperand(2);

816

817 if (Op2.isImm()) {

820 } else {

823 }

824 }

825 return true;

826 }

828 return false;

829 };

830

831 bool IsLegal = OpToFold.isOperandLegal(*TII, *MI, OpNo);

832 if (!IsLegal && OpToFold.isImm()) {

833 if (std::optional<int64_t> ImmVal = OpToFold.getEffectiveImmVal())

834 IsLegal = canUseImmWithOpSel(MI, OpNo, *ImmVal);

835 }

836

837 if (!IsLegal) {

838

840 if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {

841

842

843 MI->setDesc(TII->get(NewOpc));

846 if (AddOpSel)

848 bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);

849 if (FoldAsMAD) {

850 MI->untieRegOperand(OpNo);

851 return true;

852 }

853 if (AddOpSel)

854 MI->removeOperand(MI->getNumExplicitOperands() - 1);

856 }

857

858

859

860 if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {

861 if (tryToFoldAsFMAAKorMK())

862 return true;

863 }

864

865

866 if (OpToFold.isImm()) {

867 unsigned ImmOpc = 0;

868 if (Opc == AMDGPU::S_SETREG_B32)

869 ImmOpc = AMDGPU::S_SETREG_IMM32_B32;

870 else if (Opc == AMDGPU::S_SETREG_B32_mode)

871 ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;

872 if (ImmOpc) {

873 MI->setDesc(TII->get(ImmOpc));

875 return true;

876 }

877 }

878

879

880

883 if (!CanCommute)

884 return false;

885

886 MachineOperand &Op = MI->getOperand(OpNo);

887 MachineOperand &CommutedOp = MI->getOperand(CommuteOpNo);

888

889

890

891

892

    if (!Op.isReg() || !CommutedOp.isReg())

894 return false;

895

896

897

898 if (Op.isReg() && CommutedOp.isReg() &&

899 (Op.getReg() == CommutedOp.getReg() &&

900 Op.getSubReg() == CommutedOp.getSubReg()))

901 return false;

902

904 return false;

905

906 int Op32 = -1;

907 if (!OpToFold.isOperandLegal(*TII, *MI, CommuteOpNo)) {

908 if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&

909 Opc != AMDGPU::V_SUBREV_CO_U32_e64) ||

910 (!OpToFold.isImm() && !OpToFold.isFI() && !OpToFold.isGlobal())) {

912 return false;

913 }

914

915

916

917 MachineOperand &OtherOp = MI->getOperand(OpNo);

918 if (!OtherOp.isReg() ||

920 return false;

921

922 assert(MI->getOperand(1).isDef());

923

924

925 unsigned MaybeCommutedOpc = MI->getOpcode();

927 }

928

930 Op32);

931 return true;

932 }

933

934

935

936

937

938

939 if (Opc == AMDGPU::S_FMAC_F32 &&

940 (OpNo != 1 || MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {

941 if (tryToFoldAsFMAAKorMK())

942 return true;

943 }

944

945

946

947 if (OpToFold.isImm() &&

950 return false;

951

953 return true;

954}

955

956bool SIFoldOperandsImpl::isUseSafeToFold(const MachineInstr &MI,

957 const MachineOperand &UseMO) const {

958

959 return TII->isSDWA(MI);

960}

961

967 SubDef && TII.isFoldableCopy(*SubDef);

968 SubDef = MRI.getVRegDef(Sub->getReg())) {

969 unsigned SrcIdx = TII.getFoldableCopySrcIdx(*SubDef);

971

972 if (SrcOp.isImm())

975 break;

977

978 if (SrcOp.getSubReg())

979 break;

980 }

981

982 return Sub;

983}

984

985const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(

986 MachineInstr &RegSeq,

987 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs) const {

988

990

991 const TargetRegisterClass *RC = nullptr;

992

994 MachineOperand &SrcOp = RegSeq.getOperand(I);

996

997

998 const TargetRegisterClass *OpRC = getRegOpRC(*MRI, *TRI, SrcOp);

999 if (!RC)

1000 RC = OpRC;

    else if (!TRI->getCommonSubClass(RC, OpRC))

1002 return nullptr;

1003

1005

1006 Defs.emplace_back(&SrcOp, SubRegIdx);

1007 continue;

1008 }

1009

1011 if (DefSrc && (DefSrc->isReg() || DefSrc->isImm())) {

1012 Defs.emplace_back(DefSrc, SubRegIdx);

1013 continue;

1014 }

1015

1016 Defs.emplace_back(&SrcOp, SubRegIdx);

1017 }

1018

1019 return RC;

1020}

1021

1022

1023

1024

1025const TargetRegisterClass *SIFoldOperandsImpl::getRegSeqInit(

1026 SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,

1028 MachineInstr *Def = MRI->getVRegDef(UseReg);

  if (!Def || !Def->isRegSequence())

1030 return nullptr;

1031

1032 return getRegSeqInit(*Def, Defs);

1033}

1034

1035std::pair<int64_t, const TargetRegisterClass *>

1036SIFoldOperandsImpl::isRegSeqSplat(MachineInstr &RegSeq) const {

1038 const TargetRegisterClass *SrcRC = getRegSeqInit(RegSeq, Defs);

1039 if (!SrcRC)

1040 return {};

1041

1042 bool TryToMatchSplat64 = false;

1043

1044 int64_t Imm;

1045 for (unsigned I = 0, E = Defs.size(); I != E; ++I) {

1046 const MachineOperand *Op = Defs[I].first;

    if (!Op->isImm())

1048 return {};

1049

1050 int64_t SubImm = Op->getImm();

    if (!I) {

1052 Imm = SubImm;

1053 continue;

1054 }

1055

1056 if (Imm != SubImm) {

1057 if (I == 1 && (E & 1) == 0) {

1058

1059

1060 TryToMatchSplat64 = true;

1061 break;

1062 }

1063

1064 return {};

1065 }

1066 }

1067

1068 if (!TryToMatchSplat64)

1069 return {Defs[0].first->getImm(), SrcRC};

1070

1071

1072

1073 int64_t SplatVal64;

1074 for (unsigned I = 0, E = Defs.size(); I != E; I += 2) {

1075 const MachineOperand *Op0 = Defs[I].first;

1076 const MachineOperand *Op1 = Defs[I + 1].first;

1077

1079 return {};

1080

1081 unsigned SubReg0 = Defs[I].second;

1082 unsigned SubReg1 = Defs[I + 1].second;

1083

1084

1085

1086 if (TRI->getChannelFromSubReg(SubReg0) + 1 !=

1087 TRI->getChannelFromSubReg(SubReg1))

1088 return {};

1089

1091 if (I == 0)

1092 SplatVal64 = MergedVal;

1093 else if (SplatVal64 != MergedVal)

1094 return {};

1095 }

1096

1097 const TargetRegisterClass *RC64 = TRI->getSubRegisterClass(

1099

1100 return {SplatVal64, RC64};

1101}

1102

1103bool SIFoldOperandsImpl::tryFoldRegSeqSplat(

1104 MachineInstr *UseMI, unsigned UseOpIdx, int64_t SplatVal,

1105 const TargetRegisterClass *SplatRC) const {

1107 if (UseOpIdx >= Desc.getNumOperands())

1108 return false;

1109

1110

1112 return false;

1113

1115 if (RCID == -1)

1116 return false;

1117

1118 const TargetRegisterClass *OpRC = TRI->getRegClass(RCID);

1119

1120

1121

1122

1123 if (SplatVal != 0 && SplatVal != -1) {

1124

1125

1126

1127 uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;

1128 switch (OpTy) {

1133 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0);

1134 break;

1138 OpRC = TRI->getSubRegisterClass(OpRC, AMDGPU::sub0_sub1);

1139 break;

1140 default:

1141 return false;

1142 }

1143

    if (!TRI->getCommonSubClass(OpRC, SplatRC))

1145 return false;

1146 }

1147

  MachineOperand TmpOp = MachineOperand::CreateImm(SplatVal);
  if (!TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp))
    return false;

1151

1152 return true;

1153}

1154

1155bool SIFoldOperandsImpl::tryToFoldACImm(

1156 const FoldableDef &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,

1157 SmallVectorImpl &FoldList) const {

1159 if (UseOpIdx >= Desc.getNumOperands())

1160 return false;

1161

1162

1164 return false;

1165

1166 if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) {

1169 return false;

1171 return true;

1172 }

1173

1174 return false;

1175}

1176

1177void SIFoldOperandsImpl::foldOperand(

1178 FoldableDef OpToFold, MachineInstr *UseMI, int UseOpIdx,

1179 SmallVectorImpl &FoldList,

1180 SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {

1181 const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

1182

1183 if (!isUseSafeToFold(*UseMI, *UseOp))

1184 return;

1185

1186

1187 if (UseOp->isReg() && OpToFold.isReg()) {

1189 return;

1190

1191 if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&

1192 (UseOp->getSubReg() != AMDGPU::lo16 ||

1193 TRI->isSGPRReg(*MRI, OpToFold.getReg())))

1194 return;

1195 }

1196

1197

1198

1199

1203

1204 int64_t SplatVal;

1205 const TargetRegisterClass *SplatRC;

1206 std::tie(SplatVal, SplatRC) = isRegSeqSplat(*UseMI);

1207

1208

1211 for (unsigned I = 0; I != UsesToProcess.size(); ++I) {

1212 MachineOperand *RSUse = UsesToProcess[I];

1213 MachineInstr *RSUseMI = RSUse->getParent();

1214 unsigned OpNo = RSUseMI->getOperandNo(RSUse);

1215

1216 if (SplatRC) {

1217 if (RSUseMI->isCopy()) {

1221 continue;

1222 }

1223 if (tryFoldRegSeqSplat(RSUseMI, OpNo, SplatVal, SplatRC)) {

1224 FoldableDef SplatDef(SplatVal, SplatRC);

1226 continue;

1227 }

1228 }

1229

1230

1231 if (RSUse->getSubReg() != RegSeqDstSubReg)

1232 continue;

1233

1234

1235

1236 foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,

1237 CopiesToReplace);

1238 }

1239

1240 return;

1241 }

1242

1243 if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))

1244 return;

1245

1246 if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {

1247

1248

1249

1251 if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=

1253 return;

1254

1255

1256

1257 MachineOperand &SOff =

1258 *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);

1259 if (!SOff.isImm() || SOff.getImm() != 0)

1260 return;

1261 }

1262

1264 if (TII->isFLATScratch(*UseMI) &&

1268 unsigned CPol =

1269 TII->getNamedOperand(*UseMI, AMDGPU::OpName::cpol)->getImm();

1272 return;

1273

1275 }

1276

1277

1278

1280

1281 return;

1282 }

1283

1284 bool FoldingImmLike =

1285 OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

1286

1287 if (FoldingImmLike && UseMI->isCopy()) {

1292

1293 const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

1294

1295

1296

1297

1299 return;

1300

1301 const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);

1302

1303

1304 for (unsigned MovOp :

1305 {AMDGPU::S_MOV_B32, AMDGPU::V_MOV_B32_e32, AMDGPU::S_MOV_B64,

1306 AMDGPU::V_MOV_B64_PSEUDO, AMDGPU::V_MOV_B16_t16_e64,

1307 AMDGPU::V_ACCVGPR_WRITE_B32_e64, AMDGPU::AV_MOV_B32_IMM_PSEUDO,

1308 AMDGPU::AV_MOV_B64_IMM_PSEUDO}) {

1309 const MCInstrDesc &MovDesc = TII->get(MovOp);

1310 const TargetRegisterClass *MovDstRC =

1312

1313

1314

1315

1317 continue;

1318

1319 const int SrcIdx = MovOp == AMDGPU::V_MOV_B16_t16_e64 ? 2 : 1;

1320

1322 if (RegClassID != -1) {

1323 const TargetRegisterClass *MovSrcRC = TRI->getRegClass(RegClassID);

1324

1325 if (UseSubReg)

1326 MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg);

1327

1328

1329

1330 if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO &&

1331 (!OpToFold.isImm() ||

1332 TII->isImmOperandLegal(MovDesc, SrcIdx,

1333 *OpToFold.getEffectiveImmVal())))

1334 break;

1335

1336 if (MRI->constrainRegClass(SrcReg, MovSrcRC))

1337 break;

1338

1339

1340

1341 } else {

1342

1343

1344

1345

1346 if (!OpToFold.isImm() ||

1347 TII->isImmOperandLegal(MovDesc, 1, *OpToFold.getEffectiveImmVal()))

1348 break;

1349 }

1350

1353 while (ImpOpI != ImpOpE) {

1355 ImpOpI++;

1357 }

1359

1360 if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {

1362 MachineOperand NewSrcOp(SrcOp);

1363 MachineFunction *MF = UseMI->getMF();

1368 UseOpIdx = SrcIdx;

1370 }

1372 break;

1373 }

1374

1375

1377 return;

1378

1379 } else {

1380 if (UseMI->isCopy() && OpToFold.isReg() &&

1384 LLVM_DEBUG(dbgs() << "Folding " << OpToFold.OpToFold << "\n into "

1389 unsigned SubRegIdx = OpToFold.getSubReg();

1390

1391

1392

1393

1394

1395

1396

1397

1398

1399

1400

1401

1402

1403 static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");

1406

1407

1408 if (SubRegIdx > AMDGPU::sub1) {

1409 LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);

1410 M |= M.getLane(M.getHighestLane() - 1);

1411 SmallVector<unsigned, 4> Indexes;

1412 TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,

1413 Indexes);

1414 assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");

1415 SubRegIdx = Indexes[0];

1416

1417 } else if (TII->getOpSize(*UseMI, 1) == 4)

1418 SubRegIdx = 0;

1419 else

1420 SubRegIdx = AMDGPU::sub0;

1421 }

1425 OpToFold.OpToFold->setIsKill(false);

1426

1427

1429 if (foldCopyToAGPRRegSequence(UseMI))

1430 return;

1431 }

1432

1434 if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||

1435 (UseOpc == AMDGPU::V_READLANE_B32 &&

1436 (int)UseOpIdx ==

1437 AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {

1438

1439

1440

1441

1442 if (FoldingImmLike) {

1445 *OpToFold.DefMI, *UseMI))

1446 return;

1447

1449

1450 if (OpToFold.isImm()) {

1452 *OpToFold.getEffectiveImmVal());

1453 } else if (OpToFold.isFI())

1455 else {

1456 assert(OpToFold.isGlobal());

1458 OpToFold.OpToFold->getOffset(),

1459 OpToFold.OpToFold->getTargetFlags());

1460 }

1462 return;

1463 }

1464

1465 if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {

1468 *OpToFold.DefMI, *UseMI))

1469 return;

1470

1471

1472

1473

1474

1480 return;

1481 }

1482 }

1483

1484 const MCInstrDesc &UseDesc = UseMI->getDesc();

1485

1486

1487

1489 UseDesc.operands()[UseOpIdx].RegClass == -1)

1490 return;

1491 }

1492

1493

1494

1495

1496

1497 tryAddToFoldList(FoldList, UseMI, UseOpIdx, OpToFold);

1498}

1499
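// Constant fold a 32-bit bitwise or shift operation on two known immediate
// operands, writing the folded value to Result.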

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

1565

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

1569
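// Return the immediate value of Op, either directly or by looking through a
// move-immediate that defines the virtual register it reads.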

std::optional<int64_t>
SIFoldOperandsImpl::getImmOrMaterializedImm(MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg() || !Op.getReg().isVirtual())
    return std::nullopt;

  const MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    const MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return TII->extractSubregFromImm(ImmSrc.getImm(), Op.getSubReg());
  }

  return std::nullopt;
}

1587

1588

1589

1590

1591bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {

1592 if (MI->allImplicitDefsAreDead())

1593 return false;

1594

1595 unsigned Opc = MI->getOpcode();

1596

1597 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);

1598 if (Src0Idx == -1)

1599 return false;

1600

1601 MachineOperand *Src0 = &MI->getOperand(Src0Idx);

1602 std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);

1603

1604 if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||

1605 Opc == AMDGPU::S_NOT_B32) &&

1606 Src0Imm) {

1607 MI->getOperand(1).ChangeToImmediate(~*Src0Imm);

1608 TII->mutateAndCleanupImplicit(

1610 return true;

1611 }

1612

1613 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);

1614 if (Src1Idx == -1)

1615 return false;

1616

1617 MachineOperand *Src1 = &MI->getOperand(Src1Idx);

1618 std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);

1619

1620 if (!Src0Imm && !Src1Imm)

1621 return false;

1622

1623

1624

1625

1626 if (Src0Imm && Src1Imm) {

1627 int32_t NewImm;

1629 return false;

1630

1631 bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

1632

1633

1634

1635 MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);

1636 MI->removeOperand(Src1Idx);

1638 return true;

1639 }

1640

  if (!MI->isCommutable())
    return false;

  if (Src0Imm && !Src1Imm) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
    std::swap(Src0Imm, Src1Imm);
  }

1649

1650 int32_t Src1Val = static_cast<int32_t>(*Src1Imm);

1651 if (Opc == AMDGPU::V_OR_B32_e64 ||

1652 Opc == AMDGPU::V_OR_B32_e32 ||

1653 Opc == AMDGPU::S_OR_B32) {

1654 if (Src1Val == 0) {

1655

1656 MI->removeOperand(Src1Idx);

1657 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));

1658 } else if (Src1Val == -1) {

1659

1660 MI->removeOperand(Src1Idx);

1661 TII->mutateAndCleanupImplicit(

1663 } else

1664 return false;

1665

1666 return true;

1667 }

1668

1669 if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||

1670 Opc == AMDGPU::S_AND_B32) {

1671 if (Src1Val == 0) {

1672

1673 MI->removeOperand(Src0Idx);

1674 TII->mutateAndCleanupImplicit(

1676 } else if (Src1Val == -1) {

1677

1678 MI->removeOperand(Src1Idx);

1679 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));

1680 } else

1681 return false;

1682

1683 return true;

1684 }

1685

1686 if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||

1687 Opc == AMDGPU::S_XOR_B32) {

1688 if (Src1Val == 0) {

1689

1690 MI->removeOperand(Src1Idx);

1691 TII->mutateAndCleanupImplicit(*MI, TII->get(AMDGPU::COPY));

1692 return true;

1693 }

1694 }

1695

1696 return false;

1697}

1698

1699
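// Fold a V_CNDMASK whose two value sources are identical (or evaluate to the
// same immediate) into a plain copy or move.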

bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm)
      return false;

    std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
    if (!Src0Imm || *Src0Imm != *Src1Imm)
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  TII->mutateAndCleanupImplicit(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}

1741
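// Fold away a "v_and_b32 dst, 0xffff, src" when the instruction defining src
// already zeroes the high 16 bits of its result.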

bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  return true;
}

1763

1764bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,

1765 const FoldableDef &OpToFold) const {

1766

1767

1768

1769 SmallVector<MachineInstr *, 4> CopiesToReplace;

1771 MachineOperand &Dst = MI.getOperand(0);

1773

1774 if (OpToFold.isImm()) {

1775 for (auto &UseMI :

1777

1778

1779

1780

1781

1782

1783

1784

1785 if (tryConstantFoldOp(&UseMI)) {

1788 }

1789 }

1790 }

1791

1794 for (auto *U : UsesToProcess) {

1795 MachineInstr *UseMI = U->getParent();

1796

1797 FoldableDef SubOpToFold = OpToFold.getWithSubReg(*TRI, U->getSubReg());

1799 CopiesToReplace);

1800 }

1801

  if (CopiesToReplace.empty() && FoldList.empty())
    return false;

1805 MachineFunction *MF = MI.getMF();

1806

1807 for (MachineInstr *Copy : CopiesToReplace)

1808 Copy->addImplicitDefUseOperands(*MF);

1809

1810 SetVector<MachineInstr *> ConstantFoldCandidates;

1811 for (FoldCandidate &Fold : FoldList) {

1812 assert(!Fold.isReg() || Fold.Def.OpToFold);

1813 if (Fold.isReg() && Fold.getReg().isVirtual()) {

1815 const MachineInstr *DefMI = Fold.Def.DefMI;

1818 continue;

1819 }

1821

1822 if (Fold.isReg()) {

1823 assert(Fold.Def.OpToFold && Fold.isReg());

1824

1825

1826

1827 MRI->clearKillFlags(Fold.getReg());

1828 }

1829 LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "

1830 << static_cast<int>(Fold.UseOpNo) << " of "

1831 << *Fold.UseMI);

1832

1833 if (Fold.isImm())

1834 ConstantFoldCandidates.insert(Fold.UseMI);

1835

1836 } else if (Fold.Commuted) {

1837

1839 }

1840 }

1841

1842 for (MachineInstr *MI : ConstantFoldCandidates) {

1843 if (tryConstantFoldOp(MI)) {

1846 }

1847 }

1848 return true;

1849}

1850

1851

1852

1853bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {

1854

1855

1856

1857

1858 const TargetRegisterClass *DefRC =

1860 if (TRI->isAGPRClass(DefRC))

1861 return false;

1862

1864 MachineInstr *RegSeq = MRI->getVRegDef(UseReg);

1866 return false;

1867

1869 MachineBasicBlock &MBB = *CopyMI->getParent();

1870

1871 MachineInstrBuilder B(*MBB.getParent(), CopyMI);

1872 DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;

1873

1874 const TargetRegisterClass *UseRC =

1876

1877

1879

1880 unsigned NumRegSeqOperands = RegSeq->getNumOperands();

1881 unsigned NumFoldable = 0;

1882

1883 for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {

1884 MachineOperand &RegOp = RegSeq->getOperand(I);

1886

1888

1890 continue;

1891 }

1892

1896

1897 if (Lookup->isImm()) {

1898

1899 const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(

1900 DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);

1901 if (DestSuperRC &&

1903 ++NumFoldable;

1905 continue;

1906 }

1907 }

1908

1909 const TargetRegisterClass *InputRC =

1911 : MRI->getRegClass(RegOp.getReg());

1912

1913

1914

1915

1916

1917

1918

1919 const TargetRegisterClass *MatchRC =

1920 TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);

1921 if (!MatchRC) {

1922 ++NumFoldable;

1924 continue;

1925 }

1926

1928 }

1929

1930

1931 if (NumFoldable == 0)

1932 return false;

1933

1934 CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));

1937

1938 for (auto [Def, DestSubIdx] : NewDefs) {

1939 if (Def->isReg()) {

1940

1941

1942 Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);

1943 BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)

1944 .add(*Def);

1946 } else {

1947 TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);

1948 Def->setIsKill(false);

1949

1950 Register &VGPRCopy = VGPRCopies[Src];

1951 if (!VGPRCopy) {

1952 const TargetRegisterClass *VGPRUseSubRC =

1953 TRI->getSubRegisterClass(UseRC, DestSubIdx);

1954

1955

1956

1957

1958

1959

1960

1961

1962 const TargetRegisterClass *SubRC =

1963 TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);

1965

1966 VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);

1969 } else {

1970

1971 B.add(*Def);

1972 }

1973 } else {

1974 B.addReg(VGPRCopy);

1975 }

1976 }

1977

1978 B.addImm(DestSubIdx);

1979 }

1980

1982 return true;

1983}

1984

1985bool SIFoldOperandsImpl::tryFoldFoldableCopy(

1986 MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {

1987 Register DstReg = MI.getOperand(0).getReg();

1988

1989

1990 if (DstReg == AMDGPU::M0) {

1991 MachineOperand &NewM0Val = MI.getOperand(1);

1992 if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {

1993 MI.eraseFromParent();

1994 return true;

1995 }

1996

1997

1999 ? nullptr

2000 : &NewM0Val;

2001 return false;

2002 }

2003

2004 MachineOperand *OpToFoldPtr;

2005 if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) {

2006

2007 if (TII->hasAnyModifiersSet(MI))

2008 return false;

2009 OpToFoldPtr = &MI.getOperand(2);

2010 } else

2011 OpToFoldPtr = &MI.getOperand(1);

2012 MachineOperand &OpToFold = *OpToFoldPtr;

2013 bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

2014

2015

2016 if (!FoldingImm && !OpToFold.isReg())

2017 return false;

2018

2019

2021 TRI->isConstantPhysReg(OpToFold.getReg()))

2022 return false;

2023

2024

2025

2026

2027

2028

2029

2031 return false;

2032

2033 const TargetRegisterClass *DstRC =

2034 MRI->getRegClass(MI.getOperand(0).getReg());

2035

2036

2037

2038

2039

2040

2041

2042

2043

2044

2045

2046

2047

2048

2049

2050 if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&

2052 if (DstRC == &AMDGPU::SReg_32RegClass &&

2053 DstRC == MRI->getRegClass(OpToFold.getReg())) {

2056 }

2057 }

2058

2059

2060

2061 if (OpToFold.isReg() && MI.isCopy() && MI.getOperand(1).getSubReg()) {

2062 if (foldCopyToAGPRRegSequence(&MI))

2063 return true;

2064 }

2065

2066 FoldableDef Def(OpToFold, DstRC);

2067 bool Changed = foldInstOperand(MI, Def);

2068

2069

2070

2071

2072

2073

2074 auto *InstToErase = &MI;

2075 while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {

2076 auto &SrcOp = InstToErase->getOperand(1);

2078 InstToErase->eraseFromParent();

2080 InstToErase = nullptr;

2082 break;

2083 InstToErase = MRI->getVRegDef(SrcReg);

2084 if (!InstToErase || TII->isFoldableCopy(*InstToErase))

2085 break;

2086 }

2087

2088 if (InstToErase && InstToErase->isRegSequence() &&

2089 MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {

2090 InstToErase->eraseFromParent();

2092 }

2093

2095 return true;

2096

2097

2098

2099

2100 return OpToFold.isReg() &&

2101 foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI);

2102}

2103

2104

2105
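// Recognize the canonical clamp pattern: a v_max-style instruction with the
// clamp bit set and identical, unmodified register sources. Returns the
// clamped source operand, or null.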

const MachineOperand *
SIFoldOperandsImpl::isClamp(const MachineInstr &MI) const {
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_t16_e64:
  case AMDGPU::V_MAX_F16_fake16_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_PK_MAX_F16:
  case AMDGPU::V_MAX_BF16_PSEUDO_e64:
  case AMDGPU::V_PK_MAX_NUM_BF16: {
    if (MI.mayRaiseFPException())
      return nullptr;

    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
      return nullptr;

    // Make sure sources are identical.
    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    if (!Src0->isReg() || !Src1->isReg() ||
        Src0->getReg() != Src1->getReg() ||
        Src0->getSubReg() != Src1->getSubReg() ||
        Src0->getSubReg() != AMDGPU::NoSubRegister)
      return nullptr;

    // Can't fold up if we have modifiers.
    if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
      return nullptr;

    unsigned Src0Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    unsigned Src1Mods
      = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();

    // Having a 0 op_sel_hi would require swizzling the output in the source
    // instruction, which we can't do.
    unsigned UnsetMods =
        (Op == AMDGPU::V_PK_MAX_F16 || Op == AMDGPU::V_PK_MAX_NUM_BF16)
            ? SISrcMods::OP_SEL_1
            : 0u;
    if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
      return nullptr;
    return Src0;
  }
  default:
    return nullptr;
  }
}

2157

2158
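// Fold the clamp bit of a recognized clamp pattern into the instruction that
// defines its source, and replace uses of the max with that definition.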

2159bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {

2160 const MachineOperand *ClampSrc = isClamp(MI);

  if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
    return false;

2163

2165 return false;

2166

2167

2169 MachineInstr *Def =

2170 MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg());

2171

2172

2173 if (TII->getClampMask(*Def) != TII->getClampMask(MI))

2174 return false;

2175

2176 if (Def->mayRaiseFPException())

2177 return false;

2178

2179 MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);

2180 if (!DefClamp)

2181 return false;

2182

2183 LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);

2184

2185

2186 DefClamp->setImm(1);

2187

2188 Register DefReg = Def->getOperand(0).getReg();

2189 Register MIDstReg = MI.getOperand(0).getReg();

2190 if (TRI->isSGPRReg(*MRI, DefReg)) {

2191

2192

2194 MIDstReg)

2196 } else {

2197 MRI->replaceRegWith(MIDstReg, DefReg);

2198 }

2199 MI.eraseFromParent();

2200

2201

2202

2203

2205 Def->eraseFromParent();

2206

2207 return true;

2208}

2209

2211 switch (Opc) {

2212 case AMDGPU::V_MUL_F64_e64:

2213 case AMDGPU::V_MUL_F64_pseudo_e64: {

2214 switch (Val) {

2215 case 0x3fe0000000000000:

2217 case 0x4000000000000000:

2219 case 0x4010000000000000:

2221 default:

2223 }

2224 }

2225 case AMDGPU::V_MUL_F32_e64: {

2226 switch (static_cast<uint32_t>(Val)) {

2227 case 0x3f000000:

2229 case 0x40000000:

2231 case 0x40800000:

2233 default:

2235 }

2236 }

2237 case AMDGPU::V_MUL_F16_e64:

2238 case AMDGPU::V_MUL_F16_t16_e64:

2239 case AMDGPU::V_MUL_F16_fake16_e64: {

2240 switch (static_cast<uint16_t>(Val)) {

2241 case 0x3800:

2243 case 0x4000:

2245 case 0x4400:

2247 default:

2249 }

2250 }

2251 default:

2253 }

2254}

2255

2256

2257

2258

2259std::pair<const MachineOperand *, int>

2260SIFoldOperandsImpl::isOMod(const MachineInstr &MI) const {

2261 unsigned Op = MI.getOpcode();

2262 switch (Op) {

2263 case AMDGPU::V_MUL_F64_e64:

2264 case AMDGPU::V_MUL_F64_pseudo_e64:

2265 case AMDGPU::V_MUL_F32_e64:

2266 case AMDGPU::V_MUL_F16_t16_e64:

2267 case AMDGPU::V_MUL_F16_fake16_e64:

2268 case AMDGPU::V_MUL_F16_e64: {

2269

2270 if ((Op == AMDGPU::V_MUL_F32_e64 &&

2272 ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||

2273 Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||

2274 Op == AMDGPU::V_MUL_F16_fake16_e64) &&

2277 MI.mayRaiseFPException())

2279

2280 const MachineOperand *RegOp = nullptr;

2281 const MachineOperand *ImmOp = nullptr;

2282 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);

2283 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

2284 if (Src0->isImm()) {

2285 ImmOp = Src0;

2286 RegOp = Src1;

2287 } else if (Src1->isImm()) {

2288 ImmOp = Src1;

2289 RegOp = Src0;

2290 } else

2292

2295 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||

2296 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||

2297 TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||

2298 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))

2300

2301 return std::pair(RegOp, OMod);

2302 }

2303 case AMDGPU::V_ADD_F64_e64:

2304 case AMDGPU::V_ADD_F64_pseudo_e64:

2305 case AMDGPU::V_ADD_F32_e64:

2306 case AMDGPU::V_ADD_F16_e64:

2307 case AMDGPU::V_ADD_F16_t16_e64:

2308 case AMDGPU::V_ADD_F16_fake16_e64: {

2309

2310 if ((Op == AMDGPU::V_ADD_F32_e64 &&

2312 ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||

2313 Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||

2314 Op == AMDGPU::V_ADD_F16_fake16_e64) &&

2317

2318

2319 const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);

2320 const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);

2321

2324 TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&

2325 TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&

2326 TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&

2327 TII->hasModifiersSet(MI, AMDGPU::OpName::omod))

2329

2331 }

2332 default:

2334 }

2335}

2336

2337

2338bool SIFoldOperandsImpl::tryFoldOMod(MachineInstr &MI) {

2339 const MachineOperand *RegOp;

2340 int OMod;

2341 std::tie(RegOp, OMod) = isOMod(MI);

2343 RegOp->getSubReg() != AMDGPU::NoSubRegister ||

      !MRI->hasOneNonDBGUser(RegOp->getReg()))
    return false;

2346

2347 MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());

2348 MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);

2350 return false;

2351

2352 if (Def->mayRaiseFPException())

2353 return false;

2354

2355

2356

2357 if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))

2358 return false;

2359

2360 LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);

2361

2362 DefOMod->setImm(OMod);

2363 MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());

2364

2365

2366 MRI->clearKillFlags(Def->getOperand(0).getReg());

2367 MI.eraseFromParent();

2368

2369

2370

2371

2373 Def->eraseFromParent();

2374

2375 return true;

2376}

2377

2378

2379

2380bool SIFoldOperandsImpl::tryFoldRegSequence(MachineInstr &MI) {

2382 auto Reg = MI.getOperand(0).getReg();

2383

2385 MRI->hasOneNonDBGUse(Reg))

2386 return false;

2387

2389 if (!getRegSeqInit(Defs, Reg))

2390 return false;

2391

2392 for (auto &[Op, SubIdx] : Defs) {

2393 if (Op->isReg())

2394 return false;

2395 if (TRI->isAGPR(*MRI, Op->getReg()))

2396 continue;

2397

2398 const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());

2400 return false;

2402 return false;

2403 }

2404

2405 MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);

2406 MachineInstr *UseMI = Op->getParent();

2409 if (TRI->isVGPR(*MRI, Reg) || MRI->hasOneNonDBGUse(Reg))

2410 return false;

2411 Op = &*MRI->use_nodbg_begin(Reg);

2412 UseMI = Op->getParent();

2413 }

2414

2415 if (Op->getSubReg())

2416 return false;

2417

2419 const MCInstrDesc &InstDesc = UseMI->getDesc();

2421 if (!OpRC || TRI->isVectorSuperClass(OpRC))

2422 return false;

2423

2424 const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));

2425 auto Dst = MRI->createVirtualRegister(NewDstRC);

2426 auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),

2427 TII->get(AMDGPU::REG_SEQUENCE), Dst);

2428

2429 for (auto &[Def, SubIdx] : Defs) {

2430 Def->setIsKill(false);

2431 if (TRI->isAGPR(*MRI, Def->getReg())) {

2432 RS.add(*Def);

2433 } else {

2434 MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());

2437 }

2438 RS.addImm(SubIdx);

2439 }

2440

2441 Op->setReg(Dst);

2443 Op->setReg(Reg);

2444 RS->eraseFromParent();

2445 return false;

2446 }

2447

2449

2450

2451

2452 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))

2453 MI.eraseFromParent();

2454 return true;

2455}

2456

2457

2458

2461 Register &OutReg, unsigned &OutSubReg) {

2462 assert(Copy.isCopy());

2463

2467 return false;

2468

2469

2470

2471 if (TRI.isAGPR(MRI, CopySrcReg)) {

2472 OutReg = CopySrcReg;

2473 OutSubReg = CopySrc.getSubReg();

2474 return true;

2475 }

2476

2477

2478

2479

2480 const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);

2481 if (!CopySrcDef || !CopySrcDef->isCopy())

2482 return false;

2483

2486 if (!OtherCopySrcReg.isVirtual() ||

2488 OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||

2489 TRI.isAGPR(MRI, OtherCopySrcReg))

2490 return false;

2491

2492 OutReg = OtherCopySrcReg;

2493 OutSubReg = CopySrc.getSubReg();

2494 return true;

2495}

2496

2497

2498

2499

2500

2501

2502

2503

2504

2505

2506

2507

2508

2509

2510

2511

2512

2513

2514

2515

2516

2517

2518

2519

2520

2521

2522

2523

2524

2525

2526bool SIFoldOperandsImpl::tryFoldPhiAGPR(MachineInstr &PHI) {

2528

2529 Register PhiOut = PHI.getOperand(0).getReg();

  if (!TRI->isVGPR(*MRI, PhiOut))

2531 return false;

2532

2533

2534

2535 const TargetRegisterClass *ARC = nullptr;

2536 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {

2537 MachineOperand &MO = PHI.getOperand(K);

2538 MachineInstr *Copy = MRI->getVRegDef(MO.getReg());

    if (!Copy || !Copy->isCopy())

2540 continue;

2541

2543 unsigned AGPRRegMask = AMDGPU::NoSubRegister;

2545 continue;

2546

2547 const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);

2548 if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))

2549 CopyInRC = SubRC;

2550

2552 return false;

2553 ARC = CopyInRC;

2554 }

2555

2556 if (!ARC)

2557 return false;

2558

2559 bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);

2560

2561

2563 for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {

2564 MachineOperand &MO = PHI.getOperand(K);

2566

2568 MachineBasicBlock *InsertMBB = nullptr;

2569

2570

2571 unsigned CopyOpc = AMDGPU::COPY;

2572 if (MachineInstr *Def = MRI->getVRegDef(Reg)) {

2573

2574

2575

2576 if (Def->isCopy()) {

2578 unsigned AGPRSubReg = AMDGPU::NoSubRegister;

2582 continue;

2583 }

2584

2585

2586

2587

2588

2589

2590

2591 MachineOperand &CopyIn = Def->getOperand(1);

2594 CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;

2595 }

2596

2597 InsertMBB = Def->getParent();

2599 } else {

2600 InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();

2602 }

2603

2604 Register NewReg = MRI->createVirtualRegister(ARC);

2605 MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),

2606 TII->get(CopyOpc), NewReg)

2609

2610 (void)MI;

2612 }

2613

2614

2615 Register NewReg = MRI->createVirtualRegister(ARC);

2616 PHI.getOperand(0).setReg(NewReg);

2617

2618

2619

2622 TII->get(AMDGPU::COPY), PhiOut)

2624

2626 return true;

2627}

2628

2629

2630bool SIFoldOperandsImpl::tryFoldLoad(MachineInstr &MI) {

2633 return false;

2634

2635 MachineOperand &Def = MI.getOperand(0);

  if (!Def.isDef())

2637 return false;

2638

2640

2642 return false;

2643

2647

2648 if (Users.empty())

2649 return false;

2650

2651

  while (!Users.empty()) {

2653 const MachineInstr *I = Users.pop_back_val();

    if (!I->isCopy() && !I->isRegSequence())

2655 return false;

2656 Register DstReg = I->getOperand(0).getReg();

2657

2659 return false;

2660 if (TRI->isAGPR(*MRI, DstReg))

2661 continue;

2663 for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))

2664 Users.push_back(&U);

2665 }

2666

2667 const TargetRegisterClass *RC = MRI->getRegClass(DefReg);

2668 MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));

  if (!TII->isOperandLegal(MI, 0, &Def)) {

2670 MRI->setRegClass(DefReg, RC);

2671 return false;

2672 }

2673

2674 while (!MoveRegs.empty()) {

2676 MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));

2677 }

2678

2680

2681 return true;

2682}

2683

2684

2685

2686

2687

2688

2689

2690

2691

2692

2693

2694

2695

2696

2697

2698

2699

2700

2701

2702

2703

2704

2705

2706

2707

2708

2709

2710

2711

2712

2713

2714

2715

2716bool SIFoldOperandsImpl::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {

2717

2718

2720 return false;

2721

2722

2723 DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>

2724 RegToMO;

2725

2726 for (auto &MI : MBB) {

    if (!MI.isPHI())

2728 break;

2729

    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))

2731 continue;

2732

2733 for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {

2734 MachineOperand &PhiMO = MI.getOperand(K);

2736 continue;

2737 RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);

2738 }

2739 }

2740
2741  // For all (Reg, SubReg) pairs that are used by more than one PHI operand,
2742  // cache the value once in a VGPR.
2743  bool Changed = false;
2744  for (const auto &[Entry, MOs] : RegToMO) {
2745    if (MOs.size() == 1)
2746      continue;
2747
2748    const auto [Reg, SubReg] = Entry;
2749    MachineInstr *Def = MRI->getVRegDef(Reg);
2750    MachineBasicBlock *DefMBB = Def->getParent();
2751
2752    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so the new value
2753    // is not folded back into an AGPR-to-AGPR copy.
2754    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2755    Register TempVGPR =
2756        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2757    MachineInstr *VGPRCopy =
2758        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2759                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2760            .addReg(Reg, /*Flags=*/0, SubReg);
2761
2762    // Copy the VGPR back into a fresh AGPR and use that in all the PHIs.
2763    Register TempAGPR = MRI->createVirtualRegister(ARC);
2764    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2765            TII->get(AMDGPU::COPY), TempAGPR)
2766        .addReg(TempVGPR);
2767
2768    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2769    for (MachineOperand *MO : MOs) {
2770      MO->setReg(TempAGPR);
2771      MO->setSubReg(AMDGPU::NoSubRegister);
2772      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
2773    }
2774
2775    Changed = true;
2776  }
2777
2778  return Changed;
2779}
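// Worked example of the caching (illustrative only; register names and classes
// are invented): if %acc:areg_128 is read through .sub0 by two PHIs,
//
//   %p0:agpr_32 = PHI %acc.sub0:areg_128, %bb.1, ...
//   %p1:agpr_32 = PHI %acc.sub0:areg_128, %bb.1, ...
//
// the pass emits, right after the definition of %acc,
//
//   %tv:vgpr_32 = V_ACCVGPR_READ_B32_e64 %acc.sub0:areg_128
//   %ta:agpr_32 = COPY %tv:vgpr_32
//
// and rewrites both PHI operands to read %ta, so GFX908 pays for a single
// AGPR -> VGPR -> AGPR round trip instead of one per operand.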

2780

2781bool SIFoldOperandsImpl::run(MachineFunction &MF) {
2782  this->MF = &MF;
2783  MRI = &MF.getRegInfo();
2784  ST = &MF.getSubtarget<GCNSubtarget>();
2785  TII = ST->getInstrInfo();
2786  TRI = &TII->getRegisterInfo();
2787  MFI = MF.getInfo<SIMachineFunctionInfo>();
2788
2789  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2790  // correctly handle signed zeros.
2791  //
2792  // FIXME: Also need to check strictfp
2793  bool IsIEEEMode = MFI->getMode().IEEE;
2794  bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2795
2796  bool Changed = false;
2797  for (MachineBasicBlock *MBB : depth_first(&MF)) {
2798    MachineOperand *CurrentKnownM0Val = nullptr;
2799    for (auto &MI : make_early_inc_range(*MBB)) {
2800      Changed |= tryFoldCndMask(MI);
2801
2802      if (tryFoldZeroHighBits(MI)) {
2803        Changed = true;
2804        continue;
2805      }
2806
2807      if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2808        Changed = true;
2809        continue;
2810      }
2811
2812      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2813        Changed = true;
2814        continue;
2815      }
2816
2817      if (MI.mayLoad() && tryFoldLoad(MI)) {
2818        Changed = true;
2819        continue;
2820      }
2821
2822      if (TII->isFoldableCopy(MI)) {
2823        Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2824        continue;
2825      }
2826
2827      // Saw an unknown clobber of m0, so we no longer know what it is.
2828      if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2829        CurrentKnownM0Val = nullptr;
2830
2831      // TODO: Omod might be OK if there is NSZ only on the source
2832      // instruction, and not the omod multiply.
2833      if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2834          !tryFoldOMod(MI))
2835        Changed |= tryFoldClamp(MI);
2836    }
2837
2838    Changed |= tryOptimizeAGPRPhis(*MBB);
2839  }
2840
2841  return Changed;
2842}
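// Design note: the per-instruction folds run inside make_early_inc_range so a
// handler may erase or replace MI without invalidating the walk, and blocks
// are visited in depth_first order, so an instruction's SSA def has typically
// already been visited (and possibly folded) by the time its uses are reached.
// The AGPR PHI caching runs once per block, after all per-instruction folds in
// that block.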

2843

2844PreservedAnalyses SIFoldOperandsPass::run(MachineFunction &MF,
2845                                          MachineFunctionAnalysisManager &MFAM) {
2846  MFPropsModifier _(*this, MF);
2847
2848  bool Changed = SIFoldOperandsImpl().run(MF);
2849  if (!Changed) {
2850    return PreservedAnalyses::all();
2851  }
2852  auto PA = getMachineFunctionPassPreservedAnalyses();
2853  PA.preserveSet<CFGAnalyses>();
2854  return PA;
2855}
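// Typical way to exercise just this pass on MIR (the pass name "si-fold-operands"
// matches DEBUG_TYPE; adjust if it differs in your tree):
//   llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=si-fold-operands -o - input.mir
// or, through the new pass manager entry point above:
//   llc -mtriple=amdgcn -mcpu=gfx908 -passes=si-fold-operands -o - input.mir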
