LLVM: lib/Target/AMDGPU/SIFixSGPRCopies.cpp Source File

// (The LLVM license header, the file's descriptive comment block, and its
// #include directives did not survive extraction and are elided here.)

using namespace llvm;

#define DEBUG_TYPE "si-fix-sgpr-copies"

static cl::opt<bool> EnableM0Merge(
    "amdgpu-enable-merge-m0",
    cl::desc("Merge and hoist M0 initializations"),
    cl::init(true));

namespace {
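
// Cost/benefit record for one VGPR-to-SGPR copy: the SALU chain it feeds
// (SChain) and the counters from which needToBeConvertedToVALU() later
// computes its lowering score.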

class V2SCopyInfo {
public:
  // VGPR to SGPR copy being processed.
  MachineInstr *Copy;
  // All SALU instructions reachable from this copy in the SSA graph.
  SetVector<MachineInstr *> SChain;
  // Number of SGPR to VGPR copies that are used to put the SALU computation
  // results back to VALU.
  unsigned NumSVCopies = 0;

  unsigned Score = 0;
  // Actual count of v_readfirstlane_b32
  // which need to be inserted to keep SChain SALU.
  unsigned NumReadfirstlanes = 0;

  bool NeedToBeConvertedToVALU = false;
  // Unique ID. Used as a key for mapping to keep permanent order.
  unsigned ID;

  // Count of other VGPR to SGPR copies that contribute to the
  // current copy SChain.
  unsigned SiblingPenalty = 0;
  SetVector<unsigned> Siblings;
  V2SCopyInfo() : Copy(nullptr), ID(0){};
  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
      : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void dump() {
    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
           << "\nScore: " << Score << "\n";
  }
#endif
};

class SIFixSGPRCopies {
  MachineDominatorTree *MDT;
  SmallVector<MachineInstr*, 4> SCCCopies;
  SmallVector<MachineInstr*, 4> RegSequences;
  SmallVector<MachineInstr*, 4> PHINodes;
  SmallVector<MachineInstr*, 4> S2VCopies;
  unsigned NextVGPRToSGPRCopyID = 0;
  MapVector<unsigned, V2SCopyInfo> V2SCopies;
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
  DenseSet<MachineInstr *> PHISources;

public:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  SIFixSGPRCopies(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);
  void fixSCCCopies(MachineFunction &MF);
  void prepareRegSequenceAndPHIs(MachineFunction &MF);
  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
  bool needToBeConvertedToVALU(V2SCopyInfo *I);
  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
  void lowerVGPR2SGPRCopies(MachineFunction &MF);
  // Handles copies whose source register is:
  // 1. A physical register
  // 2. An AGPR
  // 3. Defined by an instruction that merely moves an immediate
  bool lowerSpecialCase(MachineInstr &MI, MachineBasicBlock::iterator &I);

  void processPHINode(MachineInstr &MI);

  // Check if MO is an immediate materialized into a VGPR, and if so replace
  // it with an SGPR immediate. The VGPR immediate is also deleted if it does
  // not have any other uses.
  bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
                              MachineBasicBlock *BlockToInsertTo,
                              MachineBasicBlock::iterator PointToInsertTo,
                              const DebugLoc &DL);
};

class SIFixSGPRCopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIFixSGPRCopiesLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    MachineDominatorTree *MDT =
        &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    SIFixSGPRCopies Impl(MDT);
    return Impl.run(MF);
  }

  StringRef getPassName() const override { return "SI Fix SGPR copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
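
// Legacy pass-manager registration boilerplate; the new pass manager entry
// point is SIFixSGPRCopiesPass::run() at the end of this file.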

INITIALIZE_PASS_BEGIN(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIFixSGPRCopiesLegacy, DEBUG_TYPE, "SI Fix SGPR copies",
                    false, false)

char SIFixSGPRCopiesLegacy::ID = 0;

char &llvm::SIFixSGPRCopiesLegacyID = SIFixSGPRCopiesLegacy::ID;

FunctionPass *llvm::createSIFixSGPRCopiesLegacyPass() {
  return new SIFixSGPRCopiesLegacy();
}

static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy, const SIRegisterInfo &TRI,
                  const MachineRegisterInfo &MRI) {
  Register DstReg = Copy.getOperand(0).getReg();
  Register SrcReg = Copy.getOperand(1).getReg();

  const TargetRegisterClass *SrcRC = SrcReg.isVirtual()
                                         ? MRI.getRegClass(SrcReg)
                                         : TRI.getPhysRegBaseClass(SrcReg);

  // We don't really care about the subregister here.
  // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());

  const TargetRegisterClass *DstRC = DstReg.isVirtual()
                                         ? MRI.getRegClass(DstReg)
                                         : TRI.getPhysRegBaseClass(DstReg);

  return std::pair(SrcRC, DstRC);
}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
         TRI.hasVectorRegisters(SrcRC);
}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
                             const TargetRegisterClass *DstRC,
                             const SIRegisterInfo &TRI) {
  return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
         TRI.hasVectorRegisters(DstRC);
}
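
// Turn a VGPR-defining COPY into an SGPR-defining one by retyping its
// destination; legal only when every user of the destination sits in the
// same block, is a real target instruction, and accepts an SGPR for the
// corresponding operand.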

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
                                      const SIRegisterInfo *TRI,
                                      const SIInstrInfo *TII) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  auto &Src = MI.getOperand(1);
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = Src.getReg();
  if (!SrcReg.isVirtual() || !DstReg.isVirtual())
    return false;

  for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
    const auto *UseMI = MO.getParent();
    if (UseMI == &MI)
      continue;
    if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
        UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
      return false;

    unsigned OpIdx = MO.getOperandNo();
    if (OpIdx >= UseMI->getDesc().getNumOperands() ||
        !TII->isOperandLegal(*UseMI, OpIdx, &Src))
      return false;
  }

  MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
  return true;
}

// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
//  SGPRx = ...
//  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
//  VGPRz = COPY SGPRy
//
// ==>
//
//  VGPRx = COPY SGPRx
//  VGPRz = REG_SEQUENCE VGPRx, sub0
//
// This exposes immediate folding opportunities when materializing 64-bit
// immediates.
static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
                                        const SIRegisterInfo *TRI,
                                        const SIInstrInfo *TII,
                                        MachineRegisterInfo &MRI) {
  assert(MI.isRegSequence());

  Register DstReg = MI.getOperand(0).getReg();
  if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
    return false;

  if (!MRI.hasOneUse(DstReg))
    return false;

  MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
  if (!CopyUse.isCopy())
    return false;

  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
  if (CopyUse.getOperand(0).getReg().isPhysical())
    return false;

  const TargetRegisterClass *SrcRC, *DstRC;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);

  if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
    return false;

  if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
    return true;

  // TODO: Could have multiple extracts?
  unsigned SubReg = CopyUse.getOperand(1).getSubReg();
  if (SubReg != AMDGPU::NoSubRegister)
    return false;

  MRI.setRegClass(DstReg, DstRC);

  //  SGPRx = ...
  //  SGPRy = REG_SEQUENCE SGPRx, sub0 ...
  //  VGPRz = COPY SGPRy
  //
  // =>
  //
  //  VGPRx = COPY SGPRx
  //  VGPRz = REG_SEQUENCE VGPRx, sub0

  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
  bool IsAGPR = TRI->isAGPRClass(DstRC);

  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(MRI, MI.getOperand(I));
    assert(TRI->isSGPRClass(SrcRC) &&
           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);

    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);

    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
            TmpReg)
        .add(MI.getOperand(I));

    if (IsAGPR) {
      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
        AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
              TmpAReg)
          .addReg(TmpReg, RegState::Kill);
      TmpReg = TmpAReg;
    }

    MI.getOperand(I).setReg(TmpReg);
  }

  CopyUse.eraseFromParent();
  return true;
}
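
// Illustrative example (not from the source) of the fold checked below:
//   %0:vgpr_32 = V_MOV_B32_e32 42, implicit $exec
//   %1:sgpr_32 = COPY %0
// can be rewritten as
//   %1:sgpr_32 = S_MOV_B32 42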

static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
                                    const MachineInstr *MoveImm,
                                    const SIInstrInfo *TII,
                                    unsigned &SMovOp,
                                    int64_t &Imm) {
  if (Copy->getOpcode() != AMDGPU::COPY)
    return false;

  if (!MoveImm->isMoveImmediate())
    return false;

  const MachineOperand *ImmOp =
      TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
  if (!ImmOp->isImm())
    return false;

  // FIXME: Handle copies with sub-regs.
  if (Copy->getOperand(1).getSubReg())
    return false;

  switch (MoveImm->getOpcode()) {
  default:
    return false;
  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::AV_MOV_B32_IMM_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B32;
    break;
  case AMDGPU::V_MOV_B64_PSEUDO:
    SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
    break;
  }
  Imm = ImmOp->getImm();
  return true;
}

template <typename UnaryPredicate>
bool searchPredecessors(const MachineBasicBlock *MBB,
                        const MachineBasicBlock *CutOff,
                        UnaryPredicate Predicate) {
  if (MBB == CutOff)
    return false;

  DenseSet<const MachineBasicBlock *> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
                                               MBB->pred_end());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (!Visited.insert(MBB).second)
      continue;
    if (MBB == CutOff)
      continue;
    if (Predicate(MBB))
      return true;

    Worklist.append(MBB->pred_begin(), MBB->pred_end());
  }

  return false;
}

// Checks if there is a potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
static bool isReachable(const MachineInstr *From,
                        const MachineInstr *To,
                        const MachineBasicBlock *CutOff,
                        MachineDominatorTree &MDT) {
  if (MDT.dominates(From, To))
    return true;

  const MachineBasicBlock *MBBFrom = From->getParent();
  const MachineBasicBlock *MBBTo = To->getParent();

  // Do predecessors search.
  return searchPredecessors(MBBTo, CutOff, [MBBFrom](
           const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
}

// Return the first non prologue instruction in the block.
static MachineBasicBlock::iterator
getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
  MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
  while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
    ++I;

  return I;
}
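
// Illustrative example (not from the source) of the merge performed below:
// identical '$m0 = S_MOV_B32 -1' initializations in two blocks are combined
// into one instruction hoisted to their nearest common dominator, provided
// no interfering def of the register lies on the paths in between.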

// Hoist and merge identical SGPR initializations into a common predecessor.
// This is intended to combine M0 initializations, but can work with any
// SGPR. A VGPR cannot be processed since we cannot guarantee vector
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
                                   const MachineRegisterInfo &MRI,
                                   const TargetRegisterInfo *TRI,
                                   MachineDominatorTree &MDT,
                                   const TargetInstrInfo *TII) {
  // List of inits by immediate value.
  using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
  InitListMap Inits;
  // List of clobbering instructions.
  SmallVector<MachineInstr*, 8> Clobbers;
  // List of instructions marked for deletion.
  SmallSet<MachineInstr*, 8> MergedInstrs;

  bool Changed = false;

  for (auto &MI : MRI.def_instructions(Reg)) {
    MachineOperand *Imm = nullptr;
    for (auto &MO : MI.operands()) {
      if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
          (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
        Imm = nullptr;
        break;
      }
      if (MO.isImm())
        Imm = &MO;
    }
    if (Imm)
      Inits[Imm->getImm()].push_front(&MI);
    else
      Clobbers.push_back(&MI);
  }

  for (auto &Init : Inits) {
    auto &Defs = Init.second;

    for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
      MachineInstr *MI1 = *I1;

      for (auto I2 = std::next(I1); I2 != E; ) {
        MachineInstr *MI2 = *I2;

        // Check any possible interference
        auto interferes = [&](MachineBasicBlock::iterator From,
                              MachineBasicBlock::iterator To) -> bool {

          assert(MDT.dominates(&*To, &*From));

          auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
            const MachineBasicBlock *MBBFrom = From->getParent();
            const MachineBasicBlock *MBBTo = To->getParent();
            bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
            bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
            if (!MayClobberFrom && !MayClobberTo)
              return false;
            if ((MayClobberFrom && !MayClobberTo) ||
                (!MayClobberFrom && MayClobberTo))
              return true;
            // Both can clobber, this is not an interference only if both are
            // dominated by Clobber and belong to the same block or if Clobber
            // properly dominates To, given that To >> From, so it dominates
            // both and located in a common dominator.
            return !((MBBFrom == MBBTo &&
                      MDT.dominates(Clobber, &*From) &&
                      MDT.dominates(Clobber, &*To)) ||
                     MDT.properlyDominates(Clobber->getParent(), MBBTo));
          };

          return (llvm::any_of(Clobbers, interferes)) ||
                 (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
                    return C.first != Init.first &&
                           llvm::any_of(C.second, interferes);
                  }));
        };

        if (MDT.dominates(MI1, MI2)) {
          if (!interferes(MI2, MI1)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI2->getParent()) << " " << *MI2);
            MergedInstrs.insert(MI2);
            Changed = true;
            ++I2;
            continue;
          }
        } else if (MDT.dominates(MI2, MI1)) {
          if (!interferes(MI1, MI2)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        } else {
          auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
                                                     MI2->getParent());
          if (!MBB) {
            ++I2;
            continue;
          }

          MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
          if (!interferes(MI1, I) && !interferes(MI2, I)) {
            LLVM_DEBUG(dbgs()
                       << "Erasing from "
                       << printMBBReference(*MI1->getParent()) << " " << *MI1
                       << "and moving from "
                       << printMBBReference(*MI2->getParent()) << " to "
                       << printMBBReference(*I->getParent()) << " " << *MI2);
            I->getParent()->splice(I, MI2->getParent(), MI2);
            MergedInstrs.insert(MI1);
            Changed = true;
            ++I1;
            break;
          }
        }
        ++I2;
      }
      ++I1;
    }
  }

  // Remove initializations that were merged into another.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    auto I = Defs.begin();
    while (I != Defs.end()) {
      if (MergedInstrs.count(*I)) {
        (*I)->eraseFromParent();
        I = Defs.erase(I);
      } else
        ++I;
    }
  }

  // Try to schedule SGPR initializations as early as possible in the MBB.
  for (auto &Init : Inits) {
    auto &Defs = Init.second;
    for (auto *MI : Defs) {
      auto *MBB = MI->getParent();
      MachineBasicBlock::iterator BoundaryMI = std::next(MI->getIterator());
      MachineBasicBlock::reverse_iterator B(BoundaryMI);
      // Check if B should actually be a boundary. If not set the previous
      // instruction as the boundary instead.
      if (!TII->isBasicBlockPrologue(*B))
        B++;

      auto R = std::next(MI->getReverseIterator());
      const unsigned Threshold = 50;
      // Search until B or Threshold for a place to insert the initialization.
      for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
        if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
            TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
          break;

      // Move to directly after R.
      if (&*--R != MI)
        MBB->splice(*R, MBB, MI);
    }
  }

  if (Changed)
    MRI.clearKillFlags(Reg);

  return Changed;
}

bool SIFixSGPRCopies::run(MachineFunction &MF) {
  // Only need to run this in SelectionDAG path.
  if (MF.getProperties().hasSelected())
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  SmallVector<MachineInstr *, 8> Relegalize;

  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      default:
        // WMMA instructions may need their operands re-legalized after copy
        // lowering; the remainder of this condition was lost in extraction.
        if (TII->isWMMA(MI) /* && ... (rest of condition not recovered) */)
          Relegalize.push_back(&MI);
        continue;
      case AMDGPU::COPY: {
        const TargetRegisterClass *SrcRC, *DstRC;
        std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
          // Since VGPR to SGPR copies affect VGPR to SGPR copy
          // score and, hence, the lowering decision, let's try to get rid of
          // them as early as possible.
          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
            continue;

          // Collect those not changed to try them after VGPR to SGPR copies
          // lowering as there will be more opportunities.
          S2VCopies.push_back(&MI);
        }
        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
          continue;
        if (lowerSpecialCase(MI, I))
          continue;

        analyzeVGPRToSGPRCopy(&MI);

        break;
      }

      case AMDGPU::WQM:
      case AMDGPU::STRICT_WQM:
      case AMDGPU::SOFT_WQM:
      case AMDGPU::STRICT_WWM:
      case AMDGPU::INSERT_SUBREG:
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE: {
        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
          for (MachineOperand &MO : MI.operands()) {
            if (!MO.isReg() || !MO.getReg().isVirtual())
              continue;
            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
            if (SrcRC == &AMDGPU::VReg_1RegClass)
              continue;

            if (TRI->hasVectorRegisters(SrcRC)) {
              const TargetRegisterClass *DestRC =
                  TRI->getEquivalentSGPRClass(SrcRC);
              Register NewDst = MRI->createVirtualRegister(DestRC);
              MachineBasicBlock *BlockToInsertCopy =
                  MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB()
                             : &MBB;
              MachineBasicBlock::iterator PointToInsertCopy =
                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;

              const DebugLoc &DL = MI.getDebugLoc();
              if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
                                          PointToInsertCopy, DL)) {
                MachineInstr *NewCopy =
                    BuildMI(*BlockToInsertCopy, PointToInsertCopy, DL,
                            TII->get(AMDGPU::COPY), NewDst)
                        .addReg(MO.getReg());
                MO.setReg(NewDst);
                analyzeVGPRToSGPRCopy(NewCopy);
                PHISources.insert(NewCopy);
              }
            }
          }
        }

        if (MI.isPHI())
          PHINodes.push_back(&MI);
        else if (MI.isRegSequence())
          RegSequences.push_back(&MI);

        break;
      }

      case AMDGPU::V_WRITELANE_B32: {
        // Some architectures allow more than one constant bus access without
        // SGPR restriction.
        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
          break;

        // Writelane is special in that it can use SGPR and M0 (which would
        // normally count as using the constant bus twice - but in this case
        // it is allowed since the lane selector doesn't count as a use of the
        // constant bus). However, it is still required to abide by the 1 SGPR
        // rule. Apply a fix here as we might have multiple SGPRs after
        // legalizing VGPRs to SGPRs.
        int Src0Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
        int Src1Idx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
        MachineOperand &Src0 = MI.getOperand(Src0Idx);
        MachineOperand &Src1 = MI.getOperand(Src1Idx);

        // Check to see if the instruction violates the 1 SGPR rule.
        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
             Src0.getReg() != AMDGPU::M0) &&
            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
             Src1.getReg() != AMDGPU::M0)) {

          // Check to see if we can relax the SGPR rule by folding one of the
          // SGPR operands to an immediate: this is possible if the operand
          // is defined by a move-immediate instruction whose value is an
          // inline constant, in which case the operand no longer occupies
          // the constant bus.
          bool Resolved = false;
          for (MachineOperand *MO : {&Src0, &Src1}) {
            if (MO->getReg().isVirtual()) {
              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
              if (DefMI && DefMI->isMoveImmediate()) {
                MachineOperand &Def = DefMI->getOperand(0);
                if (Def.isReg() &&
                    MO->getReg() == Def.getReg() &&
                    MO->getSubReg() == Def.getSubReg()) {
                  MachineOperand &Copied = DefMI->getOperand(1);
                  if (Copied.isImm() &&
                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
                    MO->ChangeToImmediate(Copied.getImm());
                    Resolved = true;
                    break;
                  }
                }
              }
            }
          }

          if (!Resolved) {
            // Haven't managed to resolve by replacing an SGPR with an
            // immediate. Move src1 to be in M0.
            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
                    TII->get(AMDGPU::COPY), AMDGPU::M0)
                .add(Src1);
            Src1.ChangeToRegister(AMDGPU::M0, false);
          }
        }
        break;
      }
      }
    }
  }

  lowerVGPR2SGPRCopies(MF);

  fixSCCCopies(MF);
  for (auto *MI : S2VCopies) {
    // Check if it is still valid.
    if (MI->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
    }
  }
  for (auto *MI : RegSequences) {
    // Check if it is still valid.
    if (MI->isRegSequence())
      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
  }
  for (auto *MI : PHINodes) {
    processPHINode(*MI);
  }
  while (!Relegalize.empty())
    TII->legalizeOperands(*Relegalize.pop_back_val(), MDT);

  if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
    hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);

  SiblingPenalty.clear();
  V2SCopies.clear();
  SCCCopies.clear();
  RegSequences.clear();
  PHINodes.clear();
  S2VCopies.clear();
  PHISources.clear();

  return true;
}
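
// A PHI whose uses all end up in AGPRs can itself be moved to an AGPR class;
// otherwise, a PHI with vector-register inputs is legalized via the usual
// operand-legalization path.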

void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
  bool AllAGPRUses = true;
  SetVector<const MachineInstr *> worklist;
  SmallPtrSet<const MachineInstr *, 4> Visited;
  SetVector<MachineInstr *> PHIOperands;
  worklist.insert(&MI);
  Visited.insert(&MI);
  // Only a PHI that is actually used may be moved to AGPR.
  bool HasUses = false;
  while (!worklist.empty()) {
    const MachineInstr *Instr = worklist.pop_back_val();
    Register Reg = Instr->getOperand(0).getReg();
    for (const auto &Use : MRI->use_operands(Reg)) {
      HasUses = true;
      const MachineInstr *UseMI = Use.getParent();
      AllAGPRUses &= (UseMI->isCopy() &&
                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
                     TRI->isAGPR(*MRI, Use.getReg());
      if (UseMI->isCopy() || UseMI->isRegSequence()) {
        if (Visited.insert(UseMI).second)
          worklist.insert(UseMI);

        continue;
      }
    }
  }

  Register PHIRes = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  if (HasUses && AllAGPRUses && !TRI->isAGPRClass(RC0)) {
    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
      if (DefMI && DefMI->isPHI())
        PHIOperands.insert(DefMI);
    }
  }

  if (TRI->hasVectorRegisters(MRI->getRegClass(PHIRes)) ||
      RC0 == &AMDGPU::VReg_1RegClass) {
    LLVM_DEBUG(dbgs() << "Legalizing " << MI);
    TII->legalizeOperands(MI, MDT);
  }

  // Propagate register class back to PHI operands which are PHI themselves.
  while (!PHIOperands.empty()) {
    processPHINode(*PHIOperands.pop_back_val());
  }
}
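
// Illustrative example (not from the source): a PHI input defined by
//   %0:vgpr_32 = V_MOV_B32_e32 5, implicit $exec
// is rematerialized for the SGPR destination as
//   %1:sgpr_32 = S_MOV_B32 5
// and the V_MOV is erased when %0 has no other uses.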

bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
    MachineOperand &MaybeVGPRConstMO, Register DstReg,
    MachineBasicBlock *BlockToInsertTo,
    MachineBasicBlock::iterator PointToInsertTo, const DebugLoc &DL) {

  MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
  if (!DefMI || !DefMI->isMoveImmediate())
    return false;

  MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
  if (SrcConst->isReg())
    return false;

  const TargetRegisterClass *SrcRC =
      MRI->getRegClass(MaybeVGPRConstMO.getReg());
  unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
  unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
  BuildMI(*BlockToInsertTo, PointToInsertTo, DL, TII->get(MoveOp), DstReg)
      .add(*SrcConst);
  if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
    DefMI->eraseFromParent();
  MaybeVGPRConstMO.setReg(DstReg);
  return true;
}

bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
                                       MachineBasicBlock::iterator &I) {
  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  if (!DstReg.isVirtual()) {
    // If the destination register is a physical register there isn't
    // really much we can do to fix this.
    // Some special instructions use M0 as an input. Some even only use
    // the first lane. Insert a readfirstlane and hope for the best.
    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
    if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
      Register TmpReg =
          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

      const MCInstrDesc &ReadFirstLaneDesc =
          TII->get(AMDGPU::V_READFIRSTLANE_B32);
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), ReadFirstLaneDesc, TmpReg)
          .add(MI.getOperand(1));

      unsigned SubReg = MI.getOperand(1).getSubReg();
      MI.getOperand(1).setReg(TmpReg);
      MI.getOperand(1).setSubReg(AMDGPU::NoSubRegister);

      const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
      const TargetRegisterClass *ConstrainRC =
          SubReg == AMDGPU::NoSubRegister
              ? OpRC
              : TRI->getMatchingSuperRegClass(SrcRC, OpRC, SubReg);

      if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
        llvm_unreachable("Failed to constrain register class");
    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
                                      MI, MI.getDebugLoc())) {
      I = std::next(I);
      MI.eraseFromParent();
    }
    return true;
  }
  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
    SIInstrWorklist worklist;
    worklist.insert(&MI);
    TII->moveToVALU(worklist, MDT);
    return true;
  }

  unsigned SMovOp;
  int64_t Imm;
  // If we are just copying an immediate, we can replace the copy with
  // s_mov_b32.
  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
    MI.getOperand(1).ChangeToImmediate(Imm);
    MI.addImplicitDefUseOperands(*MI.getMF());
    MI.setDesc(TII->get(SMovOp));
    return true;
  }
  return false;
}
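
// Walk the SSA graph from a VGPR-to-SGPR copy, recording every reachable
// SALU instruction in the copy's SChain and noting, via the SiblingPenalty
// map, other V2S copies that feed the same chain.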

void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
  if (PHISources.contains(MI))
    return;
  Register DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);

  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                   TRI->getRegSizeInBits(*DstRC));
  SmallVector<MachineInstr *, 8> AnalysisWorklist;
  // Needed because the SSA is not a tree but a graph and may have
  // forks and joins. We should not then go same way twice.
  DenseSet<MachineInstr *> Visited;
  AnalysisWorklist.push_back(MI);
  while (!AnalysisWorklist.empty()) {

    MachineInstr *Inst = AnalysisWorklist.pop_back_val();

    if (!Visited.insert(Inst).second)
      continue;

    // Copies and REG_SEQUENCE do not contribute to the final assembly, so
    // only account for the SGPR-to-VGPR copies they may require.
    if (Inst->isRegSequence() &&
        TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
      Info.NumSVCopies++;
      continue;
    }
    if (Inst->isCopy()) {
      const TargetRegisterClass *SrcRC, *DstRC;
      std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI);
      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
        Info.NumSVCopies++;
        continue;
      }
    }

    SiblingPenalty[Inst].insert(Info.ID);

    SmallVector<MachineInstr *, 4> Users;
    if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
        (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
      auto I = Inst->getIterator();
      auto E = Inst->getParent()->end();
      while (++I != E &&
             !I->findRegisterDefOperand(AMDGPU::SCC, nullptr)) {
        if (I->readsRegister(AMDGPU::SCC, nullptr))
          Users.push_back(&*I);
      }
    } else if (Inst->getNumExplicitDefs() != 0) {
      Register Reg = Inst->getOperand(0).getReg();
      if (TRI->isSGPRReg(*MRI, Reg) && !TII->isVALU(*Inst)) {
        for (auto &U : MRI->use_instructions(Reg))
          Users.push_back(&U);
      }
    }
    for (auto *U : Users) {
      if (TII->isSALU(*U))
        Info.SChain.insert(U);
      AnalysisWorklist.push_back(U);
    }
  }
  V2SCopies[Info.ID] = Info;
}

// The main function that computes the VGPR to SGPR copy score
// and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU.
bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
  if (Info->SChain.empty()) {
    Info->Score = 0;
    return true;
  }
  Info->Siblings = SiblingPenalty[*llvm::max_element(
      Info->SChain, [&](MachineInstr *A, MachineInstr *B) -> bool {
        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
      })];
  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });

  // The loop below computes the number of other VGPR to SGPR copies
  // which contribute to the current copy SALU chain. We assume that all the
  // copies with the same source virtual register will be squashed to one
  // by regalloc. We also take care of the copies of the different subregs
  // of the same register.
  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
  for (auto J : Info->Siblings) {
    auto *InfoIt = V2SCopies.find(J);
    if (InfoIt != V2SCopies.end()) {
      MachineInstr *SiblingCopy = InfoIt->second.Copy;
      if (SiblingCopy->isImplicitDef())
        // The COPY has already been MoveToVALUed.
        continue;

      SrcRegs.insert(std::pair(SiblingCopy->getOperand(1).getReg(),
                               SiblingCopy->getOperand(1).getSubReg()));
    }
  }
  Info->SiblingPenalty = SrcRegs.size();

  unsigned Penalty =
      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
  unsigned Profit = Info->SChain.size();
  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
  Info->NeedToBeConvertedToVALU = Info->Score < 3;
  return Info->NeedToBeConvertedToVALU;
}
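
// Scoring recap: Score = max(0, |SChain| - (NumSVCopies + SiblingPenalty +
// NumReadfirstlanes)); copies scoring below 3 are sent to moveToVALU, the
// rest are lowered to v_readfirstlane_b32 below.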

void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : V2SCopies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  // Store all the V2S copy instructions that need to be moved to VALU
  // in this worklist.
  SIInstrWorklist Copies;

  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto *CurInfoIt = V2SCopies.find(CurID);
    if (CurInfoIt != V2SCopies.end()) {
      V2SCopyInfo C = CurInfoIt->second;
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto *SibInfoIt = V2SCopies.find(S);
        if (SibInfoIt != V2SCopies.end()) {
          V2SCopyInfo &SI = SibInfoIt->second;
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            SI.SChain.set_subtract(C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      // TODO: MapVector::erase is inefficient. Do bulk removal with
      // remove_if instead.
      V2SCopies.erase(C.ID);
      Copies.insert(C.Copy);
    }
  }

  TII->moveToVALU(Copies, MDT);
  Copies.clear();

  // Now do actual lowering.
  for (auto C : V2SCopies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We decide to turn V2S copy to v_readfirstlane_b32,
    // remove it from the V2SCopies and remove it from all its siblings.
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    MRI->constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);

    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC =
        TRI->getRegClassForOperandReg(*MRI, MI->getOperand(1));
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
             "We do not expect to see 16-bit copies from VGPR to SGPR unless "
             "we have 16-bit VGPRs");
      assert(MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
             MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
      // There is no V_READFIRSTLANE_B16, so legalize the dst/src to 32 bits.
      MRI->setRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
      Register VReg32 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      const DebugLoc &DL = MI->getDebugLoc();
      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_16RegClass);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), VReg32)
          .addReg(SrcReg, 0, SubReg)
          .addImm(AMDGPU::lo16)
          .addReg(Undef)
          .addImm(AMDGPU::hi16);
      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
          .addReg(VReg32);
    } else if (SrcSize == 32) {
      const MCInstrDesc &ReadFirstLaneDesc =
          TII->get(AMDGPU::V_READFIRSTLANE_B32);
      const TargetRegisterClass *OpRC = TII->getRegClass(ReadFirstLaneDesc, 1);
      BuildMI(*MBB, MI, MI->getDebugLoc(), ReadFirstLaneDesc, DstReg)
          .addReg(SrcReg, 0, SubReg);

      const TargetRegisterClass *ConstrainRC =
          SubReg == AMDGPU::NoSubRegister
              ? OpRC
              : TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), OpRC,
                                              SubReg);

      if (!MRI->constrainRegClass(SrcReg, ConstrainRC))
        llvm_unreachable("Failed to constrain register class");
    } else {
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}
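
// Illustrative lowering (not from the source; wave64 opcodes shown):
//   %0:sgpr_32 = COPY $scc   ==>   %t:sreg_64 = S_CSELECT_B64 -1, 0
//                                  %0 = COPY %t
//   $scc = COPY %1           ==>   %t:sreg_64 = S_AND_B64 $exec, %1
// where S_AND_B64 defines SCC as a side effect.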

void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) {
  const AMDGPU::LaneMaskConstants &LMC =
      AMDGPU::LaneMaskConstants::get(MF.getSubtarget<GCNSubtarget>());
  for (MachineBasicBlock &MBB : MF) {
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         ++I) {
      MachineInstr &MI = *I;

      if (!MI.isCopy())
        continue;
      Register SrcReg = MI.getOperand(1).getReg();
      Register DstReg = MI.getOperand(0).getReg();
      if (SrcReg == AMDGPU::SCC) {
        Register SCCCopy =
            MRI->createVirtualRegister(TRI->getWaveMaskRegClass());
        I = BuildMI(*MI.getParent(),
                    std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(), TII->get(LMC.CSelectOpc), SCCCopy)
                .addImm(-1)
                .addImm(0);
        I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
                    TII->get(AMDGPU::COPY), DstReg)
                .addReg(SCCCopy);
        MI.eraseFromParent();
        continue;
      }
      if (DstReg == AMDGPU::SCC) {
        Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
        I = BuildMI(*MI.getParent(),
                    std::next(MachineBasicBlock::iterator(MI)),
                    MI.getDebugLoc(), TII->get(LMC.AndOpc), Tmp)
                .addReg(LMC.ExecReg)
                .addReg(SrcReg);
        MI.eraseFromParent();
      }
    }
  }
}

PreservedAnalyses
SIFixSGPRCopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  SIFixSGPRCopies Impl(&MDT);
  bool Changed = Impl.run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: We could detect CFG changed.
  auto PA = getMachineFunctionPassPreservedAnalyses();
  return PA;
}
