LLVM: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp Source File

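// This pass merges adjacent DS, SMEM, buffer, image and FLAT/GLOBAL loads and
// stores into wider combined instructions (e.g. ds_read2_b32, s_load_dwordx2)
// and promotes constant address components into immediate offsets.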

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  S_BUFFER_LOAD_SGPR_IMM,
  S_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
  GLOBAL_LOAD_SADDR,
  GLOBAL_STORE_SADDR,
  FLAT_LOAD,
  FLAT_STORE,
  FLAT_LOAD_SADDR,
  FLAT_STORE_SADDR,
  GLOBAL_LOAD,
  GLOBAL_STORE
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool SAddr = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

const unsigned MaxAddressRegs = 12 + 1 + 1;

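// CombineInfo describes one candidate memory instruction: its class, element
// size, width in dwords, offset, dmask/format, cache policy and the indices
// of the address operands that must match for two instructions to be paired.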
class SILoadStoreOptimizer {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    const TargetRegisterClass *DataRC;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];

        if (AddrOp->isImm())
          continue;

        if (!AddrOp->isReg())
          return false;

        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);

    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };

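  // BaseRegisters/MemAddress describe a 64-bit address split into low/high
  // base registers (with optional subregisters) plus a constant byte offset,
  // used when promoting constant offsets into the instruction encoding.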
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore,
                      const DebugLoc &DL, AMDGPU::OpName OpName,
                      Register DestReg) const;
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                           MachineBasicBlock::iterator InsertBefore,
                           const DebugLoc &DL, AMDGPU::OpName OpName) const;

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  unsigned getWrite2Opcode(const CombineInfo &CI) const;

  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;

  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &AnchorList) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

public:
  SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
  bool run(MachineFunction &MF);
};

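// Legacy pass-manager wrapper that runs SILoadStoreOptimizer on a machine
// function and declares its alias-analysis dependency.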
class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().setIsSSA();
  }
};

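// Returns the number of dwords read or written by the instruction's data
// operand, derived from its opcode (or from the dmask for image ops).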
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {

354 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

355 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

356 case AMDGPU::S_LOAD_DWORD_IMM:

357 case AMDGPU::GLOBAL_LOAD_DWORD:

358 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

359 case AMDGPU::GLOBAL_STORE_DWORD:

360 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

361 case AMDGPU::FLAT_LOAD_DWORD:

362 case AMDGPU::FLAT_STORE_DWORD:

363 case AMDGPU::FLAT_LOAD_DWORD_SADDR:

364 case AMDGPU::FLAT_STORE_DWORD_SADDR:

365 return 1;

366 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

367 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

368 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

369 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

370 case AMDGPU::S_LOAD_DWORDX2_IMM:

371 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

372 case AMDGPU::GLOBAL_LOAD_DWORDX2:

373 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

374 case AMDGPU::GLOBAL_STORE_DWORDX2:

375 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

376 case AMDGPU::FLAT_LOAD_DWORDX2:

377 case AMDGPU::FLAT_STORE_DWORDX2:

378 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:

379 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

380 return 2;

381 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

382 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

383 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

384 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

385 case AMDGPU::S_LOAD_DWORDX3_IMM:

386 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

387 case AMDGPU::GLOBAL_LOAD_DWORDX3:

388 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

389 case AMDGPU::GLOBAL_STORE_DWORDX3:

390 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

391 case AMDGPU::FLAT_LOAD_DWORDX3:

392 case AMDGPU::FLAT_STORE_DWORDX3:

393 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:

394 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

395 return 3;

396 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

397 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

398 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

399 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

400 case AMDGPU::S_LOAD_DWORDX4_IMM:

401 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

402 case AMDGPU::GLOBAL_LOAD_DWORDX4:

403 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

404 case AMDGPU::GLOBAL_STORE_DWORDX4:

405 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

406 case AMDGPU::FLAT_LOAD_DWORDX4:

407 case AMDGPU::FLAT_STORE_DWORDX4:

408 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:

409 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

410 return 4;

411 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

412 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

413 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

414 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

415 case AMDGPU::S_LOAD_DWORDX8_IMM:

416 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

417 return 8;

418 case AMDGPU::DS_READ_B32:

419 case AMDGPU::DS_READ_B32_gfx9:

420 case AMDGPU::DS_WRITE_B32:

421 case AMDGPU::DS_WRITE_B32_gfx9:

422 return 1;

423 case AMDGPU::DS_READ_B64:

424 case AMDGPU::DS_READ_B64_gfx9:

425 case AMDGPU::DS_WRITE_B64:

426 case AMDGPU::DS_WRITE_B64_gfx9:

427 return 2;

428 default:

429 return 0;

430 }

431}

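// Classifies an opcode into the InstClassEnum bucket used to decide which
// instructions are candidates for merging with each other.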
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;

441 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:

442 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:

443 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:

444 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:

445 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:

446 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:

447 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:

448 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:

449 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:

450 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:

451 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:

452 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:

453 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:

454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:

455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:

456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

457 return BUFFER_LOAD;

458 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:

459 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:

460 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:

461 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:

462 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:

463 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:

464 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:

465 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

466 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:

467 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:

468 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:

469 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:

470 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:

471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:

472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:

473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

474 return BUFFER_STORE;

475 }

    }
    if (TII.isImage(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
          !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;

495 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:

496 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:

497 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:

498 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:

499 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:

500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:

501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:

502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:

503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:

504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:

505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:

506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:

507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:

508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:

509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:

510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

511 return TBUFFER_LOAD;

512 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:

513 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:

514 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:

515 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:

516 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:

517 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:

518 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:

519 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:

520 return TBUFFER_STORE;

521 }

522 }

523 return UNKNOWN;

524 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

525 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

526 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

527 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

528 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

529 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

530 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

531 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

532 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

533 return S_BUFFER_LOAD_IMM;

534 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

535 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

536 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

537 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

538 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

539 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

540 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

541 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

542 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

543 return S_BUFFER_LOAD_SGPR_IMM;

544 case AMDGPU::S_LOAD_DWORD_IMM:

545 case AMDGPU::S_LOAD_DWORDX2_IMM:

546 case AMDGPU::S_LOAD_DWORDX3_IMM:

547 case AMDGPU::S_LOAD_DWORDX4_IMM:

548 case AMDGPU::S_LOAD_DWORDX8_IMM:

549 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

550 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

551 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

552 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

553 return S_LOAD_IMM;

554 case AMDGPU::DS_READ_B32:

555 case AMDGPU::DS_READ_B32_gfx9:

556 case AMDGPU::DS_READ_B64:

557 case AMDGPU::DS_READ_B64_gfx9:

558 return DS_READ;

559 case AMDGPU::DS_WRITE_B32:

560 case AMDGPU::DS_WRITE_B32_gfx9:

561 case AMDGPU::DS_WRITE_B64:

562 case AMDGPU::DS_WRITE_B64_gfx9:

563 return DS_WRITE;

564 case AMDGPU::GLOBAL_LOAD_DWORD:

565 case AMDGPU::GLOBAL_LOAD_DWORDX2:

566 case AMDGPU::GLOBAL_LOAD_DWORDX3:

567 case AMDGPU::GLOBAL_LOAD_DWORDX4:

568 case AMDGPU::FLAT_LOAD_DWORD:

569 case AMDGPU::FLAT_LOAD_DWORDX2:

570 case AMDGPU::FLAT_LOAD_DWORDX3:

571 case AMDGPU::FLAT_LOAD_DWORDX4:

572 return FLAT_LOAD;

573 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

574 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

575 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

576 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

577 return GLOBAL_LOAD_SADDR;

578 case AMDGPU::GLOBAL_STORE_DWORD:

579 case AMDGPU::GLOBAL_STORE_DWORDX2:

580 case AMDGPU::GLOBAL_STORE_DWORDX3:

581 case AMDGPU::GLOBAL_STORE_DWORDX4:

582 case AMDGPU::FLAT_STORE_DWORD:

583 case AMDGPU::FLAT_STORE_DWORDX2:

584 case AMDGPU::FLAT_STORE_DWORDX3:

585 case AMDGPU::FLAT_STORE_DWORDX4:

586 return FLAT_STORE;

587 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

588 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

589 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

590 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

591 return GLOBAL_STORE_SADDR;

592 case AMDGPU::FLAT_LOAD_DWORD_SADDR:

593 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:

594 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:

595 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:

596 return FLAT_LOAD_SADDR;

597 case AMDGPU::FLAT_STORE_DWORD_SADDR:

598 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

599 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

600 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

601 return FLAT_STORE_SADDR;

602 }

603}

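// Returns a "subclass" key for an opcode; two instructions may only be merged
// when their subclass keys match (e.g. the same MUBUF/MTBUF base opcode, the
// same MIMG base opcode, or the same single-dword scalar/flat opcode).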
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;

621 case AMDGPU::DS_READ_B32:

622 case AMDGPU::DS_READ_B32_gfx9:

623 case AMDGPU::DS_READ_B64:

624 case AMDGPU::DS_READ_B64_gfx9:

625 case AMDGPU::DS_WRITE_B32:

626 case AMDGPU::DS_WRITE_B32_gfx9:

627 case AMDGPU::DS_WRITE_B64:

628 case AMDGPU::DS_WRITE_B64_gfx9:

629 return Opc;

630 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

631 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

632 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

633 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

634 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

635 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

636 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

637 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

638 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

639 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;

640 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

641 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

642 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

643 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

644 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

645 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

646 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

647 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

648 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

649 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;

650 case AMDGPU::S_LOAD_DWORD_IMM:

651 case AMDGPU::S_LOAD_DWORDX2_IMM:

652 case AMDGPU::S_LOAD_DWORDX3_IMM:

653 case AMDGPU::S_LOAD_DWORDX4_IMM:

654 case AMDGPU::S_LOAD_DWORDX8_IMM:

655 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

656 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

657 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

658 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

659 return AMDGPU::S_LOAD_DWORD_IMM;

660 case AMDGPU::GLOBAL_LOAD_DWORD:

661 case AMDGPU::GLOBAL_LOAD_DWORDX2:

662 case AMDGPU::GLOBAL_LOAD_DWORDX3:

663 case AMDGPU::GLOBAL_LOAD_DWORDX4:

664 case AMDGPU::FLAT_LOAD_DWORD:

665 case AMDGPU::FLAT_LOAD_DWORDX2:

666 case AMDGPU::FLAT_LOAD_DWORDX3:

667 case AMDGPU::FLAT_LOAD_DWORDX4:

668 return AMDGPU::FLAT_LOAD_DWORD;

669 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

670 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

671 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

672 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

673 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;

674 case AMDGPU::GLOBAL_STORE_DWORD:

675 case AMDGPU::GLOBAL_STORE_DWORDX2:

676 case AMDGPU::GLOBAL_STORE_DWORDX3:

677 case AMDGPU::GLOBAL_STORE_DWORDX4:

678 case AMDGPU::FLAT_STORE_DWORD:

679 case AMDGPU::FLAT_STORE_DWORDX2:

680 case AMDGPU::FLAT_STORE_DWORDX3:

681 case AMDGPU::FLAT_STORE_DWORDX4:

682 return AMDGPU::FLAT_STORE_DWORD;

683 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

684 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

685 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

686 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

687 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;

688 case AMDGPU::FLAT_LOAD_DWORD_SADDR:

689 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:

690 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:

691 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:

692 return AMDGPU::FLAT_LOAD_DWORD_SADDR;

693 case AMDGPU::FLAT_STORE_DWORD_SADDR:

694 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

695 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

696 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

697 return AMDGPU::FLAT_STORE_DWORD_SADDR;

698 }

699}

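// FLAT loads/stores that are known to access global memory on both sides of
// the pair are treated as GLOBAL for opcode selection; otherwise the common
// class is simply CI's class.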
InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}

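// Reports which address operands (vaddr, srsrc, soffset, sbase, saddr, addr,
// ssamp) an opcode carries, so the pass knows which operands to compare when
// checking for a common base address.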
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

732

733 if (TII.isImage(Opc)) {

734 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);

735 if (VAddr0Idx >= 0) {

736 AMDGPU::OpName RsrcName =

737 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;

738 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);

739 Result.NumVAddrs = RsrcIdx - VAddr0Idx;

740 } else {

741 Result.VAddr = true;

742 }

743 Result.SRsrc = true;

746 Result.SSamp = true;

747

748 return Result;

749 }

750 if (TII.isMTBUF(Opc)) {

752 Result.VAddr = true;

754 Result.SRsrc = true;

756 Result.SOffset = true;

757

758 return Result;

759 }

760

761 switch (Opc) {

762 default:

763 return Result;

764 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

765 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

766 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

767 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

768 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

769 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

770 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

771 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

772 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

773 Result.SOffset = true;

774 [[fallthrough]];

775 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

776 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

777 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

778 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

779 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

780 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

781 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

782 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

783 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

784 case AMDGPU::S_LOAD_DWORD_IMM:

785 case AMDGPU::S_LOAD_DWORDX2_IMM:

786 case AMDGPU::S_LOAD_DWORDX3_IMM:

787 case AMDGPU::S_LOAD_DWORDX4_IMM:

788 case AMDGPU::S_LOAD_DWORDX8_IMM:

789 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

790 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

791 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

792 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

793 Result.SBase = true;

794 return Result;

795 case AMDGPU::DS_READ_B32:

796 case AMDGPU::DS_READ_B64:

797 case AMDGPU::DS_READ_B32_gfx9:

798 case AMDGPU::DS_READ_B64_gfx9:

799 case AMDGPU::DS_WRITE_B32:

800 case AMDGPU::DS_WRITE_B64:

801 case AMDGPU::DS_WRITE_B32_gfx9:

802 case AMDGPU::DS_WRITE_B64_gfx9:

803 Result.Addr = true;

804 return Result;

805 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

806 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

807 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

808 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

809 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

810 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

811 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

812 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

813 case AMDGPU::FLAT_LOAD_DWORD_SADDR:

814 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:

815 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:

816 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:

817 case AMDGPU::FLAT_STORE_DWORD_SADDR:

818 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

819 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

820 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

821 Result.SAddr = true;

822 [[fallthrough]];

823 case AMDGPU::GLOBAL_LOAD_DWORD:

824 case AMDGPU::GLOBAL_LOAD_DWORDX2:

825 case AMDGPU::GLOBAL_LOAD_DWORDX3:

826 case AMDGPU::GLOBAL_LOAD_DWORDX4:

827 case AMDGPU::GLOBAL_STORE_DWORD:

828 case AMDGPU::GLOBAL_STORE_DWORDX2:

829 case AMDGPU::GLOBAL_STORE_DWORDX3:

830 case AMDGPU::GLOBAL_STORE_DWORDX4:

831 case AMDGPU::FLAT_LOAD_DWORD:

832 case AMDGPU::FLAT_LOAD_DWORDX2:

833 case AMDGPU::FLAT_LOAD_DWORDX3:

834 case AMDGPU::FLAT_LOAD_DWORDX4:

835 case AMDGPU::FLAT_STORE_DWORD:

836 case AMDGPU::FLAT_STORE_DWORDX2:

837 case AMDGPU::FLAT_STORE_DWORDX3:

838 case AMDGPU::FLAT_STORE_DWORDX4:

839 Result.VAddr = true;

840 return Result;

841 }

842}

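// Fills in a CombineInfo from a machine instruction: classifies it and
// extracts element size, width, offset, dmask/format, cache policy and the
// indices of its address operands.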
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  DataRC = LSO.getDataRegClass(*MI);

854

855 switch (InstClass) {

856 case DS_READ:

857 EltSize =

858 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8

859 : 4;

860 break;

861 case DS_WRITE:

862 EltSize =

863 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

864 : 4;

865 break;

866 case S_BUFFER_LOAD_IMM:

867 case S_BUFFER_LOAD_SGPR_IMM:

868 case S_LOAD_IMM:

870 break;

871 default:

872 EltSize = 4;

873 break;

874 }

875

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {

881 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);

882 Offset = I->getOperand(OffsetIdx).getImm();

883 }

884

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
    const AMDGPU::GcnBufferFormatInfo *Info =
        AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
    EltSize = Info->BitsPerComp / 8;
  }

891

892 Width = getOpcodeWidth(*I, *LSO.TII);

893

894 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {

896 } else if (InstClass != MIMG) {

898 }

899

900 AddressRegs Regs = getRegs(Opc, *LSO.TII);

901 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

902

903 NumAddresses = 0;

904 for (unsigned J = 0; J < Regs.NumVAddrs; J++)

905 AddrIdx[NumAddresses++] =

906 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;

907 if (Regs.Addr)

908 AddrIdx[NumAddresses++] =

909 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);

910 if (Regs.SBase)

911 AddrIdx[NumAddresses++] =

912 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);

913 if (Regs.SRsrc)

914 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(

915 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);

916 if (Regs.SOffset)

917 AddrIdx[NumAddresses++] =

918 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);

919 if (Regs.SAddr)

920 AddrIdx[NumAddresses++] =

921 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

922 if (Regs.VAddr)

923 AddrIdx[NumAddresses++] =

924 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

925 if (Regs.SSamp)

926 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(

927 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);

928 assert(NumAddresses <= MaxAddressRegs);

929

930 for (unsigned J = 0; J < NumAddresses; J++)

931 AddrReg[J] = &I->getOperand(AddrIdx[J]);

932}

933

934}

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
  return new SILoadStoreOptimizerLegacy();
}

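// Records every register defined or read by MI, used to detect conflicts when
// moving instructions past each other.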
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

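// Returns true if A and B can be reordered: they must not be a potentially
// aliasing store/load pair, and B must not touch registers defined by A or
// overwrite registers that A reads.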
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {

966 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&

967 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))

968 return false;

969 for (const auto &BOp : B.operands()) {

970 if (!BOp.isReg())

971 continue;

972 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))

973 return false;

974 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))

975 return false;

976 }

977 return true;

978}

979

980

981

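// Builds a single machine memory operand covering both known-adjacent
// accesses, based on the leading access's pointer.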
982MachineMemOperand *

983SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,

984 const CombineInfo &Paired) {

985 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();

986 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

987

989

990

991

992 if (Paired < CI)

994

996

999

1000 MachineFunction *MF = CI.I->getMF();

1002}

1003

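// Two image instructions can be merged only if tfe/lwe are clear, their
// descriptor-like operands (cpol, d16, unorm, da, r128, a16) match, and the
// smaller dmask fits entirely below the lowest set bit of the larger dmask.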
1004bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,

1005 const SIInstrInfo &TII,

1006 const CombineInfo &Paired) {

1007 assert(CI.InstClass == MIMG);

1008

1009

1010 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);

1011 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

1012

1013 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

1014 return false;

1015

1016

1017 AMDGPU::OpName OperandsToMatch[] = {

1018 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,

1019 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};

1020

1021 for (AMDGPU::OpName op : OperandsToMatch) {

1022 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);

1023 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)

1024 return false;

1025 if (Idx != -1 &&

1026 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

1027 return false;

1028 }

1029

1030

1031 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);

1032 unsigned MinMask = std::min(CI.DMask, Paired.DMask);

1033

1034 if (!MaxMask)

1035 return false;

1036

1038 if ((1u << AllowedBitsForMin) <= MinMask)

1039 return false;

1040

1041 return true;

1042}

1043

1045 unsigned ComponentCount,

1047 if (ComponentCount > 4)

1048 return 0;

1049

1052 if (!OldFormatInfo)

1053 return 0;

1054

1057 ComponentCount,

1059

1060 if (!NewFormatInfo)

1061 return 0;

1062

1065

1066 return NewFormatInfo->Format;

1067}

1068

1069

1070

1071

1072

1073

1074

1078

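// Decides whether two offsets can be encoded in one merged instruction; for
// DS instructions, when Modify is set, it also rewrites the offsets into the
// read2/write2 (optionally ST64 or base-offset) form.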
1079bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

1080 const GCNSubtarget &STI,

1081 CombineInfo &Paired,

1082 bool Modify) {

1083 assert(CI.InstClass != MIMG);

1084

1085

1086

1087 if (CI.Offset == Paired.Offset)

1088 return false;

1089

1090

1091 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

1092 return false;

1093

1094 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

1095

1096 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =

1098 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =

1100

1103 return false;

1104

1105

1106

1107

1108

1109

1110 unsigned NumCombinedComponents = CI.Width + Paired.Width;

1111 if (NumCombinedComponents == 3 && CI.EltSize <= 2)

1112 NumCombinedComponents = 4;

1113

1115 0)

1116 return false;

1117

1118

1119

1120 unsigned ElemIndex0 = CI.Offset / CI.EltSize;

1121 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;

1122 if (ElemIndex0 + CI.Width != ElemIndex1 &&

1123 ElemIndex1 + Paired.Width != ElemIndex0)

1124 return false;

1125

1126

1127

1128

1129 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;

1130 unsigned RequiredAlign = std::min(MergedBytes, 4u);

1131 unsigned MinOff = std::min(CI.Offset, Paired.Offset);

1132 if (MinOff % RequiredAlign != 0)

1133 return false;

1134

1135 return true;

1136 }

1137

1138 uint32_t EltOffset0 = CI.Offset / CI.EltSize;

1139 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

1140 CI.UseST64 = false;

1141 CI.BaseOff = 0;

1142

1143

1144 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {

1145 if (EltOffset0 + CI.Width != EltOffset1 &&

1146 EltOffset1 + Paired.Width != EltOffset0)

1147 return false;

1148

1149

1151 return false;

1152 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||

1153 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {

1154

1155

1156

1157

1158

1159 if (CI.Width != Paired.Width &&

1160 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

1161 return false;

1162 }

1163 return true;

1164 }

1165

1166

1167

1168 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&

1170 if (Modify) {

1171 CI.Offset = EltOffset0 / 64;

1172 Paired.Offset = EltOffset1 / 64;

1173 CI.UseST64 = true;

1174 }

1175 return true;

1176 }

1177

1178

1180 if (Modify) {

1181 CI.Offset = EltOffset0;

1182 Paired.Offset = EltOffset1;

1183 }

1184 return true;

1185 }

1186

1187

1188 uint32_t Min = std::min(EltOffset0, EltOffset1);

1189 uint32_t Max = std::max(EltOffset0, EltOffset1);

1190

1192 if (((Max - Min) & ~Mask) == 0) {

1193 if (Modify) {

1194

1195

1196

1198

1199

1201 CI.BaseOff = BaseOff * CI.EltSize;

1202 CI.Offset = (EltOffset0 - BaseOff) / 64;

1203 Paired.Offset = (EltOffset1 - BaseOff) / 64;

1204 CI.UseST64 = true;

1205 }

1206 return true;

1207 }

1208

1210 if (Modify) {

1211

1212

1213

1215 CI.BaseOff = BaseOff * CI.EltSize;

1216 CI.Offset = EltOffset0 - BaseOff;

1217 Paired.Offset = EltOffset1 - BaseOff;

1218 }

1219 return true;

1220 }

1221

1222 return false;

1223}

1224

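// Checks that the combined width is encodable for the instruction class
// (e.g. 2/4/8 dwords for scalar loads, with 3 only on capable subtargets).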
1225bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,

1226 const CombineInfo &CI,

1227 const CombineInfo &Paired) {

1228 const unsigned Width = (CI.Width + Paired.Width);

1229 switch (CI.InstClass) {

  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));

1232 case S_BUFFER_LOAD_IMM:

1233 case S_BUFFER_LOAD_SGPR_IMM:

1234 case S_LOAD_IMM:

1235 switch (Width) {

1236 default:

1237 return false;

1238 case 2:

1239 case 4:

1240 case 8:

1241 return true;

    case 3:
      return STM.hasScalarDwordx3Loads();

1244 }

1245 }

1246}

1247

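// Returns the register class of the instruction's data operand
// (vdst/vdata/data0/sdst/sdata).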
1248const TargetRegisterClass *

1249SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {

1250 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {

1251 return TRI->getRegClassForReg(*MRI, Dst->getReg());

1252 }

1253 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {

1254 return TRI->getRegClassForReg(*MRI, Src->getReg());

1255 }

1256 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {

1257 return TRI->getRegClassForReg(*MRI, Src->getReg());

1258 }

1259 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {

1260 return TRI->getRegClassForReg(*MRI, Dst->getReg());

1261 }

1262 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {

1263 return TRI->getRegClassForReg(*MRI, Src->getReg());

1264 }

1265 return nullptr;

1266}

1267

1268

1269

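// Verifies that CI and Paired can actually be merged (same subclass,
// compatible dmasks or offsets/widths, no conflicting instructions between
// them) and returns the CombineInfo at whose position the merged instruction
// should be inserted, or nullptr on failure.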
1270SILoadStoreOptimizer::CombineInfo *

1271SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,

1272 CombineInfo &Paired) {

1273

1274

1275 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

1276 return nullptr;

1277 assert(CI.InstClass == Paired.InstClass);

1278

1279 if (getInstSubclass(CI.I->getOpcode(), *TII) !=

1280 getInstSubclass(Paired.I->getOpcode(), *TII))

1281 return nullptr;

1282

1283

1284

1285 if (CI.InstClass == MIMG) {

1286 if (!dmasksCanBeCombined(CI, *TII, Paired))

1287 return nullptr;

1288 } else {

1289 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

1290 return nullptr;

1291 }

1292

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;

1295 CombineInfo *Where;

1296 if (CI.I->mayLoad()) {

1297

1300 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

1301 return nullptr;

1302 }

1303 Where = &CI;

1304 } else {

1305

1308 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

1309 return nullptr;

1310 }

1311 Where = &Paired;

1312 }

1313

1314

1315

1316

1317

1318 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)

1319 offsetsCanBeCombined(CI, *STM, Paired, true);

1320

1321 if (CI.InstClass == DS_WRITE) {

1322

1323

1324

1325

1326

1327

1328

1329 const MachineOperand *Data0 =

1330 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);

1331 const MachineOperand *Data1 =

1332 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

1333

1334 const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));

1335 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),

1336 AMDGPU::OpName::data0);

1337 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),

1338 AMDGPU::OpName::data1);

1339

1340 const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);

1341

1342 const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);

1343

1345 DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),

1347 }

1348

1350 DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),

1352 }

1353

    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
      return nullptr;

1357

1358

1359

1360 }

1361

1362 return Where;

1363}

1364

1365

1366

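// Copies the two subregisters of the merged load result back into the
// original destination operands of CI and Paired.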
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
    AMDGPU::OpName OpName, Register DestReg) const {

1371 MachineBasicBlock *MBB = CI.I->getParent();

1372

1373 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

1374

1375

1376 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

1377 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);

1378 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

1379

1380

1381

1382

1383 Dest0->setIsEarlyClobber(false);

1384 Dest1->setIsEarlyClobber(false);

1385

1387 .add(*Dest0)

1388 .addReg(DestReg, 0, SubRegIdx0);

1390 .add(*Dest1)

1392}

1393

1394

1395

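// Packs the data operands of CI and Paired into one wide register with a
// REG_SEQUENCE and returns it for use by the merged store.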
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      MachineBasicBlock::iterator InsertBefore,
                                      const DebugLoc &DL,
                                      AMDGPU::OpName OpName) const {

1401 MachineBasicBlock *MBB = CI.I->getParent();

1402

1403 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

1404

1405

1406 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1407 Register SrcReg = MRI->createVirtualRegister(SuperRC);

1408

1409 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);

1410 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

1411

1412 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

1413 .add(*Src0)

1415 .add(*Src1)

1416 .addImm(SubRegIdx1);

1417

1418 return SrcReg;

1419}

1420

1421unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {

1423 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

1424 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

1425}

1426

1427unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {

1429 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

1430

1431 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9

1432 : AMDGPU::DS_READ2ST64_B64_gfx9;

1433}

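// Replaces a pair of DS reads with a single ds_read2 / ds_read2st64,
// materializing a new base register when a common base offset was factored
// out.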
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {

1438 MachineBasicBlock *MBB = CI.I->getParent();

1439

1440

1441

1442 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

1443

1444 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);

1445 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);

1446 unsigned Opc =

1447 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

1448

1450 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

1451

1452 const MCInstrDesc &Read2Desc = TII->get(Opc);

1453

1454 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1455 Register DestReg = MRI->createVirtualRegister(SuperRC);

1456

1459

1461 unsigned BaseSubReg = AddrReg->getSubReg();

1462 unsigned BaseRegFlags = 0;

1463 if (CI.BaseOff) {

1464 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

1465 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)

1466 .addImm(CI.BaseOff);

1467

1468 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

1470

1471 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)

1472 .addReg(ImmReg)

1473 .addReg(AddrReg->getReg(), 0, BaseSubReg)

1474 .addImm(0);

1475 BaseSubReg = 0;

1476 }

1477

1478 MachineInstrBuilder Read2 =

1479 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)

1480 .addReg(BaseReg, BaseRegFlags, BaseSubReg)

1481 .addImm(NewOffset0)

1482 .addImm(NewOffset1)

1485

1486 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

1487

1488 CI.I->eraseFromParent();

1489 Paired.I->eraseFromParent();

1490

1491 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');

1492 return Read2;

1493}

1494

1495unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {

1497 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

1498 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9

1499 : AMDGPU::DS_WRITE2_B64_gfx9;

1500}

1501

1502unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {

1504 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32

1505 : AMDGPU::DS_WRITE2ST64_B64;

1506

1507 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9

1508 : AMDGPU::DS_WRITE2ST64_B64_gfx9;

1509}

1510

1511unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {

1512 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

1513}

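// Replaces a pair of DS writes with a single ds_write2 / ds_write2st64.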
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1518 MachineBasicBlock *MBB = CI.I->getParent();

1519

1520

1521

1522 const MachineOperand *AddrReg =

1523 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

1524 const MachineOperand *Data0 =

1525 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);

1526 const MachineOperand *Data1 =

1527 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

1528

1529 unsigned NewOffset0 = CI.Offset;

1530 unsigned NewOffset1 = Paired.Offset;

1531 unsigned Opc = getWrite2Opcode(CI);

1532

1533 if (NewOffset0 > NewOffset1) {

1534

1535 std::swap(NewOffset0, NewOffset1);

1537 }

1538

1540 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

1541

1542 const MCInstrDesc &Write2Desc = TII->get(Opc);

1545

1547 unsigned BaseSubReg = AddrReg->getSubReg();

1548 unsigned BaseRegFlags = 0;

1549 if (CI.BaseOff) {

1550 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

1551 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)

1552 .addImm(CI.BaseOff);

1553

1554 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

1556

1557 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)

1558 .addReg(ImmReg)

1559 .addReg(AddrReg->getReg(), 0, BaseSubReg)

1560 .addImm(0);

1561 BaseSubReg = 0;

1562 }

1563

1564 MachineInstrBuilder Write2 =

1566 .addReg(BaseReg, BaseRegFlags, BaseSubReg)

1567 .add(*Data0)

1568 .add(*Data1)

1569 .addImm(NewOffset0)

1570 .addImm(NewOffset1)

1573

1574 CI.I->eraseFromParent();

1575 Paired.I->eraseFromParent();

1576

1577 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');

1578 return Write2;

1579}

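// Merges two image loads by OR-ing their dmasks and copying the remaining
// operands from the leading instruction.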
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {

1584 MachineBasicBlock *MBB = CI.I->getParent();

1587

1588 const unsigned Opcode = getNewOpcode(CI, Paired);

1589

1590 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1591

1592 Register DestReg = MRI->createVirtualRegister(SuperRC);

1593 unsigned MergedDMask = CI.DMask | Paired.DMask;

1594 unsigned DMaskIdx =

1595 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

1596

1597 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

1598 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {

1599 if (I == DMaskIdx)

1600 MIB.addImm(MergedDMask);

1601 else

1602 MIB.add((*CI.I).getOperand(I));

1603 }

1604

1605

1606

1607

1608 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1609

1610 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1611

1612 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

1613

1614 CI.I->eraseFromParent();

1615 Paired.I->eraseFromParent();

1616 return New;

1617}

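// Merges two scalar (S_LOAD / S_BUFFER_LOAD) loads into one wider load using
// the smaller of the two offsets.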
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1622 MachineBasicBlock *MBB = CI.I->getParent();

1625

1626 const unsigned Opcode = getNewOpcode(CI, Paired);

1627

1628 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1629

1630 Register DestReg = MRI->createVirtualRegister(SuperRC);

1631 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1632

1633

1634

1635

1636 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1637

1638 MachineInstrBuilder New =

1640 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));

1641 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)

1642 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));

1643 New.addImm(MergedOffset);

1644 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1645

1646 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);

1647

1648 CI.I->eraseFromParent();

1649 Paired.I->eraseFromParent();

1650 return New;

1651}

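// Merges two MUBUF buffer loads into one wider buffer load.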
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1656 MachineBasicBlock *MBB = CI.I->getParent();

1657

1660

1661 const unsigned Opcode = getNewOpcode(CI, Paired);

1662

1663 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1664

1665

1666 Register DestReg = MRI->createVirtualRegister(SuperRC);

1667 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1668

1669 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

1670

1671 AddressRegs Regs = getRegs(Opcode, *TII);

1672

1673 if (Regs.VAddr)

1674 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1675

1676

1677

1678

1679 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1680

1681 MachineInstr *New =

1682 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1683 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1684 .addImm(MergedOffset)

1685 .addImm(CI.CPol)

1686 .addImm(0)

1687 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1688

1689 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

1690

1691 CI.I->eraseFromParent();

1692 Paired.I->eraseFromParent();

1693 return New;

1694}

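// Merges two MTBUF (typed buffer) loads, recomputing the buffer format for
// the combined component count.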
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1699 MachineBasicBlock *MBB = CI.I->getParent();

1700

1703

1704 const unsigned Opcode = getNewOpcode(CI, Paired);

1705

1706 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1707

1708

1709 Register DestReg = MRI->createVirtualRegister(SuperRC);

1710 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1711

1712 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

1713

1714 AddressRegs Regs = getRegs(Opcode, *TII);

1715

1716 if (Regs.VAddr)

1717 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1718

1719

1720

1721

1722 unsigned NumCombinedComponents = CI.Width + Paired.Width;

1723 if (NumCombinedComponents == 3 && CI.EltSize <= 2)

1724 NumCombinedComponents = 4;

1725 unsigned JoinedFormat =

1727

1728

1729

1730

1731 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1732

1733 MachineInstr *New =

1734 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1735 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1736 .addImm(MergedOffset)

1737 .addImm(JoinedFormat)

1738 .addImm(CI.CPol)

1739 .addImm(0)

1740 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1741

1742 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

1743

1744 CI.I->eraseFromParent();

1745 Paired.I->eraseFromParent();

1746 return New;

1747}

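// Merges two MTBUF (typed buffer) stores, recomputing the buffer format for
// the combined component count.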
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1752 MachineBasicBlock *MBB = CI.I->getParent();

1755

1756 const unsigned Opcode = getNewOpcode(CI, Paired);

1757

1759 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

1760

1763

1764 AddressRegs Regs = getRegs(Opcode, *TII);

1765

1766 if (Regs.VAddr)

1767 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1768

1769

1770

1771

1772 unsigned NumCombinedComponents = CI.Width + Paired.Width;

1773 if (NumCombinedComponents == 3 && CI.EltSize <= 2)

1774 NumCombinedComponents = 4;

1775 unsigned JoinedFormat =

1777

1778

1779

1780

1781 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1782

1783 MachineInstr *New =

1784 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1785 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1786 .addImm(std::min(CI.Offset, Paired.Offset))

1787 .addImm(JoinedFormat)

1788 .addImm(CI.CPol)

1789 .addImm(0)

1790 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1791

1792 CI.I->eraseFromParent();

1793 Paired.I->eraseFromParent();

1794 return New;

1795}

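// Merges two FLAT/GLOBAL loads into one wider load.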
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1800 MachineBasicBlock *MBB = CI.I->getParent();

1801

1804

1805 const unsigned Opcode = getNewOpcode(CI, Paired);

1806

1807 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1808 Register DestReg = MRI->createVirtualRegister(SuperRC);

1809

1810 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

1811

1812 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

1813 MIB.add(*SAddr);

1814

1815 MachineInstr *New =

1816 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

1817 .addImm(std::min(CI.Offset, Paired.Offset))

1818 .addImm(CI.CPol)

1819 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1820

1821 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

1822

1823 CI.I->eraseFromParent();

1824 Paired.I->eraseFromParent();

1825 return New;

1826}

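// Merges two FLAT/GLOBAL stores into one wider store.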
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

1831 MachineBasicBlock *MBB = CI.I->getParent();

1832

1835

1836 const unsigned Opcode = getNewOpcode(CI, Paired);

1837

1839 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

1840

1842 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

1844

1845 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

1846 MIB.add(*SAddr);

1847

1848 MachineInstr *New =

1849 MIB.addImm(std::min(CI.Offset, Paired.Offset))

1850 .addImm(CI.CPol)

1851 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1852

1853 CI.I->eraseFromParent();

1854 Paired.I->eraseFromParent();

1855 return New;

1856}

static bool needsConstrainedOpcode(const GCNSubtarget &STM,
                                   ArrayRef<MachineMemOperand *> MMOs,
                                   unsigned Width) {
  // Conservatively returns true if not found the MMO.
  return STM.isXNACKEnabled() &&
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
}

1865

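// Maps the pair's common instruction class and combined width to the opcode
// of the merged instruction, or returns 0 if no such opcode exists.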
1866unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,

1867 const CombineInfo &Paired) {

1868 const unsigned Width = CI.Width + Paired.Width;

1869

1870 switch (getCommonInstClass(CI, Paired)) {

  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

1880

1883 case S_BUFFER_LOAD_IMM: {

1884

1885

    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1888 switch (Width) {

1889 default:

1890 return 0;

1891 case 2:

1892 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec

1893 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

1894 case 3:

1895 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec

1896 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;

1897 case 4:

1898 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec

1899 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;

1900 case 8:

1901 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec

1902 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;

1903 }

1904 }

1905 case S_BUFFER_LOAD_SGPR_IMM: {

1906

1907

    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1910 switch (Width) {

1911 default:

1912 return 0;

1913 case 2:

1914 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec

1915 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;

1916 case 3:

1917 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec

1918 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;

1919 case 4:

1920 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec

1921 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;

1922 case 8:

1923 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec

1924 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;

1925 }

1926 }

1927 case S_LOAD_IMM: {

1928

1929

    bool NeedsConstrainedOpc =
        needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1932 switch (Width) {

1933 default:

1934 return 0;

1935 case 2:

1936 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec

1937 : AMDGPU::S_LOAD_DWORDX2_IMM;

1938 case 3:

1939 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec

1940 : AMDGPU::S_LOAD_DWORDX3_IMM;

1941 case 4:

1942 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec

1943 : AMDGPU::S_LOAD_DWORDX4_IMM;

1944 case 8:

1945 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec

1946 : AMDGPU::S_LOAD_DWORDX8_IMM;

1947 }

1948 }

1949 case GLOBAL_LOAD:

1950 switch (Width) {

1951 default:

1952 return 0;

1953 case 2:

1954 return AMDGPU::GLOBAL_LOAD_DWORDX2;

1955 case 3:

1956 return AMDGPU::GLOBAL_LOAD_DWORDX3;

1957 case 4:

1958 return AMDGPU::GLOBAL_LOAD_DWORDX4;

1959 }

1960 case GLOBAL_LOAD_SADDR:

1961 switch (Width) {

1962 default:

1963 return 0;

1964 case 2:

1965 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;

1966 case 3:

1967 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;

1968 case 4:

1969 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;

1970 }

1971 case GLOBAL_STORE:

1972 switch (Width) {

1973 default:

1974 return 0;

1975 case 2:

1976 return AMDGPU::GLOBAL_STORE_DWORDX2;

1977 case 3:

1978 return AMDGPU::GLOBAL_STORE_DWORDX3;

1979 case 4:

1980 return AMDGPU::GLOBAL_STORE_DWORDX4;

1981 }

1982 case GLOBAL_STORE_SADDR:

1983 switch (Width) {

1984 default:

1985 return 0;

1986 case 2:

1987 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;

1988 case 3:

1989 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;

1990 case 4:

1991 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;

1992 }

1993 case FLAT_LOAD:

1994 switch (Width) {

1995 default:

1996 return 0;

1997 case 2:

1998 return AMDGPU::FLAT_LOAD_DWORDX2;

1999 case 3:

2000 return AMDGPU::FLAT_LOAD_DWORDX3;

2001 case 4:

2002 return AMDGPU::FLAT_LOAD_DWORDX4;

2003 }

2004 case FLAT_STORE:

2005 switch (Width) {

2006 default:

2007 return 0;

2008 case 2:

2009 return AMDGPU::FLAT_STORE_DWORDX2;

2010 case 3:

2011 return AMDGPU::FLAT_STORE_DWORDX3;

2012 case 4:

2013 return AMDGPU::FLAT_STORE_DWORDX4;

2014 }

2015 case FLAT_LOAD_SADDR:

2016 switch (Width) {

2017 default:

2018 return 0;

2019 case 2:

2020 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;

2021 case 3:

2022 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;

2023 case 4:

2024 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;

2025 }

2026 case FLAT_STORE_SADDR:

2027 switch (Width) {

2028 default:

2029 return 0;

2030 case 2:

2031 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;

2032 case 3:

2033 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;

2034 case 4:

2035 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;

2036 }

2039 "No overlaps");

2041 }

2042}

2043

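// Computes which subregister indices of the merged wide register correspond
// to CI's and Paired's original values, ordered by their offsets (or dmasks
// for image instructions).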
2044std::pair<unsigned, unsigned>

2045SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,

2046 const CombineInfo &Paired) {

2047 assert((CI.InstClass != MIMG ||

2048 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==

2049 CI.Width + Paired.Width)) &&

2050 "No overlaps");

2051

2052 unsigned Idx0;

2053 unsigned Idx1;

2054

2055 static const unsigned Idxs[5][4] = {

2056 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},

2057 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},

2058 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},

2059 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},

2060 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

2061 };

2062

2063 assert(CI.Width >= 1 && CI.Width <= 4);

2064 assert(Paired.Width >= 1 && Paired.Width <= 4);

2065

2066 if (Paired < CI) {

2067 Idx1 = Idxs[0][Paired.Width - 1];

2068 Idx0 = Idxs[Paired.Width][CI.Width - 1];

2069 } else {

2070 Idx0 = Idxs[0][CI.Width - 1];

2071 Idx1 = Idxs[CI.Width][Paired.Width - 1];

2072 }

2073

2074 return {Idx0, Idx1};

2075}

2076

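// Picks an SGPR, VGPR or AGPR register class wide enough to hold the merged
// value.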
2077const TargetRegisterClass *

2078SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,

2079 const CombineInfo &Paired) const {

2080 if (CI.InstClass == S_BUFFER_LOAD_IMM ||

2081 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {

2082 switch (CI.Width + Paired.Width) {

2083 default:

2084 return nullptr;

2085 case 2:

2086 return &AMDGPU::SReg_64_XEXECRegClass;

2087 case 3:

2088 return &AMDGPU::SGPR_96RegClass;

2089 case 4:

2090 return &AMDGPU::SGPR_128RegClass;

2091 case 8:

2092 return &AMDGPU::SGPR_256RegClass;

2093 case 16:

2094 return &AMDGPU::SGPR_512RegClass;

2095 }

2096 }

2097

2098

2099

2100 unsigned BitWidth = 32 * (CI.Width + Paired.Width);

2101 return TRI->isAGPRClass(getDataRegClass(*CI.I))

2102 ? TRI->getAGPRClassForBitWidth(BitWidth)

2103 : TRI->getVGPRClassForBitWidth(BitWidth);

2104}

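// Merges two MUBUF buffer stores into one wider buffer store.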
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {

2109 MachineBasicBlock *MBB = CI.I->getParent();

2112

2113 const unsigned Opcode = getNewOpcode(CI, Paired);

2114

2116 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

2117

2120

2121 AddressRegs Regs = getRegs(Opcode, *TII);

2122

2123 if (Regs.VAddr)

2124 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

2125

2126

2127

2128

2129

2130 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

2131

2132 MachineInstr *New =

2133 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

2134 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

2135 .addImm(std::min(CI.Offset, Paired.Offset))

2136 .addImm(CI.CPol)

2137 .addImm(0)

2138 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

2139

2140 CI.I->eraseFromParent();

2141 Paired.I->eraseFromParent();

2142 return New;

2143}

2144

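// Returns Val as an inline-constant immediate operand, or materializes it
// into a fresh SGPR with s_mov_b32.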
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
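
// [Editorial sketch -- not part of the original source.] computeBase splits
// the 64-bit anchor offset into two 32-bit halves and adds them with the
// V_ADD_CO_U32_e64 / V_ADDC_U32_e64 pair, i.e. a 32-bit add whose carry-out
// feeds the high-half add. The same arithmetic with plain integers:
namespace add64_sketch {
constexpr unsigned long long add64(unsigned BaseLo, unsigned BaseHi,
                                   unsigned OffLo, unsigned OffHi) {
  unsigned long long Lo = (unsigned long long)BaseLo + OffLo; // V_ADD_CO_U32
  unsigned Carry = (unsigned)(Lo >> 32);                      // carry-out
  unsigned Hi = BaseHi + OffHi + Carry;                       // V_ADDC_U32
  return ((unsigned long long)Hi << 32) | (Lo & 0xffffffffu); // REG_SEQUENCE
}
static_assert(add64(0xffffffffu, 0, 1, 0) == 0x100000000ull,
              "carry from the low add must propagate into the high half");
} // namespace add64_sketch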

// Update the base register and immediate offset operands of MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}

// Analyze Base and decompose it into:
//  - the 32-bit low/high base registers (and their subregisters), and
//  - a 64-bit constant offset,
// expecting the base to have been materialized as an S_MOV_B32 of the
// constant feeding a V_ADD_CO_U32_e64 / V_ADDC_U32_e64 pair whose results
// are combined by a REG_SEQUENCE.
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  if (!BaseLo.isReg())
    return;

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm() || Src0->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  if (!BaseHi.isReg())
    return;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
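
// [Editorial sketch -- not part of the original source.] The final line of
// processBaseWithConstOffset reassembles the two 32-bit immediates into one
// 64-bit offset; masking the low half keeps a sign-extended "negative" low
// immediate from leaking into the high 32 bits:
namespace offset_recombine_sketch {
constexpr unsigned long long combine(int LoImm, unsigned long long HiImm) {
  return ((unsigned long long)LoImm & 0x00000000ffffffffull) | (HiImm << 32);
}
static_assert(combine(-4096, 1) == 0x1fffff000ull,
              "low half is treated as an unsigned 32-bit value");
} // namespace offset_recombine_sketch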

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
    return false;

  // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
  if (SIInstrInfo::isFLATScratch(MI))
    return false;

  unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
                                              : AMDGPUAS::FLAT_ADDRESS;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  auto [It, Inserted] = Visited.try_emplace(&MI);
  MemAddress MAddr;
  if (Inserted) {
    processBaseWithConstOffset(Base, MAddr);
    It->second = MAddr;
  } else
    MAddr = It->second;

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
                    << printReg(MAddr.Base.LoReg, TRI)
                    << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor instruction with the
  // same base registers whose offset has the largest distance from MI's
  // offset that is still a legal immediate offset for this address space.
  // Re-basing MI (and every other access sharing the base) against that
  // anchor lets the per-instruction deltas be folded into the instructions'
  // immediate offset fields instead of materializing a separate full 64-bit
  // address for each access.

  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;

    // Only instructions with the same opcode and a zero immediate offset are
    // considered as anchor candidates.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
    if (Inserted) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
    } else
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalFlatAddressingMode(AM, AS) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, re-compute the anchor instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

      if (TLI->isLegalFlatAddressingMode(AM, AS)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << OtherOffset; dbgs() << ")";
                   OtherMI->dump());
        updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; OtherMI->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
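
// [Editorial sketch -- not part of the original source.] The anchor search
// above keeps, among the instructions sharing MI's base registers, the one
// whose offset is farthest from MI's offset while the distance is still a
// legal immediate offset; everything else is then re-based against that
// anchor. A standalone model of the selection -- the signed 13-bit range is
// an assumed example stand-in for TLI->isLegalFlatAddressingMode():
namespace anchor_pick_sketch {
constexpr bool fitsImm13(long long Delta) {           // assumed field width
  return Delta >= -4096 && Delta <= 4095;
}
// Index of the candidate with the largest reachable |delta|, or -1 if none.
constexpr int pickAnchor(const long long *Offs, int N, long long MIOffset) {
  int Best = -1;
  unsigned long long BestDist = 0;
  for (int I = 0; I < N; ++I) {
    long long Delta = MIOffset - Offs[I];
    unsigned long long Dist = (unsigned long long)(Delta < 0 ? -Delta : Delta);
    if (fitsImm13(Delta) && Dist > BestDist) {
      BestDist = Dist;
      Best = I;
    }
  }
  return Best;
}
// MI at base+4096, candidates at base+6144 and base+8192: both deltas fit,
// but base+8192 is farther away, so it becomes the anchor.
constexpr long long CandidateOffs[] = {6144, 8192};
static_assert(pickAnchor(CandidateOffs, 2, 4096) == 1,
              "the farthest still-reachable offset wins");
} // namespace anchor_pick_sketch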

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // No list with this base address exists yet, so start a new one.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists, one list per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // Run offset promotion before checking whether the address is mergeable,
    // because it can produce better code even for non-mergeable instructions.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat ordered memory references and unmodeled side effects as barriers;
    // the search resumes after this instruction in a separate merge list.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
      const MachineOperand *Fmt =
          TII->getNamedOperand(MI, AMDGPU::OpName::format);
      if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
        LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
        continue;
      }
    }

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions, one list per base
  // address. Drop lists with fewer than two entries and sort the rest by
  // offset so that candidates that can actually be merged end up adjacent.
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // Only one instruction was found for this address, and a merge needs at
      // least two, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the list by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which makes it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return {BlockI, Modified};
}

// Merge the instructions in each candidate list, re-queueing a list for
// another pass while it may still contain further merge opportunities.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case FLAT_LOAD_SADDR:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case FLAT_STORE_SADDR:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
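
// [Editorial sketch -- not part of the original source.] Each successful merge
// doubles the combined width and sets OptimizeListAgain while the result can
// still grow (combined width < 4 for the VMEM/flat cases above, < 8 for SMEM),
// which is what drives the outer do { ... } while (OptimizeAgain) loop. A tiny
// model of how many re-merge passes a run of equal-width loads needs:
namespace remerge_sketch {
constexpr int countMergePasses(int Width, int Count) {
  int Passes = 0;
  while (Count > 1 && Width < 4) { // stop once the x4 width is reached
    Count = (Count + 1) / 2;       // each pass pairs up adjacent entries
    Width *= 2;
    ++Passes;
  }
  return Passes;
}
// Four adjacent dword loads: pass 1 forms two x2 loads, pass 2 forms one x4.
static_assert(countMergePasses(1, 4) == 2, "two passes to reach a single x4");
} // namespace remerge_sketch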

bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())
      .run(MF);
}

bool SILoadStoreOptimizer::run(MachineFunction &MF) {
  this->MF = &MF;
  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Instructions whose constant offsets have already been promoted to the
  // immediate field; tracked per basic block.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: collect all instructions in this section of the block
      // that we know how to merge, grouped by base address.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}

PreservedAnalyses
SILoadStoreOptimizerPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  MFPropsModifier _(*this, MF);

  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
