LLVM: lib/Target/AArch64/AArch64SIMDInstrOpt.cpp Source File
//===- AArch64SIMDInstrOpt.cpp - AArch64 SIMD instructions optimization --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
//    inefficiency on some targets.
//    For example:
//       fmla v0.4s, v1.4s, v2.s[1]
//    is rewritten into:
//       dup  v3.4s, v2.s[1]
//       fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
//    inefficiency on some targets.
//    For example:
//       st2 {v0.4s, v1.4s}, addr
//    is rewritten into:
//       zip1 v2.4s, v0.4s, v1.4s
//       zip2 v3.4s, v0.4s, v1.4s
//       stp  q2, q3, addr
//
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <map>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "aarch64-simdinstr-opt"

STATISTIC(NumModifiedInstr,
          "Number of SIMD instructions modified");

#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
  "AArch64 SIMD instructions optimization pass"

namespace {

struct AArch64SIMDInstrOpt : public MachineFunctionPass {
  static char ID;

  const TargetInstrInfo *TII;
  MachineRegisterInfo *MRI;
  TargetSchedModel SchedModel;

  // Cache of per-(opcode, subtarget) replacement decisions, so the latency
  // comparison is not recomputed for every occurrence of an instruction.
  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;

  // Cache of the per-subtarget early-exit decision for the interleaved-store
  // subpass.
  std::unordered_map<std::string, bool> InterlEarlyExit;

  typedef enum {
    VectorElem,
    Interleave
  } Subpass;

  // An instruction whose opcode is OrigOpc is replaced by the sequence of
  // instructions whose opcodes are listed in ReplOpc.
  struct InstReplInfo {
    unsigned OrigOpc;
    std::vector<unsigned> ReplOpc;
    const TargetRegisterClass RC;
  };

#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
                OpcR7, OpcR8, OpcR9, RC) \
  {OpcOrg, \
   {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
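
// For illustration, the first RuleST2 entry below expands to the aggregate
//   {AArch64::ST2Twov2d,
//    {AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::STPQi},
//    AArch64::FPR128RegClass}
// i.e. one original opcode, its ordered replacement opcodes, and the register
// class used for the temporaries created during the rewrite.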

  // The Instruction Replacement Table:
  std::vector<InstReplInfo> IRT = {
    // ST2 instructions
    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::FPR64RegClass),
    // ST4 instructions
    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
          AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
          AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
          AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
          AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
          AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
          AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
          AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
          AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
          AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
  };
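
  // Note on how IRT is consumed: shouldExitEarly() walks every rule to decide
  // whether the interleave subpass can be skipped for the current subtarget,
  // and optimizeLdStInterleave() matches OrigOpc and then relies on the order
  // of ReplOpc (ZIP1/ZIP2 pairs first, STP last) when it builds the
  // replacement instructions.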

  // A costly instruction is replaced in this work by at most MaxNumRepl
  // cheaper instructions; the ST4 rewrite currently needs the most (10).
  static const unsigned MaxNumRepl = 10;

  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {}

  /// Based only on the latency of the instructions, determine whether it is
  /// cost-efficient to replace the instruction InstDesc by the instructions
  /// stored in ReplInstrMCID.
  /// Return true if the replacement is expected to be faster.
  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);

  /// Determine whether a subpass should exit early because the target does
  /// not benefit from any of its rewrites, so that no compile time is wasted
  /// on such targets.
  /// Return true if an early exit is recommended.
  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

  /// Check whether an equivalent DUP instruction has already been created
  /// earlier in the same basic block.
  /// Return true if such a DUP exists; in that case *DestReg is set to the
  /// destination register of the existing DUP.
  bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
                unsigned LaneNumber, unsigned *DestReg) const;

  /// Certain SIMD instructions with a vector-element operand are not
  /// efficient on some targets. Rewrite them into a DUP plus the equivalent
  /// vector-operand instruction when the latency model says that is cheaper.
  /// Return true if the instruction was rewritten.
  bool optimizeVectElement(MachineInstr &MI);

  /// Process the REG_SEQUENCE instruction that defines the source operand of
  /// an ST2/ST4 and extract the individual source registers and their kill
  /// states, e.g.
  ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1
  /// Return true when the instruction is processed successfully.
  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
                         unsigned* StRegKill, unsigned NumArg) const;

  /// Load/store interleaving instructions (ST2/ST4) are not always
  /// beneficial. Replace them by ZIP instructions and ordinary stores when
  /// that is cheaper.
  /// Return true if the instruction was rewritten.
  bool optimizeLdStInterleave(MachineInstr &MI);

  /// Return the number of useful source registers for the given store
  /// instruction (2 for ST2 and 4 for ST4).
  unsigned determineSrcReg(MachineInstr &MI) const;

  bool runOnMachineFunction(MachineFunction &Fn) override;

  StringRef getPassName() const override {
    return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
  }
};

char AArch64SIMDInstrOpt::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

/// Based only on the latency of the instructions, determine whether it is
/// cost-efficient to replace the instruction InstDesc by the instructions
/// stored in InstDescRepl.
/// Return true if the replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
  // Check whether the replacement decision for this opcode and subtarget is
  // already cached; if so, return it.
  std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
  auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
  auto It = SIMDInstrTable.find(InstID);
  if (It != SIMDInstrTable.end())
    return It->second;

  unsigned SCIdx = InstDesc->getSchedClass();
  const MCSchedClassDesc *SCDesc =
    SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

  // If the subtarget does not provide usable scheduling information for the
  // instructions of interest, then do not replace.
  const MCSchedClassDesc *SCDescRepl;
  if (!SCDesc->isValid() || SCDesc->isVariant())
  {
    SIMDInstrTable[InstID] = false;
    return false;
  }
  for (const auto *IDesc : InstDescRepl)
  {
    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
      IDesc->getSchedClass());
    if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
    {
      SIMDInstrTable[InstID] = false;
      return false;
    }
  }

  // Replacement cost: the sum of the latencies of the replacement
  // instructions, compared against the latency of the original instruction.
  unsigned ReplCost = 0;
  for (const auto *IDesc : InstDescRepl)
    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
  {
    SIMDInstrTable[InstID] = true;
    return true;
  }
  else
  {
    SIMDInstrTable[InstID] = false;
    return false;
  }
}
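
// For intuition only: on a hypothetical subtarget where FMLAv4i32_indexed has
// a modeled latency of 10 cycles while DUPv4i32lane and FMLAv4f32 have
// latencies of 3 and 4 cycles, the replacement cost is 3 + 4 = 7 < 10, so
// shouldReplaceInst() returns true. The numbers are illustrative and not
// taken from any real scheduling model.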

/// Determine whether the given subpass should exit early for this function's
/// target because none of its rewrites would be profitable there. This keeps
/// compile time low on targets that do not need these optimizations.
/// Return true if an early exit is recommended.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
  const MCInstrDesc* OriginalMCID;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  switch (SP) {
  // For the vector-element subpass, decide by comparing the latency of one
  // representative instruction (FMLA by element) against its replacement
  // pair (DUP + vector FMLA).
  case VectorElem:
    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
    ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
      return false;
    break;

  // For the interleaved-store subpass, check all rules in the table and cache
  // the per-subtarget result.
  case Interleave:
    std::string Subtarget =
        std::string(SchedModel.getSubtargetInfo()->getCPU());
    auto It = InterlEarlyExit.find(Subtarget);
    if (It != InterlEarlyExit.end())
      return It->second;

    for (auto &I : IRT) {
      OriginalMCID = &TII->get(I.OrigOpc);
      for (auto &Repl : I.ReplOpc)
        ReplInstrMCID.push_back(&TII->get(Repl));
      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
        InterlEarlyExit[Subtarget] = false;
        return false;
      }
      ReplInstrMCID.clear();
    }
    InterlEarlyExit[Subtarget] = true;
    break;
  }

  return true;
}

/// Check whether an equivalent DUP instruction has already been created
/// earlier in the same basic block.
/// Return true if such a DUP exists; in that case *DestReg is set to the
/// destination register of the existing DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                   unsigned SrcReg, unsigned LaneNumber,
                                   unsigned *DestReg) const {
  for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
       MII != MIE;) {
    MII--;
    MachineInstr *CurrentMI = &*MII;

    if (CurrentMI->getOpcode() == DupOpcode &&
        CurrentMI->getNumOperands() == 3 &&
        CurrentMI->getOperand(1).getReg() == SrcReg &&
        CurrentMI->getOperand(2).getImm() == LaneNumber) {
      *DestReg = CurrentMI->getOperand(0).getReg();
      return true;
    }
  }

  return false;
}

/// Certain SIMD instructions with a vector-element operand are not efficient
/// on some targets. Rewrite them into SIMD instructions with vector operands;
/// the rewrite is driven by the latency of the instructions.
/// The instructions of concern are, for the time being, the indexed forms of
/// FMLA, FMLS, FMUL and FMULX, and hence they are hardcoded below.
///
/// For example:
///    fmla v0.4s, v1.4s, v2.s[1]
///
/// is rewritten into:
///    dup  v3.4s, v2.s[1]      // the DUP is omitted if an equivalent exists
///    fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
  const MCInstrDesc *MulMCID, *DupMCID;
  const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

  switch (MI.getOpcode()) {
  default:
    return false;

  // 4X32 instructions
  case AArch64::FMLAv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLAv4f32);
    break;
  case AArch64::FMLSv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMLSv4f32);
    break;
  case AArch64::FMULXv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULXv4f32);
    break;
  case AArch64::FMULv4i32_indexed:
    DupMCID = &TII->get(AArch64::DUPv4i32lane);
    MulMCID = &TII->get(AArch64::FMULv4f32);
    break;

  // 2X64 instructions
  case AArch64::FMLAv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLAv2f64);
    break;
  case AArch64::FMLSv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMLSv2f64);
    break;
  case AArch64::FMULXv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULXv2f64);
    break;
  case AArch64::FMULv2i64_indexed:
    DupMCID = &TII->get(AArch64::DUPv2i64lane);
    MulMCID = &TII->get(AArch64::FMULv2f64);
    break;

  // 2X32 instructions
  case AArch64::FMLAv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLAv2f32);
    break;
  case AArch64::FMLSv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMLSv2f32);
    break;
  case AArch64::FMULXv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULXv2f32);
    break;
  case AArch64::FMULv2i32_indexed:
    RC = &AArch64::FPR64RegClass;
    DupMCID = &TII->get(AArch64::DUPv2i32lane);
    MulMCID = &TII->get(AArch64::FMULv2f32);
    break;
  }

  SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
  ReplInstrMCID.push_back(DupMCID);
  ReplInstrMCID.push_back(MulMCID);
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  // Get the operands of the current SIMD arithmetic instruction.
  Register MulDest = MI.getOperand(0).getReg();
  Register SrcReg0 = MI.getOperand(1).getReg();
  unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
  Register SrcReg1 = MI.getOperand(2).getReg();
  unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
  unsigned DupDest;

  // Instructions of interest have either 4 or 5 operands.
  if (MI.getNumOperands() == 5) {
    Register SrcReg2 = MI.getOperand(3).getReg();
    unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
    unsigned LaneNumber = MI.getOperand(4).getImm();

    // Create a new DUP instruction. Note that if an equivalent DUP has
    // already been created before, then reuse it instead of creating a new
    // one.
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg2, Src2IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(SrcReg1, Src1IsKill)
        .addReg(DupDest, Src2IsKill);
  } else if (MI.getNumOperands() == 4) {
    unsigned LaneNumber = MI.getOperand(3).getImm();
    if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
      DupDest = MRI.createVirtualRegister(RC);
      BuildMI(MBB, MI, DL, *DupMCID, DupDest)
          .addReg(SrcReg1, Src1IsKill)
          .addImm(LaneNumber);
    }
    BuildMI(MBB, MI, DL, *MulMCID, MulDest)
        .addReg(SrcReg0, Src0IsKill)
        .addReg(DupDest, Src1IsKill);
  } else {
    return false;
  }

  ++NumModifiedInstr;
  return true;
}
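
// Sketch of the resulting MIR for the accumulating (5-operand) case, assuming
// no reusable DUP is found (virtual register names are illustrative only):
//   %dup:fpr128 = DUPv4i32lane %src2, lane
//   %dst:fpr128 = FMLAv4f32 %acc, %src1, %dup
// replacing the original
//   %dst:fpr128 = FMLAv4i32_indexed %acc, %src1, %src2, lane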

/// Load/store interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and ordinary stores.
///
/// For example:
///    st2 {v0.4s, v1.4s}, addr
///
/// is rewritten into:
///    zip1 v2.4s, v0.4s, v1.4s
///    zip2 v3.4s, v0.4s, v1.4s
///    stp  q2, q3, addr
///
/// For example:
///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// is rewritten into:
///    zip1 v4.4s, v0.4s, v2.4s
///    zip2 v5.4s, v0.4s, v2.4s
///    zip1 v6.4s, v1.4s, v3.4s
///    zip2 v7.4s, v1.4s, v3.4s
///
///    zip1 v8.4s,  v4.4s, v6.4s
///    zip2 v9.4s,  v4.4s, v6.4s
///    zip1 v10.4s, v5.4s, v7.4s
///    zip2 v11.4s, v5.4s, v7.4s
///
///    stp  q8, q9,   [addr]
///    stp  q10, q11, [addr, #32]
///
/// Currently only ST2 and ST4 are handled; others may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {

  unsigned SeqReg, AddrReg;
  unsigned StReg[4], StRegKill[4];
  MachineInstr *DefiningMI;
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock &MBB = *MI.getParent();
  SmallVector<unsigned, MaxNumRepl> ZipDest;
  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;

  // If the current instruction matches any of the rewriting rules, gather
  // information about the operands of the new instructions.
  bool Match = false;
  for (auto &I : IRT) {
    if (MI.getOpcode() == I.OrigOpc) {
      SeqReg  = MI.getOperand(0).getReg();
      AddrReg = MI.getOperand(1).getReg();
      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
      unsigned NumReg = determineSrcReg(MI);
      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
        return false;

      for (auto &Repl : I.ReplOpc) {
        ReplInstrMCID.push_back(&TII->get(Repl));
        // Generate destination registers, but only for non-store instructions.
        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
      }
      Match = true;
      break;
    }
  }

  if (!Match)
    return false;

  // Determine whether it is profitable to replace MI by the series of
  // instructions represented in ReplInstrMCID.
  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
                         ReplInstrMCID))
    return false;

  // Generate the replacement instructions composed of ZIP1, ZIP2 and STP. At
  // this point the code generation is hardcoded rather than driven by the
  // replacement table above, because code generation for the ST2 replacement
  // differs from that for the ST4 replacement.
  switch (MI.getOpcode()) {
  default:
    return false;

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[1], StRegKill[1]);
    // STP instruction
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
        .addReg(ZipDest[0])
        .addReg(ZipDest[1])
        .addReg(AddrReg)
        .addImm(0);
    break;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    // ZIP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
        .addReg(StReg[0], StRegKill[0])
        .addReg(StReg[2], StRegKill[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
        .addReg(StReg[1], StRegKill[1])
        .addReg(StReg[3], StRegKill[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
        .addReg(ZipDest[0])
        .addReg(ZipDest[2]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
        .addReg(ZipDest[1])
        .addReg(ZipDest[3]);
    // STP instructions
    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
        .addReg(ZipDest[4])
        .addReg(ZipDest[5])
        .addReg(AddrReg)
        .addImm(0);
    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
        .addReg(ZipDest[6])
        .addReg(ZipDest[7])
        .addReg(AddrReg)
        .addImm(2);
    break;
  }

  ++NumModifiedInstr;
  return true;
}

/// Process the REG_SEQUENCE instruction that defines the source operand of an
/// ST2/ST4 instruction and extract the individual source registers and their
/// kill states, e.g.
///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
     unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
  assert(DefiningMI != nullptr);
  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
    return false;

  for (unsigned i=0; i<NumArg; i++) {
    StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());

    // Validation check for the other arguments.
    if (DefiningMI->getOperand(2*i+2).isImm()) {
      switch (DefiningMI->getOperand(2*i+2).getImm()) {
      default:
        return false;

      case AArch64::dsub0:
      case AArch64::dsub1:
      case AArch64::dsub2:
      case AArch64::dsub3:
      case AArch64::qsub0:
      case AArch64::qsub1:
      case AArch64::qsub2:
      case AArch64::qsub3:
        break;
      }
    }
    else
      return false;
  }
  return true;
}

/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unsupported instruction in the AdvSIMD load/store optimization pass");

  case AArch64::ST2Twov16b:
  case AArch64::ST2Twov8b:
  case AArch64::ST2Twov8h:
  case AArch64::ST2Twov4h:
  case AArch64::ST2Twov4s:
  case AArch64::ST2Twov2s:
  case AArch64::ST2Twov2d:
    return 2;

  case AArch64::ST4Fourv16b:
  case AArch64::ST4Fourv8b:
  case AArch64::ST4Fourv8h:
  case AArch64::ST4Fourv4h:
  case AArch64::ST4Fourv4s:
  case AArch64::ST4Fourv2s:
  case AArch64::ST4Fourv2d:
    return 4;
  }
}

bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  TII = MF.getSubtarget().getInstrInfo();
  MRI = &MF.getRegInfo();
  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
  SchedModel.init(&ST);
  if (!SchedModel.hasInstrSchedModel())
    return false;

  bool Changed = false;
  for (auto OptimizationKind : {VectorElem, Interleave}) {
    if (!shouldExitEarly(&MF, OptimizationKind)) {
      SmallVector<MachineInstr *, 8> RemoveMIs;
      for (MachineBasicBlock &MBB : MF) {
        for (MachineInstr &MI : MBB) {
          bool InstRewrite;
          if (OptimizationKind == VectorElem)
            InstRewrite = optimizeVectElement(MI);
          else
            InstRewrite = optimizeLdStInterleave(MI);
          if (InstRewrite) {
            // Add MI to the list of instructions to be removed given that it
            // has been replaced.
            RemoveMIs.push_back(&MI);
            Changed = true;
          }
        }
      }
      for (MachineInstr *MI : RemoveMIs)
        MI->eraseFromParent();
    }
  }

  return Changed;
}

/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
  return new AArch64SIMDInstrOpt();
}