[AArch64] Expand bcmp() for small block lengths · llvm/llvm-project@a005c1a (original) (raw)
4 files changed
lines changed
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -629,6 +629,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, | ||
629 | 629 | |
630 | 630 | MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; |
631 | 631 | |
632 | + MaxLoadsPerMemcmpOptSize = 4; | |
633 | + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() | |
634 | + ? MaxLoadsPerMemcmpOptSize : 8; | |
635 | + | |
632 | 636 | setStackPointerRegisterToSaveRestore(AArch64::SP); |
633 | 637 | |
634 | 638 | setSchedulingPreference(Sched::Hybrid); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | ||
618 | 618 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); |
619 | 619 | } |
620 | 620 | |
621 | +AArch64TTIImpl::TTI::MemCmpExpansionOptions | |
622 | +AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |
623 | + TTI::MemCmpExpansionOptions Options; | |
624 | + Options.AllowOverlappingLoads = !ST->requiresStrictAlign(); | |
625 | + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |
626 | + Options.NumLoadsPerBlock = Options.MaxNumLoads; | |
627 | +// TODO: Although vector loads usually perform well on AArch64, on some targets | |
628 | +// they may wake up the FP unit, which raises power consumption. Perhaps | |
629 | +// they could be allowed without restriction at -O3. | |
630 | + Options.LoadSizes = {8, 4, 2, 1}; | |
631 | +return Options; | |
632 | +} | |
633 | + | |
621 | 634 | int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
622 | 635 | unsigned Alignment, unsigned AddressSpace, |
623 | 636 | const Instruction *I) { |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -130,6 +130,9 @@ class AArch64TTIImpl : public BasicTTIImplBase { | ||
130 | 130 | int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, |
131 | 131 | const Instruction *I = nullptr); |
132 | 132 | |
133 | + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, | |
134 | +bool IsZeroCmp) const; | |
135 | + | |
133 | 136 | int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, |
134 | 137 | unsigned AddressSpace, const Instruction *I = nullptr); |
135 | 138 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
1 | +; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECKN | |
2 | +; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu -mattr=strict-align | FileCheck %s --check-prefixes=CHECK,CHECKS | |
3 | + | |
4 | +declare i32 @bcmp(i8*, i8*, i64) nounwind readonly | |
5 | +declare i32 @memcmp(i8*, i8*, i64) nounwind readonly | |
6 | + | |
7 | +define i1 @bcmp_b2(i8* %s1, i8* %s2) { | |
8 | +entry: | |
9 | +%bcmp = call i32 @bcmp(i8* %s1, i8* %s2, i64 15) | |
10 | +%ret = icmp eq i32 %bcmp, 0 | |
11 | +ret i1 %ret | |
12 | + | |
13 | +; CHECK-LABEL: bcmp_b2: | |
14 | +; CHECK-NOT: bl bcmp | |
15 | +; CHECKN: ldr x | |
16 | +; CHECKN-NEXT: ldr x | |
17 | +; CHECKN-NEXT: ldur x | |
18 | +; CHECKN-NEXT: ldur x | |
19 | +; CHECKS: ldr x | |
20 | +; CHECKS-NEXT: ldr x | |
21 | +; CHECKS-NEXT: ldr w | |
22 | +; CHECKS-NEXT: ldr w | |
23 | +; CHECKS-NEXT: ldrh w | |
24 | +; CHECKS-NEXT: ldrh w | |
25 | +; CHECKS-NEXT: ldrb w | |
26 | +; CHECKS-NEXT: ldrb w | |
27 | +} | |
28 | + | |
29 | +define i1 @bcmp_bs(i8* %s1, i8* %s2) optsize { | |
30 | +entry: | |
31 | +%memcmp = call i32 @memcmp(i8* %s1, i8* %s2, i64 31) | |
32 | +%ret = icmp eq i32 %memcmp, 0 | |
33 | +ret i1 %ret | |
34 | + | |
35 | +; CHECK-LABEL: bcmp_bs: | |
36 | +; CHECKN-NOT: bl memcmp | |
37 | +; CHECKN: ldp x | |
38 | +; CHECKN-NEXT: ldp x | |
39 | +; CHECKN-NEXT: ldr x | |
40 | +; CHECKN-NEXT: ldr x | |
41 | +; CHECKN-NEXT: ldur x | |
42 | +; CHECKN-NEXT: ldur x | |
43 | +; CHECKS: bl memcmp | |
44 | +} |