[AArch64] Expand bcmp() for small block lengths · llvm/llvm-project@a005c1a (original) (raw)

4 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -629,6 +629,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
629 629
630 630 MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
631 631
632 + MaxLoadsPerMemcmpOptSize = 4;
633 + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
634 + ? MaxLoadsPerMemcmpOptSize : 8;
635 +
632 636 setStackPointerRegisterToSaveRestore(AArch64::SP);
633 637
634 638 setSchedulingPreference(Sched::Hybrid);
Original file line number Diff line number Diff line change
@@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
618 618 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
619 619 }
620 620
// Builds the option set that controls inline expansion of memcmp()/bcmp()
// calls for this target.
//
// OptSize is forwarded to the target lowering to select the load budget.
// IsZeroCmp is accepted as part of the interface but is not consulted in
// this body.
//
// Returns a TTI::MemCmpExpansionOptions value populated as described on the
// individual assignments below.
621 +AArch64TTIImpl::TTI::MemCmpExpansionOptions
622 +AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
623 + TTI::MemCmpExpansionOptions Options;
// Overlapping loads are only permitted when the subtarget does not demand
// strict alignment.
624 + Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
// The load budget comes from the target lowering and depends on OptSize.
625 + Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
// The per-block load count is set equal to the overall budget.
626 + Options.NumLoadsPerBlock = Options.MaxNumLoads;
627 +// TODO: Though vector loads usually perform well on AArch64, in some targets
628 +// they may wake up the FP unit, which raises the power consumption. Perhaps
629 +// they could be used with no holds barred (-O3).
// Only scalar sizes are listed (presumably widths in bytes, largest first --
// confirm against the MemCmpExpansionOptions definition).
630 + Options.LoadSizes = {8, 4, 2, 1};
631 +return Options;
632 +}
633 +
621 634 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
622 635 unsigned Alignment, unsigned AddressSpace,
623 636 const Instruction *I) {
Original file line number Diff line number Diff line change
@@ -130,6 +130,9 @@ class AArch64TTIImpl : public BasicTTIImplBase {
130 130 int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
131 131 const Instruction *I = nullptr);
132 132
// Returns the options governing memcmp()/bcmp() expansion for AArch64.
// NOTE(review): presumably overrides the same-named hook in the base TTI
// implementation -- confirm against BasicTTIImplBase.
133 + TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
134 +bool IsZeroCmp) const;
135 +
133 136 int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
134 137 unsigned AddressSpace, const Instruction *I = nullptr);
135 138
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
1 +; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECKN
2 +; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu -mattr=strict-align | FileCheck %s --check-prefixes=CHECK,CHECKS
3 +
4 +declare i32 @bcmp(i8*, i8*, i64) nounwind readonly
5 +declare i32 @memcmp(i8*, i8*, i64) nounwind readonly
6 +
; Equality test on a 15-byte bcmp() result, compiled without optsize.
; Both runs expect no call to bcmp to be emitted.
; Default run: expects two 8-byte ldr loads followed by two 8-byte ldur
; loads (presumably an overlapping second load per operand -- confirm).
; strict-align run: expects pairs of 8-, 4-, 2- and 1-byte loads
; (8 + 4 + 2 + 1 = 15), i.e. no overlapping accesses.
7 +define i1 @bcmp_b2(i8* %s1, i8* %s2) {
8 +entry:
9 +%bcmp = call i32 @bcmp(i8* %s1, i8* %s2, i64 15)
10 +%ret = icmp eq i32 %bcmp, 0
11 +ret i1 %ret
12 +
13 +; CHECK-LABEL: bcmp_b2:
14 +; CHECK-NOT: bl bcmp
15 +; CHECKN: ldr x
16 +; CHECKN-NEXT: ldr x
17 +; CHECKN-NEXT: ldur x
18 +; CHECKN-NEXT: ldur x
19 +; CHECKS: ldr x
20 +; CHECKS-NEXT: ldr x
21 +; CHECKS-NEXT: ldr w
22 +; CHECKS-NEXT: ldr w
23 +; CHECKS-NEXT: ldrh w
24 +; CHECKS-NEXT: ldrh w
25 +; CHECKS-NEXT: ldrb w
26 +; CHECKS-NEXT: ldrb w
27 +}
28 +
; Equality test on a 31-byte comparison in an optsize function.
; NOTE(review): despite the function name, the callee here is memcmp, not
; bcmp -- confirm this is intentional.
; Default run: expects the call to be expanded into two ldp, two ldr and
; two ldur 8-byte loads, with no call to memcmp.
; strict-align run: expects the library call to remain (presumably because
; 31 bytes without overlapping loads exceeds the optsize load budget --
; confirm against MaxLoadsPerMemcmpOptSize).
29 +define i1 @bcmp_bs(i8* %s1, i8* %s2) optsize {
30 +entry:
31 +%memcmp = call i32 @memcmp(i8* %s1, i8* %s2, i64 31)
32 +%ret = icmp eq i32 %memcmp, 0
33 +ret i1 %ret
34 +
35 +; CHECK-LABEL: bcmp_bs:
36 +; CHECKN-NOT: bl memcmp
37 +; CHECKN: ldp x
38 +; CHECKN-NEXT: ldp x
39 +; CHECKN-NEXT: ldr x
40 +; CHECKN-NEXT: ldr x
41 +; CHECKN-NEXT: ldur x
42 +; CHECKN-NEXT: ldur x
43 +; CHECKS: bl memcmp
44 +}