Improve Math(F).FusedMultiplyAdd codegen by EgorBo · Pull Request #27060 · dotnet/coreclr (original) (raw)
Fixes https://github.com/dotnet/coreclr/issues/25829. Currently, `Math(F).FusedMultiplyAdd` always emits `vfmadd213ss`/`vfmadd213sd`, preceded by `xor`s when any of the arguments are negated.
Test cases:
/// <summary>Fused multiply-add with no negated operands: (a * b) + c.</summary>
static float Test1(float a, float b, float c)
    => MathF.FusedMultiplyAdd(a, b, c);

/// <summary>Fused multiply-add with the second multiplicand negated: (a * -b) + c.</summary>
static float Test2(float a, float b, float c)
    => MathF.FusedMultiplyAdd(a, -b, c);

/// <summary>Fused multiply-add with the first multiplicand negated: (-a * b) + c.</summary>
static float Test3(float a, float b, float c)
    => MathF.FusedMultiplyAdd(-a, b, c);

/// <summary>Fused multiply-add with both multiplicands negated: (-a * -b) + c.</summary>
static float Test4(float a, float b, float c)
    => MathF.FusedMultiplyAdd(-a, -b, c);

/// <summary>Fused multiply-add with the addend negated: (a * b) + (-c).</summary>
static float Test5(float a, float b, float c)
    => MathF.FusedMultiplyAdd(a, b, -c);

/// <summary>Fused multiply-add with the second multiplicand and the addend negated: (a * -b) + (-c).</summary>
static float Test6(float a, float b, float c)
    => MathF.FusedMultiplyAdd(a, -b, -c);

/// <summary>Fused multiply-add with the first multiplicand and the addend negated: (-a * b) + (-c).</summary>
static float Test7(float a, float b, float c)
    => MathF.FusedMultiplyAdd(-a, b, -c);

/// <summary>Fused multiply-add with all three operands negated: (-a * -b) + (-c).</summary>
static float Test8(float a, float b, float c)
    => MathF.FusedMultiplyAdd(-a, -b, -c);
Was:
; Method FmaFTests:Test1(float,float,float):float
G_M46841_IG01:
vzeroupper
G_M46841_IG02:
vfmadd213ss xmm0, xmm1, xmm2
G_M46841_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test2(float,float,float):float
G_M46842_IG01:
vzeroupper
G_M46842_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm1, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46842_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 21
; Method FmaFTests:Test3(float,float,float):float
G_M46843_IG01:
vzeroupper
G_M46843_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm0, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46843_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 21
; Method FmaFTests:Test4(float,float,float):float
G_M46844_IG01:
vzeroupper
G_M46844_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm0, xmm3
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm1, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46844_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 33
; Method FmaFTests:Test5(float,float,float):float
G_M46845_IG01:
vzeroupper
G_M46845_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm2, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46845_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 21
; Method FmaFTests:Test6(float,float,float):float
G_M46846_IG01:
vzeroupper
G_M46846_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm1, xmm3
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm2, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46846_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 33
; Method FmaFTests:Test7(float,float,float):float
G_M46847_IG01:
vzeroupper
G_M46847_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm0, xmm3
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm2, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46847_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 33
; Method FmaFTests:Test8(float,float,float):float
G_M46832_IG01:
vzeroupper
G_M46832_IG02:
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm0, xmm3
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm1, xmm3
vmovss xmm3, dword ptr [reloc @RWD00]
vxorps xmm2, xmm3
vfmadd213ss xmm0, xmm1, xmm2
G_M46832_IG03:
ret
RWD00 dd 80000000h
; Total bytes of code: 45
Now:
; Method FmaFTests:Test1(float,float,float):float
G_M12796_IG01:
vzeroupper
G_M12796_IG02:
vfmadd213ss xmm0, xmm1, xmm2
G_M12796_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test2(float,float,float):float
G_M12799_IG01:
vzeroupper
G_M12799_IG02:
vfnmadd213ss xmm0, xmm1, xmm2
G_M12799_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test3(float,float,float):float
G_M12798_IG01:
vzeroupper
G_M12798_IG02:
vfnmadd213ss xmm0, xmm1, xmm2
G_M12798_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test4(float,float,float):float
G_M12793_IG01:
vzeroupper
G_M12793_IG02:
vfmadd213ss xmm0, xmm1, xmm2
G_M12793_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test5(float,float,float):float
G_M12792_IG01:
vzeroupper
G_M12792_IG02:
vfmsub213ss xmm0, xmm1, xmm2
G_M12792_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test6(float,float,float):float
G_M12795_IG01:
vzeroupper
G_M12795_IG02:
vfnmsub213ss xmm0, xmm1, xmm2
G_M12795_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test7(float,float,float):float
G_M12794_IG01:
vzeroupper
G_M12794_IG02:
vfnmsub213ss xmm0, xmm1, xmm2
G_M12794_IG03:
ret
; Total bytes of code: 9
; Method FmaFTests:Test8(float,float,float):float
G_M12789_IG01:
vzeroupper
G_M12789_IG02:
vfmsub213ss xmm0, xmm1, xmm2
G_M12789_IG03:
ret
; Total bytes of code: 9
Diff.
/cc @tannergooding