Improve Math(F).FusedMultiplyAdd codegen by EgorBo · Pull Request #27060 · dotnet/coreclr

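Fixes https://github.com/dotnet/coreclr/issues/25829. Currently, `Math(F).FusedMultiplyAdd` always emits `vfmadd213ss`/`vfmadd213sd`, and every negated operand costs an extra constant load plus `vxorps` to flip its sign. With this change the negations are folded into the instruction choice instead (`vfnmadd213`, `vfmsub213`, `vfnmsub213`), so no sign-flip code is emitted.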

Test cases:

```csharp
static float Test1(float a, float b, float c) => MathF.FusedMultiplyAdd( a,  b,  c);
static float Test2(float a, float b, float c) => MathF.FusedMultiplyAdd( a, -b,  c);
static float Test3(float a, float b, float c) => MathF.FusedMultiplyAdd(-a,  b,  c);
static float Test4(float a, float b, float c) => MathF.FusedMultiplyAdd(-a, -b,  c);
static float Test5(float a, float b, float c) => MathF.FusedMultiplyAdd( a,  b, -c);
static float Test6(float a, float b, float c) => MathF.FusedMultiplyAdd( a, -b, -c);
static float Test7(float a, float b, float c) => MathF.FusedMultiplyAdd(-a,  b, -c);
static float Test8(float a, float b, float c) => MathF.FusedMultiplyAdd(-a, -b, -c);
```

Was:

```asm
; Method FmaFTests:Test1(float,float,float):float
G_M46841_IG01:
       vzeroupper
G_M46841_IG02:
       vfmadd213ss xmm0, xmm1, xmm2
G_M46841_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test2(float,float,float):float
G_M46842_IG01:
       vzeroupper
G_M46842_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm1, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46842_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 21
```

```asm
; Method FmaFTests:Test3(float,float,float):float
G_M46843_IG01:
       vzeroupper
G_M46843_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm0, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46843_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 21
```

```asm
; Method FmaFTests:Test4(float,float,float):float
G_M46844_IG01:
       vzeroupper
G_M46844_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm0, xmm3
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm1, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46844_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 33
```

```asm
; Method FmaFTests:Test5(float,float,float):float
G_M46845_IG01:
       vzeroupper
G_M46845_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm2, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46845_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 21
```

```asm
; Method FmaFTests:Test6(float,float,float):float
G_M46846_IG01:
       vzeroupper
G_M46846_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm1, xmm3
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm2, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46846_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 33
```

```asm
; Method FmaFTests:Test7(float,float,float):float
G_M46847_IG01:
       vzeroupper
G_M46847_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm0, xmm3
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm2, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46847_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 33
```

```asm
; Method FmaFTests:Test8(float,float,float):float
G_M46832_IG01:
       vzeroupper
G_M46832_IG02:
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm0, xmm3
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm1, xmm3
       vmovss   xmm3, dword ptr [reloc @RWD00]
       vxorps   xmm2, xmm3
       vfmadd213ss xmm0, xmm1, xmm2
G_M46832_IG03:
       ret
RWD00  dd 80000000h
; Total bytes of code: 45
```

Now:

```asm
; Method FmaFTests:Test1(float,float,float):float
G_M12796_IG01:
       vzeroupper
G_M12796_IG02:
       vfmadd213ss xmm0, xmm1, xmm2
G_M12796_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test2(float,float,float):float
G_M12799_IG01:
       vzeroupper
G_M12799_IG02:
       vfnmadd213ss xmm0, xmm1, xmm2
G_M12799_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test3(float,float,float):float
G_M12798_IG01:
       vzeroupper
G_M12798_IG02:
       vfnmadd213ss xmm0, xmm1, xmm2
G_M12798_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test4(float,float,float):float
G_M12793_IG01:
       vzeroupper
G_M12793_IG02:
       vfmadd213ss xmm0, xmm1, xmm2
G_M12793_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test5(float,float,float):float
G_M12792_IG01:
       vzeroupper
G_M12792_IG02:
       vfmsub213ss xmm0, xmm1, xmm2
G_M12792_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test6(float,float,float):float
G_M12795_IG01:
       vzeroupper
G_M12795_IG02:
       vfnmsub213ss xmm0, xmm1, xmm2
G_M12795_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test7(float,float,float):float
G_M12794_IG01:
       vzeroupper
G_M12794_IG02:
       vfnmsub213ss xmm0, xmm1, xmm2
G_M12794_IG03:
       ret
; Total bytes of code: 9
```

```asm
; Method FmaFTests:Test8(float,float,float):float
G_M12789_IG01:
       vzeroupper
G_M12789_IG02:
       vfmsub213ss xmm0, xmm1, xmm2
G_M12789_IG03:
       ret
; Total bytes of code: 9
```

Diff.
/cc @tannergooding