f128
symbols on powerpc64 give inaccurate results · Issue #125109 · rust-lang/rust (original) (raw)
For some reason, the `__addkf3` symbol linked by rustc says that `0x00000000000000000000000000000000 + 0x00000000000000000000000000000001 = 0x00000000000000000000000000000000`, when the answer should be `0x00000000000000000000000000000001`. Other symbols likely do the wrong thing here too.
Example:
$ powerpc64-linux-gnu-gcc add_test.c -o add_test.c.ppc64
$ rustc add_test.rs -o add_test.rs.ppc64 --target powerpc64-unknown-linux-gnu -Clinker=powerpc64-linux-gnu-gcc
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64 $ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.c.ppc64
0000000000000000000000000000000000 0.000000
0000000000000000000000000000000001 0.000000
0000000000000000000000000000000001 0.000000
$ qemu-ppc64 -L /usr/powerpc64-linux-gnu/ add_test.rs.ppc64
[add_test.rs:12:5] a = 0x00000000000000000000000000000000
[add_test.rs:12:5] b = 0x00000000000000000000000000000001
[add_test.rs:14:5] c = 0x00000000000000000000000000000000
Our Rust code emits `__addkf3`, which calls the hardware f128 addition routine `xsaddqp`. The part I cannot explain is that the C version also emits `__addkf3`, which also calls `xsaddqp`, but somehow produces a correct result. I checked with GDB to make sure they are both hitting `xsaddqp`, so I can't really explain the discrepancy.
Original notes at rust-lang/compiler-builtins#606 (comment), discussion on Zulip at https://rust-lang.zulipchat.com/#narrow/stream/122651-general/topic/f128.20system.20libraries.20noncompliant.20platforms/near/438486364.
Full code copied from rust-lang/compiler-builtins#606 (comment):
Rust code
#![feature(f128)]
#[no_mangle] #[inline(never)] fn add_entry(a: f128, b: f128) -> f128 { a + b } // Returns a + b. The attributes keep this as a separate, unmangled `add_entry` symbol so the f128 addition can be located in the disassembly.
fn main() { let a = f128::from_bits(0x0); let b = f128::from_bits(0x1); dbg!(a, b); let c = add_entry(a, b); dbg!(c); } // Builds two f128 values from the bit patterns 0x0 and 0x1, prints them, then prints add_entry(a, b). NOTE: dbg! prints this code's file:line:column, so re-laying out this function changes the program's output.
C code
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

/* Spell _Float128 as the compiler's __float128 type throughout this file. */
#define _Float128 __float128

/*
 * The two 64-bit halves of a 128-bit value, declared in memory order so that
 * `upper` is always the most significant half and `lower` the least.
 *
 * Byte order is taken from the compiler's predefined __BYTE_ORDER__ macros
 * when they exist, and only then from the libc __BYTE_ORDER macros. Every
 * comparison is guarded with defined(): an undefined macro evaluates to 0 in
 * #if, so an unguarded `__BYTE_ORDER == __LITTLE_ENDIAN` would silently pick
 * the little-endian layout whenever no header happened to define them.
 *
 * The struct is aligned like _Float128 so both can share storage in a union.
 */
typedef struct {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    uint64_t lower, upper;
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    uint64_t upper, lower;
#elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
    __BYTE_ORDER == __LITTLE_ENDIAN
    uint64_t lower, upper;
#elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
    __BYTE_ORDER == __BIG_ENDIAN
    uint64_t upper, lower;
#else
#error missing endian check
#endif
} __attribute__((aligned(_Alignof(_Float128)))) u128;

/*
 * The addition support symbol under investigation, declared by hand so that
 * add_entry() can call it directly when USE_ADDKF3 is defined.
 */
_Float128 __addkf3(_Float128, _Float128);

/*
 * Print `val` as 32 hex digits of raw bits (upper half first), followed by
 * its value converted to double.
 *
 * The bits are read through a union rather than a pointer cast, which avoids
 * accessing a _Float128 object through an incompatible struct type.
 */
void f128_print(_Float128 val)
{
    union {
        _Float128 value;
        u128 halves;
    } pun = { .value = val };

    printf("%#018" PRIx64 "%016" PRIx64 " %lf\n",
           pun.halves.upper, pun.halves.lower, (double)val);
}

/*
 * Build a _Float128 from its raw bit pattern.
 *
 * upper: most significant 64 bits.
 * lower: least significant 64 bits.
 * Returns the _Float128 whose object representation is those 128 bits.
 */
_Float128 new_f128(uint64_t upper, uint64_t lower)
{
    union {
        _Float128 value;
        u128 halves;
    } pun = { .halves = { .lower = lower, .upper = upper } };

    return pun.value;
}

/*
 * Add two _Float128 values: through an explicit __addkf3() call when
 * USE_ADDKF3 is defined, otherwise through the `+` operator.
 */
_Float128 add_entry(_Float128 a, _Float128 b)
{
#ifdef USE_ADDKF3
    return __addkf3(a, b);
#else
    return a + b;
#endif
}
/*
 * Reproducer entry point: builds the operands with bit patterns 0x0 and 0x1,
 * prints each one, then prints the result of add_entry() on them.
 */
int main()
{
    const _Float128 lhs = new_f128(0x0000000000000000, 0x0000000000000000);
    const _Float128 rhs = new_f128(0x0000000000000000, 0x0000000000000001);

    f128_print(lhs);
    f128_print(rhs);

    const _Float128 sum = add_entry(lhs, rhs);
    f128_print(sum);

    return 0;
}
Assembly generated from Rust (incorrect result)
0000000000010d00 <.add_entry>: 10d00: 7c 08 02 a6 mflr r0 10d04: f8 21 ff 91 stdu r1,-112(r1) 10d08: f8 01 00 80 std r0,128(r1) 10d0c: 4b ff c6 95 bl d3a0 <00000143.plt_call.__addkf3> 10d10: e8 41 00 28 ld r2,40(r1) 10d14: 38 21 00 70 addi r1,r1,112 10d18: e8 01 00 10 ld r0,16(r1) 10d1c: 7c 08 03 a6 mtlr r0 10d20: 4e 80 00 20 blr
000000000000d3a0 <00000143.plt_call.__addkf3>: d3a0: f8 41 00 28 std r2,40(r1) d3a4: 3d 62 ff ff addis r11,r2,-1 d3a8: e9 8b 7f 58 ld r12,32600(r11) d3ac: 7d 89 03 a6 mtctr r12 d3b0: e8 4b 7f 60 ld r2,32608(r11) d3b4: 4e 80 04 20 bctr
0000000000056790 <.__addkf3_resolve>: 56790: 81 2d 8f 9c lwz r9,-28772(r13) 56794: 75 29 00 40 andis. r9,r9,64 56798: 41 82 00 18 beq 567b0 <.__addkf3_resolve+0x20> 5679c: e8 62 80 10 ld r3,-32752(r2) 567a0: 4e 80 00 20 blr 567a4: 60 00 00 00 nop 567a8: 60 00 00 00 nop 567ac: 60 42 00 00 ori r2,r2,0 567b0: e8 62 80 18 ld r3,-32744(r2) 567b4: 4e 80 00 20 blr ... 567c4: 60 00 00 00 nop 567c8: 60 00 00 00 nop 567cc: 60 42 00 00 ori r2,r2,0
000000000005e6e0 <.__addkf3_hw>: 5e6e0: fc 42 18 08 xsaddqp v2,v2,v3 5e6e4: 4e 80 00 20 blr ... 5e6f4: 60 00 00 00 nop 5e6f8: 60 00 00 00 nop 5e6fc: 60 00 00 00 nop
0000000000057050 <.__addkf3_sw>: 57050: fb 41 ff d0 std r26,-48(r1) 57054: fb 61 ff d8 std r27,-40(r1) 57058: fb 81 ff e0 std r28,-32(r1) 5705c: fb a1 ff e8 std r29,-24(r1) 57060: fb c1 ff f0 std r30,-16(r1) 57064: fb e1 ff f8 std r31,-8(r1) 57068: f8 21 ff 41 stdu r1,-192(r1) 5706c: 39 21 00 70 addi r9,r1,112 57070: 39 41 00 70 addi r10,r1,112 // ... full sw implementation
Assembly generated from C (correct result)
0000000010000af8 <.add_entry>: 10000af8: 7c 08 02 a6 mflr r0 10000afc: f8 01 00 10 std r0,16(r1) 10000b00: fb e1 ff f8 std r31,-8(r1) 10000b04: f8 21 ff 81 stdu r1,-128(r1) 10000b08: 7c 3f 0b 78 mr r31,r1 10000b0c: 39 20 00 30 li r9,48 10000b10: 39 5f 00 80 addi r10,r31,128 10000b14: 7c 4a 4f 99 stxvd2x vs34,r10,r9 10000b18: 39 20 00 40 li r9,64 10000b1c: 39 5f 00 80 addi r10,r31,128 10000b20: 7c 6a 4f 99 stxvd2x vs35,r10,r9 10000b24: 39 20 00 40 li r9,64 10000b28: 39 5f 00 80 addi r10,r31,128 10000b2c: 7c 6a 4e 99 lxvd2x vs35,r10,r9 10000b30: 39 20 00 30 li r9,48 10000b34: 39 5f 00 80 addi r10,r31,128 10000b38: 7c 4a 4e 99 lxvd2x vs34,r10,r9 10000b3c: 4b ff fb a5 bl 100006e0 <00000019.plt_call.__addkf3> 10000b40: e8 41 00 28 ld r2,40(r1) 10000b44: f0 02 14 96 xxmr vs0,vs34 10000b48: f0 40 04 91 xxmr vs34,vs0 10000b4c: 38 3f 00 80 addi r1,r31,128 10000b50: e8 01 00 10 ld r0,16(r1) 10000b54: 7c 08 03 a6 mtlr r0 10000b58: eb e1 ff f8 ld r31,-8(r1) 10000b5c: 4e 80 00 20 blr 10000b60: 00 00 00 00 .long 0x0 10000b64: 00 00 00 01 .long 0x1 10000b68: 80 01 00 01 lwz r0,1(r1)
0000000010000c40 <.__addkf3_resolve>: 10000c40: 81 2d 8f 9c lwz r9,-28772(r13) 10000c44: 75 29 00 40 andis. r9,r9,64 10000c48: 41 82 00 18 beq 10000c60 <.__addkf3_resolve+0x20> 10000c4c: e8 62 80 10 ld r3,-32752(r2) 10000c50: 4e 80 00 20 blr 10000c54: 60 00 00 00 nop 10000c58: 60 00 00 00 nop 10000c5c: 60 42 00 00 ori r2,r2,0 10000c60: e8 62 80 18 ld r3,-32744(r2) 10000c64: 4e 80 00 20 blr ... 10000c74: 60 00 00 00 nop 10000c78: 60 00 00 00 nop 10000c7c: 60 42 00 00 ori r2,r2,0
0000000010008b90 <.__addkf3_hw>: 10008b90: fc 42 18 08 xsaddqp v2,v2,v3 10008b94: 4e 80 00 20 blr ... 10008ba4: 60 00 00 00 nop 10008ba8: 60 00 00 00 nop 10008bac: 60 00 00 00 nop
0000000010001500 <.__addkf3_sw>: 10001500: fb 41 ff d0 std r26,-48(r1) 10001504: fb 61 ff d8 std r27,-40(r1) 10001508: fb 81 ff e0 std r28,-32(r1) 1000150c: fb a1 ff e8 std r29,-24(r1) 10001510: fb c1 ff f0 std r30,-16(r1) 10001514: fb e1 ff f8 std r31,-8(r1) 10001518: f8 21 ff 41 stdu r1,-192(r1) 1000151c: 39 21 00 70 addi r9,r1,112 10001520: 39 41 00 70 addi r10,r1,112 // ...