16 bit multiplication always produces zero result (original) (raw)

Another one from the swift team. I'm surprised no one has found this yet.

@llvm.umul.with.overflow.i16(i16 , i16 ) seems to be lowering to invalid assembly, specifically it ends up moving the two 16 bit values into the top two bytes of 32 bit numbers then using __mulsi3 to multiply them together and put the result into a 32 bit number, where it takes the top two bytes. This will always produce zero.

In my opinion if it's using __mulsi3, it should be putting two 16 bit numbers into the bottom two bytes of each input and taking the bottom two bytes of the output, then any non zero value in the top two bytes after multiplication should be interpreted as an overflow and the flag set accordingly.

Here's the llvm ir in a test case as a patch to llvm...

diff --git a/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll b/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll
new file mode 100644
index 00000000000..12c4030f943
--- /dev/null
+++ b/test/CodeGen/AVR/umul.with.overflow.i16-bug.ll
@@ -0,0 +1,39 @@
+; RUN: llc -O1 < %s -march=avr | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9"
+
+%Vs6UInt16 = type <{ i16 }>
+%Sb = type <{ i1 }>
+
+define hidden void @_TF4main13setServoAngleFT5angleVs6UInt16_T_(i16) #0 {
+entry:
+  %adjustedAngle = alloca %Vs6UInt16, align 2
+  %1 = bitcast %Vs6UInt16* %adjustedAngle to i8*
+  %adjustedAngle._value = getelementptr inbounds %Vs6UInt16, %Vs6UInt16* %adjustedAngle, i32 0, i32 0
+  store i16 %0, i16* %adjustedAngle._value, align 2
+
+;print(unsignedInt: adjustedAngle &* UInt16(11))
+; breaks here
+  %adjustedAngle._value2 = getelementptr inbounds %Vs6UInt16, %Vs6UInt16* %adjustedAngle, i32 0, i32 0
+  %2 = load i16, i16* %adjustedAngle._value2, align 2
+
+  %3 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %2, i16 11)
+  %4 = extractvalue { i16, i1 } %3, 0
+  %5 = extractvalue { i16, i1 } %3, 1
+
+  ; above code looks fine, how is it lowered?
+  %6 = call i1 @_TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_()
+  call void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16 %4, i1 %6)
+
+  ret void
+}
+
+declare void @_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_(i16, i1) #0
+declare i1 @_TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_() #0
+
+; Function Attrs: nounwind readnone speculatable
+declare { i16, i1 } @llvm.umul.with.overflow.i16(i16, i16) #2
+
+attributes #0 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "target-cpu"="core2" "target-features"="+ssse3,+cx16,+fxsr,+mmx,+x87,+sse,+sse2,+sse3" }
+attributes #2 = { nounwind readnone speculatable }

(Note that I haven't yet put in the FileCheck directives because I haven't decided/worked out what the compiler should be doing here yet!)

FYI the original source, with debug statements in was something like:

func setServoAngle(angle: UInt16) {
  var adjustedAngle: UInt16 = angle
  print(unsignedInt: adjustedAngle &* UInt16(11))
}

This produces the following assembly...

    .text
    .file	"<stdin>"
    .hidden	_TF4main13setServoAngleFT5angleVs6UInt16_T_ ; -- Begin function _TF4main13setServoAngleFT5angleVs6UInt16_T_
    .globl	_TF4main13setServoAngleFT5angleVs6UInt16_T_
    .p2align	1
    .type	_TF4main13setServoAngleFT5angleVs6UInt16_T_,@function
_TF4main13setServoAngleFT5angleVs6UInt16_T_: ; @_TF4main13setServoAngleFT5angleVs6UInt16_T_
; %bb.0:                                ; %entry
    push	r28
    push	r29
    push	r16
    push	r17
    in	r28, 61
    in	r29, 62
    sbiw	r28, 2
    in	r0, 63
    cli
    out	62, r29
    out	63, r0
    out	61, r28
    std	Y+1, r24
    std	Y+2, r25
    ldi	r20, 11
    ldi	r21, 0
    ldi	r18, 0
    ldi	r19, 0
    mov	r22, r18
    mov	r23, r19
    call	__mulsi3
    mov	r16, r24
    mov	r17, r25
    call	_TIF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_A0_
    mov	r22, r24
    mov	r24, r16
    mov	r25, r17
    call	_TF3AVR5printFT11unsignedIntVs6UInt1610addNewlineSb_T_
    adiw	r28, 2
    in	r0, 63
    cli
    out	62, r29
    out	63, r0
    out	61, r28
    pop	r17
    pop	r16
    pop	r29
    pop	r28
    ret
.Lfunc_end0:
    .size	_TF4main13setServoAngleFT5angleVs6UInt16_T_, .Lfunc_end0-_TF4main13setServoAngleFT5angleVs6UInt16_T_
                                        ; -- End function

From what I know, __mulsi3 takes the 32 bit value in r18-r21, multiplies it by the 32 bit value in r22-r25 and stores the result in the 32 bit value r22-r25. (I'm not sure how it detects overflow.)

It should be multiplying the input value, e.g. 90 by 11 then printing out the result. Instead it always prints 0.

As I read this assembly, it's moving both input values into the top two bytes of 32 bit numbers, which looks broken. Either it should move them to the bottom two bytes or use a different function.

I'd love to investigate this but I'll need to get some pointers from people where this is all happening. I couldn't even find mulsi3 by grepping through llvm source code. No idea how this is made!