Use ScalarPair for tagged enums by nox · Pull Request #49420 · rust-lang/rust (original) (raw)

nox mentioned this pull request

Mar 30, 2018

nox mentioned this pull request

Apr 14, 2018

bors added S-waiting-on-bors

Status: Waiting on bors to run and complete tests. Bors will change the label on completion.

and removed S-waiting-on-review

Status: Awaiting review from the assignee but also interested parties.

labels

Apr 26, 2018

bors added S-waiting-on-review

Status: Awaiting review from the assignee but also interested parties.

and removed S-waiting-on-bors

Status: Waiting on bors to run and complete tests. Bors will change the label on completion.

labels

Apr 26, 2018

bors added S-waiting-on-bors

Status: Waiting on bors to run and complete tests. Bors will change the label on completion.

and removed S-waiting-on-review

Status: Awaiting review from the assignee but also interested parties.

labels

Apr 26, 2018

bors added a commit that referenced this pull request

Apr 27, 2018

Use ScalarPair for tagged enums

nox deleted the enum-scalarpair branch

April 28, 2018 10:53

This was referenced

Jul 3, 2018

kennytm added a commit to kennytm/rust that referenced this pull request

Jul 21, 2018

mem::swap the obvious way for types smaller than the SIMD optimization's block size

In some cases LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail, so swapping small types the obvious way helps SROA work in cases where it currently doesn't. This was found in the `replace_with` RFC discussion.

Examples of the improvements:

Swapping `[u16; 3]` takes 1/3 fewer instructions and needs no stack allocation

/// Three-element `u16` array (six bytes) used as the payload for the swap example.
type Demo = [u16; 3];
/// Exchanges the contents of `x` and `y` in place by delegating to `std::mem::swap`.
///
/// This is the reproducer whose generated code is compared in the listings below.
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
    std::mem::swap(x, y);
}

nightly:

_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
    sub	rsp, 32
    .seh_stackalloc 32
    .seh_endprologue
    movzx	eax, word ptr [rcx + 4]
    mov	word ptr [rsp + 4], ax
    mov	eax, dword ptr [rcx]
    mov	dword ptr [rsp], eax
    movzx	eax, word ptr [rdx + 4]
    mov	word ptr [rcx + 4], ax
    mov	eax, dword ptr [rdx]
    mov	dword ptr [rcx], eax
    movzx	eax, word ptr [rsp + 4]
    mov	word ptr [rdx + 4], ax
    mov	eax, dword ptr [rsp]
    mov	dword ptr [rdx], eax
    add	rsp, 32
    ret
    .seh_handlerdata
    .section	.text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
    .seh_endproc

this PR:

_ZN4blah9swap_demo17ha1732a9b71393a7eE:
    mov	r8d, dword ptr [rcx]
    movzx	r9d, word ptr [rcx + 4]
    movzx	eax, word ptr [rdx + 4]
    mov	word ptr [rcx + 4], ax
    mov	eax, dword ptr [rdx]
    mov	dword ptr [rcx], eax
    mov	word ptr [rdx + 4], r9w
    mov	dword ptr [rdx], r8d
    ret

`replace_with` optimizes down much better

Inspired by rust-lang/rfcs#2490,

/// Replaces the value behind `x` with the result of calling `f` on the current value.
///
/// The current value is moved out with `Option::take`, which leaves `None` in
/// `*x` while `f` runs; whatever `f` returns is then written back through `x`.
fn replace_with<T, F>(x: &mut Option<T>, f: F)
    where F: FnOnce(Option<T>) -> Option<T>
{
    *x = f(x.take());
}

/// Adds 1 to the `i32` held in `x` when it is `Some`; a `None` is left as `None`.
pub fn inc_opt(mut x: &mut Option<i32>) {
    // `&mut x` borrows the reference itself again, which is why the parameter
    // is bound as `mut x`.
    // NOTE(review): passing `x` directly would presumably be equivalent — confirm.
    replace_with(&mut x, |i| i.map(|j| j + 1));
}

Rust 1.26.0:

_ZN4blah7inc_opt17heb0acb64c51777cfE:
    mov	rax, qword ptr [rcx]
    movabs	r8, 4294967296
    add	r8, rax
    shl	rax, 32
    movabs	rdx, -4294967296
    and	rdx, r8
    xor	r8d, r8d
    test	rax, rax
    cmove	rdx, rax
    setne	r8b
    or	rdx, r8
    mov	qword ptr [rcx], rdx
    ret

Nightly (better thanks to ScalarPair, maybe?):

_ZN4blah7inc_opt17h66df690be0b5899dE:
    mov	r8, qword ptr [rcx]
    mov	rdx, r8
    shr	rdx, 32
    xor	eax, eax
    test	r8d, r8d
    setne	al
    add	edx, 1
    mov	dword ptr [rcx], eax
    mov	dword ptr [rcx + 4], edx
    ret

This PR:

_ZN4blah7inc_opt17h1426dc215ecbdb19E:
    xor	eax, eax
    cmp	dword ptr [rcx], 0
    setne	al
    mov	dword ptr [rcx], eax
    add	dword ptr [rcx + 4], 1
    ret

That `add` is beautiful -- it uses a memory-operand addressing mode, so the value never needs to go through a register explicitly -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).

kennytm added a commit to kennytm/rust that referenced this pull request

Jul 22, 2018

mem::swap the obvious way for types smaller than the SIMD optimization's block size

Examples of the improvements:

Swapping `[u16; 3]` takes 1/3 fewer instructions and needs no stack allocation

/// Three-element `u16` array (six bytes) used as the payload for the swap example.
type Demo = [u16; 3];
/// Exchanges the contents of `x` and `y` in place by delegating to `std::mem::swap`.
///
/// This is the reproducer whose generated code is compared in the listings below.
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
    std::mem::swap(x, y);
}

nightly:

_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
    sub	rsp, 32
    .seh_stackalloc 32
    .seh_endprologue
    movzx	eax, word ptr [rcx + 4]
    mov	word ptr [rsp + 4], ax
    mov	eax, dword ptr [rcx]
    mov	dword ptr [rsp], eax
    movzx	eax, word ptr [rdx + 4]
    mov	word ptr [rcx + 4], ax
    mov	eax, dword ptr [rdx]
    mov	dword ptr [rcx], eax
    movzx	eax, word ptr [rsp + 4]
    mov	word ptr [rdx + 4], ax
    mov	eax, dword ptr [rsp]
    mov	dword ptr [rdx], eax
    add	rsp, 32
    ret
    .seh_handlerdata
    .section	.text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
    .seh_endproc

this PR:

_ZN4blah9swap_demo17ha1732a9b71393a7eE:
    mov	r8d, dword ptr [rcx]
    movzx	r9d, word ptr [rcx + 4]
    movzx	eax, word ptr [rdx + 4]
    mov	word ptr [rcx + 4], ax
    mov	eax, dword ptr [rdx]
    mov	dword ptr [rcx], eax
    mov	word ptr [rdx + 4], r9w
    mov	dword ptr [rdx], r8d
    ret

`replace_with` optimizes down much better

Inspired by rust-lang/rfcs#2490,

/// Replaces the value behind `x` with the result of calling `f` on the current value.
///
/// The current value is moved out with `Option::take`, which leaves `None` in
/// `*x` while `f` runs; whatever `f` returns is then written back through `x`.
fn replace_with<T, F>(x: &mut Option<T>, f: F)
    where F: FnOnce(Option<T>) -> Option<T>
{
    *x = f(x.take());
}

/// Adds 1 to the `i32` held in `x` when it is `Some`; a `None` is left as `None`.
pub fn inc_opt(mut x: &mut Option<i32>) {
    // `&mut x` borrows the reference itself again, which is why the parameter
    // is bound as `mut x`.
    // NOTE(review): passing `x` directly would presumably be equivalent — confirm.
    replace_with(&mut x, |i| i.map(|j| j + 1));
}

Rust 1.26.0:

_ZN4blah7inc_opt17heb0acb64c51777cfE:
    mov	rax, qword ptr [rcx]
    movabs	r8, 4294967296
    add	r8, rax
    shl	rax, 32
    movabs	rdx, -4294967296
    and	rdx, r8
    xor	r8d, r8d
    test	rax, rax
    cmove	rdx, rax
    setne	r8b
    or	rdx, r8
    mov	qword ptr [rcx], rdx
    ret

Nightly (better thanks to ScalarPair, maybe?):

_ZN4blah7inc_opt17h66df690be0b5899dE:
    mov	r8, qword ptr [rcx]
    mov	rdx, r8
    shr	rdx, 32
    xor	eax, eax
    test	r8d, r8d
    setne	al
    add	edx, 1
    mov	dword ptr [rcx], eax
    mov	dword ptr [rcx + 4], edx
    ret

This PR:

_ZN4blah7inc_opt17h1426dc215ecbdb19E:
    xor	eax, eax
    cmp	dword ptr [rcx], 0
    setne	al
    mov	dword ptr [rcx], eax
    add	dword ptr [rcx + 4], 1
    ret

That `add` is beautiful -- it uses a memory-operand addressing mode, so the value never needs to go through a register explicitly -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).

This was referenced

Apr 16, 2019

lqd mentioned this pull request

Jul 17, 2019