Use ScalarPair for tagged enums by nox · Pull Request #49420 · rust-lang/rust (original) (raw)
nox mentioned this pull request
nox mentioned this pull request
bors added S-waiting-on-bors
Status: Waiting on bors to run and complete tests. Bors will change the label on completion.
and removed S-waiting-on-review
Status: Awaiting review from the assignee but also interested parties.
labels
bors added S-waiting-on-review
Status: Awaiting review from the assignee but also interested parties.
and removed S-waiting-on-bors
Status: Waiting on bors to run and complete tests. Bors will change the label on completion.
labels
bors added S-waiting-on-bors
Status: Waiting on bors to run and complete tests. Bors will change the label on completion.
and removed S-waiting-on-review
Status: Awaiting review from the assignee but also interested parties.
labels
bors added a commit that referenced this pull request
Use ScalarPair for tagged enums
nox deleted the enum-scalarpair branch
This was referenced
Jul 3, 2018
kennytm added a commit to kennytm/rust that referenced this pull request
mem::swap the obvious way for types smaller than the SIMD optimization's block size
LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the replace_with
RFC discussion.
Examples of the improvements:
swapping `[u16; 3]` takes 1/3 fewer instructions and no stackalloc
type Demo = [u16; 3];
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
std::mem::swap(x, y);
}
nightly:
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
sub rsp, 32
.seh_stackalloc 32
.seh_endprologue
movzx eax, word ptr [rcx + 4]
mov word ptr [rsp + 4], ax
mov eax, dword ptr [rcx]
mov dword ptr [rsp], eax
movzx eax, word ptr [rdx + 4]
mov word ptr [rcx + 4], ax
mov eax, dword ptr [rdx]
mov dword ptr [rcx], eax
movzx eax, word ptr [rsp + 4]
mov word ptr [rdx + 4], ax
mov eax, dword ptr [rsp]
mov dword ptr [rdx], eax
add rsp, 32
ret
.seh_handlerdata
.section .text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
.seh_endproc
this PR:
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
mov r8d, dword ptr [rcx]
movzx r9d, word ptr [rcx + 4]
movzx eax, word ptr [rdx + 4]
mov word ptr [rcx + 4], ax
mov eax, dword ptr [rdx]
mov dword ptr [rcx], eax
mov word ptr [rdx + 4], r9w
mov dword ptr [rdx], r8d
ret
`replace_with` optimizes down much better
Inspired by rust-lang/rfcs#2490,
fn replace_with<T, F>(x: &mut Option<T>, f: F)
where F: FnOnce(Option<T>) -> Option<T>
{
*x = f(x.take());
}
pub fn inc_opt(mut x: &mut Option<i32>) {
replace_with(&mut x, |i| i.map(|j| j + 1));
}
Rust 1.26.0:
_ZN4blah7inc_opt17heb0acb64c51777cfE:
mov rax, qword ptr [rcx]
movabs r8, 4294967296
add r8, rax
shl rax, 32
movabs rdx, -4294967296
and rdx, r8
xor r8d, r8d
test rax, rax
cmove rdx, rax
setne r8b
or rdx, r8
mov qword ptr [rcx], rdx
ret
Nightly (better thanks to ScalarPair, maybe?):
_ZN4blah7inc_opt17h66df690be0b5899dE:
mov r8, qword ptr [rcx]
mov rdx, r8
shr rdx, 32
xor eax, eax
test r8d, r8d
setne al
add edx, 1
mov dword ptr [rcx], eax
mov dword ptr [rcx + 4], edx
ret
This PR:
_ZN4blah7inc_opt17h1426dc215ecbdb19E:
xor eax, eax
cmp dword ptr [rcx], 0
setne al
mov dword ptr [rcx], eax
add dword ptr [rcx + 4], 1
ret
Where that add is beautiful -- using an addressing mode to not even need to explicitly go through a register -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).
kennytm added a commit to kennytm/rust that referenced this pull request
mem::swap the obvious way for types smaller than the SIMD optimization's block size
LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the replace_with
RFC discussion.
Examples of the improvements:
swapping `[u16; 3]` takes 1/3 fewer instructions and no stackalloc
type Demo = [u16; 3];
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
std::mem::swap(x, y);
}
nightly:
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
.seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
sub rsp, 32
.seh_stackalloc 32
.seh_endprologue
movzx eax, word ptr [rcx + 4]
mov word ptr [rsp + 4], ax
mov eax, dword ptr [rcx]
mov dword ptr [rsp], eax
movzx eax, word ptr [rdx + 4]
mov word ptr [rcx + 4], ax
mov eax, dword ptr [rdx]
mov dword ptr [rcx], eax
movzx eax, word ptr [rsp + 4]
mov word ptr [rdx + 4], ax
mov eax, dword ptr [rsp]
mov dword ptr [rdx], eax
add rsp, 32
ret
.seh_handlerdata
.section .text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
.seh_endproc
this PR:
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
mov r8d, dword ptr [rcx]
movzx r9d, word ptr [rcx + 4]
movzx eax, word ptr [rdx + 4]
mov word ptr [rcx + 4], ax
mov eax, dword ptr [rdx]
mov dword ptr [rcx], eax
mov word ptr [rdx + 4], r9w
mov dword ptr [rdx], r8d
ret
`replace_with` optimizes down much better
Inspired by rust-lang/rfcs#2490,
fn replace_with<T, F>(x: &mut Option<T>, f: F)
where F: FnOnce(Option<T>) -> Option<T>
{
*x = f(x.take());
}
pub fn inc_opt(mut x: &mut Option<i32>) {
replace_with(&mut x, |i| i.map(|j| j + 1));
}
Rust 1.26.0:
_ZN4blah7inc_opt17heb0acb64c51777cfE:
mov rax, qword ptr [rcx]
movabs r8, 4294967296
add r8, rax
shl rax, 32
movabs rdx, -4294967296
and rdx, r8
xor r8d, r8d
test rax, rax
cmove rdx, rax
setne r8b
or rdx, r8
mov qword ptr [rcx], rdx
ret
Nightly (better thanks to ScalarPair, maybe?):
_ZN4blah7inc_opt17h66df690be0b5899dE:
mov r8, qword ptr [rcx]
mov rdx, r8
shr rdx, 32
xor eax, eax
test r8d, r8d
setne al
add edx, 1
mov dword ptr [rcx], eax
mov dword ptr [rcx + 4], edx
ret
This PR:
_ZN4blah7inc_opt17h1426dc215ecbdb19E:
xor eax, eax
cmp dword ptr [rcx], 0
setne al
mov dword ptr [rcx], eax
add dword ptr [rcx + 4], 1
ret
Where that add is beautiful -- using an addressing mode to not even need to explicitly go through a register -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).
This was referenced
Apr 16, 2019
lqd mentioned this pull request