Optimize write with as_const_str for shorter code by nyurik · Pull Request #122059 · rust-lang/rust (original) (raw)

Optimize write with as_const_str for shorter code

Following up on rust-lang#121001

Apparently this code generates a significant code block for each call to `write()` with a non-simple formatting string — approximately 100 lines of assembly code, possibly due to `dyn` (?). See the generated assembly code here:

Details

This is the inlining of write!(buffer, "Iteration {value} was written")

core::fmt::Write::write_fmt:
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 194
        fn write_fmt(&mut self, args: Arguments<'_>) -> Result {
    push r15
    push r14
    push r13
    push r12
    push rbx
    mov rdx, rsi
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 427
        match (self.pieces, self.args) {
    mov rcx, qword ptr [rsi + 8]
    mov rax, qword ptr [rsi + 24]
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 428
        ([], []) => Some(""),
    cmp rcx, 1
    je .LBB0_8
    test rcx, rcx
    jne .LBB0_9
    test rax, rax
    jne .LBB0_9
        // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
        self.buf.reserve(self.len, additional);
    lea r12, [rdi + 16]
    lea rsi, [rip + .L__unnamed_2]
    xor ebx, ebx
.LBB0_6:
    mov r14, qword ptr [r12]
    jmp .LBB0_7
.LBB0_8:
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
        ([s], []) => Some(s),
    test rax, rax
    je .LBB0_4
.LBB0_9:
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 1108
        if let Some(s) = args.as_str() { output.write_str(s) } else { write_internal(output, args) }
    lea rsi, [rip + .L__unnamed_1]
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    jmp qword ptr [rip + core::fmt::write_internal@GOTPCREL]
.LBB0_4:
    mov rax, qword ptr [rdx]
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 429
        ([s], []) => Some(s),
    mov rsi, qword ptr [rax]
    mov rbx, qword ptr [rax + 8]
        // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 248
        if T::IS_ZST { usize::MAX } else { self.cap.0 }
    mov rax, qword ptr [rdi]
        // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
        self.buf.reserve(self.len, additional);
    mov r14, qword ptr [rdi + 16]
        // /home/nyurik/dev/rust/rust/library/core/src/num/mod.rs : 1281
        uint_impl! {
    sub rax, r14
        // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 392
        additional > self.capacity().wrapping_sub(len)
    cmp rax, rbx
        // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 309
        if self.needs_to_grow(len, additional) {
    jb .LBB0_5
.LBB0_7:
    mov rax, qword ptr [rdi + 8]
        // /home/nyurik/dev/rust/rust/library/core/src/ptr/mut_ptr.rs : 1046
        unsafe { intrinsics::offset(self, count) }
    add rax, r14
    mov r15, rdi
        // /home/nyurik/dev/rust/rust/library/core/src/intrinsics.rs : 2922
        copy_nonoverlapping(src, dst, count)
    mov rdi, rax
    mov rdx, rbx
    call qword ptr [rip + memcpy@GOTPCREL]
        // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 2040
        self.len += count;
    add r14, rbx
    mov qword ptr [r15 + 16], r14
        // /home/nyurik/dev/rust/rust/library/core/src/fmt/mod.rs : 216
        }
    xor eax, eax
    pop rbx
    pop r12
    pop r13
    pop r14
    pop r15
    ret
.LBB0_5:
        // /home/nyurik/dev/rust/rust/library/alloc/src/vec/mod.rs : 911
        self.buf.reserve(self.len, additional);
    lea r12, [rdi + 16]
    mov r15, rdi
    mov r13, rsi
        // /home/nyurik/dev/rust/rust/library/alloc/src/raw_vec.rs : 310
        do_reserve_and_handle(self, len, additional);
    mov rsi, r14
    mov rdx, rbx
    call alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle
    mov rsi, r13
    mov rdi, r15
    jmp .LBB0_6

/// Writes the formatted `args` into `output`.
///
/// When `args.as_str()` yields a string, that string is handed directly to
/// `output.write_str`; otherwise the work is delegated to `write_internal`.
#[inline]
pub fn write(output: &mut dyn Write, args: Arguments<'_>) -> Result {
    match args.as_str() {
        // A plain string is available: emit it without the formatting machinery.
        Some(literal) => output.write_str(literal),
        // No plain string: fall back to the general formatting path.
        None => write_internal(output, args),
    }
}

So, this brings back the older experiment, in which I used an `if core::intrinsics::is_val_statically_known(s.is_some()) { s } else { None }` helper function and called it in the multiple places that used `write`. This is not as optimal, because now every user of `write` must perform this logic, but at least it results in significantly smaller assembly code for the formatting case, and in code identical to the current output for the "simple" (no formatting) case. See the assembly comparison of the current state with what this change brings (focus only on the `fmt/intel-lib.txt` and `str/intel-lib.txt` files).

               if let Some(s) = args.as_const_str() {
                    self.write_str(s)
                } else {
                    write(self, args)
                }