-
Notifications
You must be signed in to change notification settings - Fork 51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Smol option #75
Smol option #75
Conversation
A small assembly dive. b788a8e — <compact_str::CompactStr as core::convert::From<&str>>::from:
push r15
push r14
push rsi
push rdi
push rbx
sub rsp, 64
mov rsi, rcx
test r8, r8
je .LBB30_1
mov rdi, r8
mov r14, rdx
cmp r8, 25
jae .LBB30_3
xorps xmm0, xmm0
movaps xmmword, ptr, [rsp, +, 32], xmm0
mov qword, ptr, [rsp, +, 47], 0
mov eax, edi
or al, -64
mov byte, ptr, [rsp, +, 55], al
lea rcx, [rsp, +, 32]
mov rdx, r14
mov r8, rdi
call memcpy
mov rbx, qword, ptr, [rsp, +, 32]
mov rdi, qword, ptr, [rsp, +, 40]
mov r15, qword, ptr, [rsp, +, 48]
jmp .LBB30_6
.LBB30_1:
movabs r15, -4611686018427387904
xor edi, edi
xor ebx, ebx
jmp .LBB30_6
.LBB30_3:
mov edx, 1
mov rcx, rdi
call __rust_alloc
test rax, rax
je .LBB30_7
mov rbx, rax
movabs r15, -72057594037927936
or r15, rdi
mov rcx, rax
mov rdx, r14
mov r8, rdi
call memcpy
.LBB30_6:
mov qword, ptr, [rsi], rbx
mov qword, ptr, [rsi, +, 8], rdi
mov qword, ptr, [rsi, +, 16], r15
mov rax, rsi
add rsp, 64
pop rbx
pop rdi
pop rsi
pop r14
pop r15
ret
.LBB30_7:
mov edx, 1
mov rcx, rdi
call alloc::alloc::handle_alloc_error
ud2

c2159a7 — <compact_str::CompactStr as core::convert::From<&str>>::from:
push r15
push r14
push rsi
push rdi
push rbx
sub rsp, 64
mov rsi, rcx
test r8, r8
je .LBB30_1
mov rdi, r8
mov r14, rdx
cmp r8, 25
jae .LBB30_3
xorps xmm0, xmm0
movaps xmmword, ptr, [rsp, +, 32], xmm0
mov qword, ptr, [rsp, +, 47], 0
mov eax, edi
or al, -64
mov byte, ptr, [rsp, +, 55], al
lea rcx, [rsp, +, 32]
mov rdx, r14
mov r8, rdi
call memcpy
mov r15, qword, ptr, [rsp, +, 32]
mov rdi, qword, ptr, [rsp, +, 40]
mov rbx, qword, ptr, [rsp, +, 48]
jmp .LBB30_6
.LBB30_1:
movabs rbx, -4611686018427387904
xor edi, edi
xor r15d, r15d
jmp .LBB30_6
.LBB30_3:
mov edx, 1
mov rcx, rdi
call __rust_alloc
test rax, rax
je .LBB30_7
mov r15, rax
movabs rax, 72057594037927935
and rax, rdi
movabs rbx, -144115188075855872
or rbx, rax
mov rcx, r15
mov rdx, r14
mov r8, rdi
call memcpy
.LBB30_6:
mov qword, ptr, [rsi], r15
mov qword, ptr, [rsi, +, 8], rdi
mov qword, ptr, [rsi, +, 16], rbx
mov rax, rsi
add rsp, 64
pop rbx
pop rdi
pop rsi
pop r14
pop r15
ret
.LBB30_7:
mov edx, 1
mov rcx, rdi
call alloc::alloc::handle_alloc_error
ud2

diff — <compact_str::CompactStr as core::convert::From<&str>>::from:
push r15
push r14
push rsi
push rdi
push rbx
sub rsp, 64
mov rsi, rcx
test r8, r8
je .LBB30_1
mov rdi, r8
mov r14, rdx
cmp r8, 25
jae .LBB30_3
xorps xmm0, xmm0
movaps xmmword, ptr, [rsp, +, 32], xmm0
mov qword, ptr, [rsp, +, 47], 0
mov eax, edi
or al, -64
mov byte, ptr, [rsp, +, 55], al
lea rcx, [rsp, +, 32]
mov rdx, r14
mov r8, rdi
call memcpy
-mov rbx, qword, ptr, [rsp, +, 32]
+mov r15, qword, ptr, [rsp, +, 32]
mov rdi, qword, ptr, [rsp, +, 40]
-mov r15, qword, ptr, [rsp, +, 48]
+mov rbx, qword, ptr, [rsp, +, 48]
jmp .LBB30_6
.LBB30_1:
-movabs r15, -4611686018427387904
+movabs rbx, -4611686018427387904
xor edi, edi
-xor ebx, ebx
+xor r15d, r15d
jmp .LBB30_6
; just some register shuffling; bx and r15 are swapped
.LBB30_3:
mov edx, 1
mov rcx, rdi
call __rust_alloc
test rax, rax
je .LBB30_7
-mov rbx, rax
+mov r15, rax
-movabs r15, -72057594037927936
+movabs rax, 72057594037927935
+and rax, rdi
+movabs rbx, -144115188075855872
-or r15, rdi
+or rbx, rax
-mov rcx, rax
+mov rcx, r15
mov rdx, r14
mov r8, rdi
call memcpy
; this is the only real difference; how is it so notable!?
; note: bx and r15 are swapped between the two
.LBB30_6:
-mov qword, ptr, [rsi], rbx
+mov qword, ptr, [rsi], r15
mov qword, ptr, [rsi, +, 8], rdi
-mov qword, ptr, [rsi, +, 16], r15
+mov qword, ptr, [rsi, +, 16], rbx
mov rax, rsi
add rsp, 64
pop rbx
pop rdi
pop rsi
pop r14
pop r15
ret
.LBB30_7:
mov edx, 1
mov rcx, rdi
call alloc::alloc::handle_alloc_error
ud2

I am now more confused. The meaningful difference is in the allocating path, and the huge difference is being seen in the non-allocating path? |
Continuing from #19, you're totally right about that. [NOTE: inline code spans appear to have been dropped from this sentence in extraction.] Looking at the assembly diff, I'm also confused. Total shot in the dark, maybe it's related? |
Marking a note for myself: try out https://github.com/bheisler/iai to measure inst-count |
At this point, I'm suspecting that the perf volatility is due to the effects of machine code layout or similar, and isn't something directly caused by code changes. An inst-count benchmark would be able to clarify if this is the case (But setting up Iai is difficult when I run Windows...) However, this latest push was able to decrease the measured impact from ~500% to ~40%, so it's less bad at least? Also, it avoids ptr<->int transmutes, which are very sketchy at best, and outright problematic at worst. |
I got WSL2 running and Iai in WSL2, so we now have inst-count results!
So the perf difference is "real" |
For comparison:
I think this is reasonable overhead, so it's just up to you to make a decision on whether this is good enough and how to handle msrv. (I believe you can backfill const transmute with union punning?) |
I ran the iai tests again on the latest nightly (0677edc86 2022-03-31), so here are the best(?) measurements I can offer: 47975c0
d5ee4d6
The largest inst-count percentage change is This is well in the "compiler optimizations are unreliable at an inst level" realm. I think this is a reasonable cost, now that the larger regression is fixed, and basically comes down to "how good is your optimizer exactly". |
This makes the MSRV 1.46 for const mem::transmute. (I tried using unions, but IIRC that didn't work in const context at that version. EDIT: ah right, it's 1.56 for use in const fn.) [NOTE: inline code spans appear to have been dropped from this sentence in extraction.] |
Rebased for latest changes. Perf -> 971a4dc
cargo bench (81c43b0 )
cargo bench (971a4dc)
Conclusion: these benchmarks are noisy. Trying to pull any conclusions from them as-is is very difficult.
|
For comparison's sake, I did a small mechanical change to diff --git a/compact_str/src/repr/mod.rs b/compact_str/src/repr/mod.rs
index 69ebab2..53dc4b8 100644
--- a/compact_str/src/repr/mod.rs
+++ b/compact_str/src/repr/mod.rs
@@ -1,8 +1,11 @@
use std::borrow::Cow;
-use std::fmt;
use std::iter::Extend;
use std::mem::ManuallyDrop;
use std::str::Utf8Error;
+use std::{
+ fmt,
+ mem,
+};
#[cfg(feature = "bytes")]
mod bytes;
@@ -41,13 +44,9 @@ pub(crate) struct Repr(
// First in the representation is a pointer, which is a pointer here to
// ensure that it carries provenance properly;
*const (),
- // second is a usize worth of data;
- usize,
- // and third comes another usize of data, but we break it into parts...
- #[cfg(target_pointer_width = "64")] u32,
- u16,
- u8,
- // ...such that we can mark the last byte as nonmax.
+ // second is padding to size...
+ [u8; PADDING_SIZE - mem::size_of::<*const ()>()],
+ // ...and the last byte as nonmax.
NonMaxU8,
); And got the following performance delta to without the change: criterion output
|
@CAD97, thanks for continuously staying on top of this! I plan to take a closer look at everything this weekend, and probs merge something to make progress here |
Completed in #105 |
#22 but up to date with git main. (Missing some extra tests from #22, though.)
Attempts to avoid the perf cliff seen by #22 by dealing almost exclusively in "ReprNiche", and only using "ReprUnion" as a view into "ReprNiche". (👏 on figuring out how to use UTF-8 to get a full 24 bytes inline, btw!)