Commit 32d3276

Auto merge of #83357 - saethlin:vec-reserve-inlining, r=dtolnay
Reduce the impact of Vec::reserve calls that do not cause any allocation

I think a lot of callers expect `Vec::reserve` to be nearly free when no resizing is required, but unfortunately that isn't the case. LLVM makes remarkably poor inlining choices (along the path from `Vec::reserve` to `RawVec::grow_amortized`), so depending on the surrounding context you either get a huge blob of `RawVec`'s resizing logic inlined into some seemingly-unrelated function, or not enough inlining happens and/or the actual check in `needs_to_grow` ends up behind a function call. My goal is to make the codegen for `Vec::reserve` match the mental model that callers seem to have: it's reliably just a `sub cmp ja` if there is already sufficient capacity.

This patch has the following impact on the serde_json benchmarks from https://github.com/serde-rs/json-benchmark/tree/ca3efde8a5b75ff59271539b67452911860248c7, run with `cargo +stage1 run --release -- -n 1024`.

Before:

```
                               DOM                   STRUCT
======= serde_json ======= parse|stringify ===== parse|stringify ====
data/canada.json          340 MB/s  490 MB/s    630 MB/s  370 MB/s
data/citm_catalog.json    460 MB/s  540 MB/s   1010 MB/s  550 MB/s
data/twitter.json         330 MB/s  840 MB/s    640 MB/s  630 MB/s

======= json-rust ======== parse|stringify ===== parse|stringify ====
data/canada.json          580 MB/s  990 MB/s
data/citm_catalog.json    720 MB/s  660 MB/s
data/twitter.json         570 MB/s  960 MB/s
```

After:

```
                               DOM                   STRUCT
======= serde_json ======= parse|stringify ===== parse|stringify ====
data/canada.json          330 MB/s  510 MB/s    610 MB/s  380 MB/s
data/citm_catalog.json    450 MB/s  640 MB/s    970 MB/s  830 MB/s
data/twitter.json         330 MB/s  880 MB/s    670 MB/s  960 MB/s

======= json-rust ======== parse|stringify ===== parse|stringify ====
data/canada.json          560 MB/s  1130 MB/s
data/citm_catalog.json    710 MB/s   880 MB/s
data/twitter.json         530 MB/s  1230 MB/s
```

That's approximately a one-third increase in throughput on two of the benchmarks, and no effect on one (the benchmark suite has enough jitter that I could pick a run with no regressions, so I'm not convinced the apparent regressions are meaningful). This also produces perf increases on the order of 3-5% in a few other microbenchmarks that I'm tracking. It might be useful to see if this has a cascading effect on inlining choices in some large codebases.
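As a rough sketch of the kind of hot loop such microbenchmarks exercise (illustrative only; the capacity, iteration count, and reserve amount are arbitrary, and a real measurement would use a proper benchmark harness), the following program keeps enough spare capacity that every `reserve` call takes the fast path:

```rust
use std::time::Instant;

fn main() {
    // All sizes here are arbitrary; the point is that capacity is always
    // sufficient, so every `reserve` call below hits the fast path.
    let mut v: Vec<u8> = Vec::with_capacity(4096);
    let start = Instant::now();
    for i in 0..10_000_000u32 {
        if v.len() == v.capacity() {
            v.clear();
        }
        // With spare capacity available, this call should be close to free
        // after the change: roughly a load, a sub, a cmp, and a not-taken branch.
        v.reserve(1);
        v.push(i as u8);
    }
    println!("elapsed: {:?}, final len: {}", start.elapsed(), v.len());
}
```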
Compiling this simple program demonstrates the change in codegen that causes the perf impact:

```rust
fn main() {
    reserve(&mut Vec::new());
}

#[inline(never)]
fn reserve(v: &mut Vec<u8>) {
    v.reserve(1234);
}
```

Before:

```asm
00000000000069b0 <scratch::reserve>:
    69b0: 53                    push   %rbx
    69b1: 48 83 ec 30           sub    $0x30,%rsp
    69b5: 48 8b 47 08           mov    0x8(%rdi),%rax
    69b9: 48 8b 4f 10           mov    0x10(%rdi),%rcx
    69bd: 48 89 c2              mov    %rax,%rdx
    69c0: 48 29 ca              sub    %rcx,%rdx
    69c3: 48 81 fa d1 04 00 00  cmp    $0x4d1,%rdx
    69ca: 77 73                 ja     6a3f <scratch::reserve+0x8f>
    69cc: 48 81 c1 d2 04 00 00  add    $0x4d2,%rcx
    69d3: 72 75                 jb     6a4a <scratch::reserve+0x9a>
    69d5: 48 89 fb              mov    %rdi,%rbx
    69d8: 48 8d 14 00           lea    (%rax,%rax,1),%rdx
    69dc: 48 39 ca              cmp    %rcx,%rdx
    69df: 48 0f 47 ca           cmova  %rdx,%rcx
    69e3: 48 83 f9 08           cmp    $0x8,%rcx
    69e7: be 08 00 00 00        mov    $0x8,%esi
    69ec: 48 0f 47 f1           cmova  %rcx,%rsi
    69f0: 48 85 c0              test   %rax,%rax
    69f3: 74 17                 je     6a0c <scratch::reserve+0x5c>
    69f5: 48 8b 0b              mov    (%rbx),%rcx
    69f8: 48 89 0c 24           mov    %rcx,(%rsp)
    69fc: 48 89 44 24 08        mov    %rax,0x8(%rsp)
    6a01: 48 c7 44 24 10 01 00  movq   $0x1,0x10(%rsp)
    6a08: 00 00
    6a0a: eb 08                 jmp    6a14 <scratch::reserve+0x64>
    6a0c: 48 c7 04 24 00 00 00  movq   $0x0,(%rsp)
    6a13: 00
    6a14: 48 8d 7c 24 18        lea    0x18(%rsp),%rdi
    6a19: 48 89 e1              mov    %rsp,%rcx
    6a1c: ba 01 00 00 00        mov    $0x1,%edx
    6a21: e8 9a fe ff ff        call   68c0 <alloc::raw_vec::finish_grow>
    6a26: 48 8b 7c 24 20        mov    0x20(%rsp),%rdi
    6a2b: 48 8b 74 24 28        mov    0x28(%rsp),%rsi
    6a30: 48 83 7c 24 18 01     cmpq   $0x1,0x18(%rsp)
    6a36: 74 0d                 je     6a45 <scratch::reserve+0x95>
    6a38: 48 89 3b              mov    %rdi,(%rbx)
    6a3b: 48 89 73 08           mov    %rsi,0x8(%rbx)
    6a3f: 48 83 c4 30           add    $0x30,%rsp
    6a43: 5b                    pop    %rbx
    6a44: c3                    ret
    6a45: 48 85 f6              test   %rsi,%rsi
    6a48: 75 08                 jne    6a52 <scratch::reserve+0xa2>
    6a4a: ff 15 38 c4 03 00     call   *0x3c438(%rip)        # 42e88 <_GLOBAL_OFFSET_TABLE_+0x490>
    6a50: 0f 0b                 ud2
    6a52: ff 15 f0 c4 03 00     call   *0x3c4f0(%rip)        # 42f48 <_GLOBAL_OFFSET_TABLE_+0x550>
    6a58: 0f 0b                 ud2
    6a5a: 66 0f 1f 44 00 00     nopw   0x0(%rax,%rax,1)
```

After:

```asm
0000000000006910 <scratch::reserve>:
    6910: 48 8b 47 08           mov    0x8(%rdi),%rax
    6914: 48 8b 77 10           mov    0x10(%rdi),%rsi
    6918: 48 29 f0              sub    %rsi,%rax
    691b: 48 3d d1 04 00 00     cmp    $0x4d1,%rax
    6921: 77 05                 ja     6928 <scratch::reserve+0x18>
    6923: e9 e8 fe ff ff        jmp    6810 <alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle>
    6928: c3                    ret
    6929: 0f 1f 80 00 00 00 00  nopl   0x0(%rax)
```
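One way to read the "after" listing: the two `mov`s load the vector's capacity and length, the `sub` computes the spare capacity, and the `cmp $0x4d1` / `ja` pair is the capacity check with the constant folded in (0x4d1 is 1233, i.e. 1234 - 1, so "above" means spare capacity of at least 1234). In Rust terms, the inlined fast path boils down to a check of roughly this shape (a sketch for illustration, not necessarily the exact library code):

```rust
// Sketch of the check that `Vec::reserve(1234)` compiles down to on the fast
// path: grow only if the spare capacity cannot absorb `additional` elements.
fn needs_to_grow(cap: usize, len: usize, additional: usize) -> bool {
    additional > cap - len
}

fn main() {
    assert!(!needs_to_grow(4096, 0, 1234)); // plenty of room: fast path
    assert!(needs_to_grow(16, 10, 1234));   // not enough room: cold path
}
```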
2 parents: 902ca44 + 73d7734


1 file changed: library/alloc/src/raw_vec.rs (+17 -1)
```diff
@@ -315,8 +315,24 @@ impl<T, A: Allocator> RawVec<T, A> {
     /// # vector.push_all(&[1, 3, 5, 7, 9]);
     /// # }
     /// ```
+    #[inline]
     pub fn reserve(&mut self, len: usize, additional: usize) {
-        handle_reserve(self.try_reserve(len, additional));
+        // Callers expect this function to be very cheap when there is already sufficient capacity.
+        // Therefore, we move all the resizing and error-handling logic from grow_amortized and
+        // handle_reserve behind a call, while making sure that this function is likely to be
+        // inlined as just a comparison and a call if the comparison fails.
+        #[cold]
+        fn do_reserve_and_handle<T, A: Allocator>(
+            slf: &mut RawVec<T, A>,
+            len: usize,
+            additional: usize,
+        ) {
+            handle_reserve(slf.grow_amortized(len, additional));
+        }
+
+        if self.needs_to_grow(len, additional) {
+            do_reserve_and_handle(self, len, additional);
+        }
     }
 
     /// The same as `reserve`, but returns on errors instead of panicking or aborting.
```
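The shape of the change (an `#[inline]` wrapper that performs only the cheap check, with the slow path outlined into a nested `#[cold]` function) is a pattern that can be reused outside the standard library. Here is a minimal sketch under invented names; `LineBuffer` and its methods are hypothetical and not part of this PR:

```rust
// Illustrative only: the same fast-path/cold-path split applied to a made-up
// buffer type. The wrapper stays small enough to inline into callers, while
// the rarely taken growth logic lives behind a #[cold] call.
pub struct LineBuffer {
    buf: Vec<u8>,
}

impl LineBuffer {
    #[inline]
    pub fn push_byte(&mut self, byte: u8) {
        // Outlined slow path: reallocate, then retry the push.
        #[cold]
        #[inline(never)]
        fn push_slow(this: &mut LineBuffer, byte: u8) {
            this.buf.reserve(1);
            this.buf.push(byte);
        }

        if self.buf.len() < self.buf.capacity() {
            // Common case: spare capacity exists, so this is just a
            // compare, a store, and a length bump.
            self.buf.push(byte);
        } else {
            push_slow(self, byte);
        }
    }
}

fn main() {
    let mut line = LineBuffer { buf: Vec::new() };
    for &b in b"hello" {
        line.push_byte(b);
    }
    assert_eq!(line.buf, b"hello".to_vec());
}
```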
