Skip to content

Commit 785027a

Browse files
committed
optimize the codegen of Vec::clone
These changes optimize `Vec<u8, 1024>::clone` down to these operations:

1. reserve the stack space (1028 bytes on 32-bit ARM) and leave it uninitialized
2. zero the `len` field
3. memcpy `len` bytes of data from the parent

Analyzed source code:

``` rust
use heapless::Vec;

fn clone(vec: &Vec<u8, 1024>) {
    let mut vec = vec.clone();
    black_box(&mut vec);
}

fn black_box<T>(val: &mut T) {
    unsafe { asm!("// {0}", in(reg) val) }
}
```

Machine code with `lto = fat`, `codegen-units = 1` and `opt-level = 'z'` ('z' instead of 3 to avoid loop unrolling and keep the machine code readable):

``` armasm
00020100 <clone>:
   20100:         b5d0        push    {r4, r6, r7, lr}
   20102:         af02        add     r7, sp, #8
   20104:         f5ad 6d81   sub.w   sp, sp, #1032   ; 0x408
   20108:         2300        movs    r3, #0
   2010a:         c802        ldmia   r0!, {r1}
   2010c:         9301        str     r3, [sp, #4]
   2010e:         aa01        add     r2, sp, #4
   20110: /--/-X  b141        cbz     r1, 20124 <clone+0x24>
   20112: |  |    4413        add     r3, r2
   20114: |  |    f810 4b01   ldrb.w  r4, [r0], #1
   20118: |  |    3901        subs    r1, #1
   2011a: |  |    711c        strb    r4, [r3, #4]
   2011c: |  |    9b01        ldr     r3, [sp, #4]
   2011e: |  |    3301        adds    r3, #1
   20120: |  |    9301        str     r3, [sp, #4]
   20122: |  \--  e7f5        b.n     20110 <clone+0x10>
   20124: \---->  a801        add     r0, sp, #4
   20126:         f50d 6d81   add.w   sp, sp, #1032   ; 0x408
   2012a:         bdd0        pop     {r4, r6, r7, pc}
```

Note that the compiler is not optimizing step (3) to an actual `memcpy` (it emits the byte-by-byte copy loop shown above) because we lack the 'trait specialization' code that libstd uses.

---

Before this change, `clone` was optimized to:

1. reserve and zero (`memclr`) 1028 (!?) bytes of stack space
2. (unnecessarily) check at runtime whether `len` is less than or equal to 1024 (the capacity) -- this included a panicking branch
3. memcpy `len` bytes of data from the parent
1 parent aacc359 commit 785027a

File tree

1 file changed

+15
-4
lines changed

1 file changed

+15
-4
lines changed

src/vec.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,18 @@ use hash32;
3434
/// assert_eq!(*vec, [7, 1, 2, 3]);
3535
/// ```
3636
pub struct Vec<T, const N: usize> {
37-
buffer: [MaybeUninit<T>; N],
37+
// NOTE order is important for optimizations. the `len` first layout lets the compiler optimize
38+
// `new` to: reserve stack space and zero the first word. With the fields in the reverse order
39+
// the compiler optimizes `new` to `memclr`-ing the *entire* stack space, including the `buffer`
40+
// field which should be left uninitialized. Optimizations were last checked with Rust 1.60
3841
len: usize,
42+
43+
buffer: [MaybeUninit<T>; N],
3944
}
4045

4146
impl<T, const N: usize> Vec<T, N> {
42-
const INIT: MaybeUninit<T> = MaybeUninit::uninit();
47+
const ELEM: MaybeUninit<T> = MaybeUninit::uninit();
48+
const INIT: [MaybeUninit<T>; N] = [Self::ELEM; N]; // important for optimization of `new`
4349

4450
/// Constructs a new, empty vector with a fixed capacity of `N`
4551
///
@@ -60,8 +66,8 @@ impl<T, const N: usize> Vec<T, N> {
6066
crate::sealed::greater_than_eq_0::<N>();
6167

6268
Self {
63-
buffer: [Self::INIT; N],
6469
len: 0,
70+
buffer: Self::INIT,
6571
}
6672
}
6773

@@ -92,7 +98,12 @@ impl<T, const N: usize> Vec<T, N> {
9298
T: Clone,
9399
{
94100
let mut new = Self::new();
95-
new.extend_from_slice(self.as_slice()).unwrap();
101+
// avoid `extend_from_slice` as that introduces a runtime check / panicking branch
102+
for elem in self {
103+
unsafe {
104+
new.push_unchecked(elem.clone());
105+
}
106+
}
96107
new
97108
}
98109

0 commit comments

Comments
 (0)