std::trie: use unsafe code to give a 3x speed up to the iterator.

huonw · huonw · commit 0148055a5695 · 2014-01-18T10:46:11.000+11:00
This stores the stack of iterators inline (we have a maximum depth with
`uint` keys), and then uses direct pointer offsetting to manipulate it,
in a blazing fast way:

Before:

    bench_iter_large          ... bench:     43187 ns/iter (+/- 3082)
    bench_iter_small          ... bench:       618 ns/iter (+/- 288)

After:

    bench_iter_large          ... bench:     13497 ns/iter (+/- 1575)
    bench_iter_small          ... bench:       220 ns/iter (+/- 91)
diff --git a/src/libstd/trie.rs b/src/libstd/trie.rs
@@ -11,14 +11,17 @@
 //! Ordered containers with integer keys, implemented as radix tries (`TrieSet` and `TrieMap` types)
 
 use prelude::*;
+use mem;
 use uint;
 use util::replace;
+use unstable::intrinsics::init;
 use vec;
 
 // FIXME: #5244: need to manually update the TrieNode constructor
 static SHIFT: uint = 4;
 static SIZE: uint = 1 << SHIFT;
 static MASK: uint = SIZE - 1;
+static NUM_CHUNKS: uint = uint::bits / SHIFT;
 
 enum Child<T> {
     Internal(~TrieNode<T>),
@@ -113,21 +116,25 @@ impl<T> TrieMap<T> {
 
     /// Get an iterator over the key-value pairs in the map
     pub fn iter<'a>(&'a self) -> TrieMapIterator<'a, T> {
-        TrieMapIterator {
-            stack: ~[self.root.children.iter()],
-            remaining_min: self.length,
-            remaining_max: self.length
-        }
+        let mut iter = unsafe {TrieMapIterator::new()};
+        iter.stack[0] = self.root.children.iter();
+        iter.length = 1;
+        iter.remaining_min = self.length;
+        iter.remaining_max = self.length;
+
+        iter
     }
 
     /// Get an iterator over the key-value pairs in the map, with the
     /// ability to mutate the values.
     pub fn mut_iter<'a>(&'a mut self) -> TrieMapMutIterator<'a, T> {
-        TrieMapMutIterator {
-            stack: ~[self.root.children.mut_iter()],
-            remaining_min: self.length,
-            remaining_max: self.length
-        }
+        let mut iter = unsafe {TrieMapMutIterator::new()};
+        iter.stack[0] = self.root.children.mut_iter();
+        iter.length = 1;
+        iter.remaining_min = self.length;
+        iter.remaining_max = self.length;
+
+        iter
     }
 }
 
@@ -176,16 +183,16 @@ macro_rules! bound {
 
             let key = $key;
 
-            let mut idx = 0;
-            let mut it = $iterator_name {
-                stack: ~[],
-                remaining_min: 0,
-                remaining_max: this.length
-            };
+            let mut it = unsafe {$iterator_name::new()};
+            // everything else is zero'd, as we want.
+            it.remaining_max = this.length;
+
             // this addr is necessary for the `Internal` pattern.
             addr!(loop {
                     let children = unsafe {addr!(& $($mut_)* (*node).children)};
-                    let child_id = chunk(key, idx);
+                    // it.length is the current depth in the iterator and the
+                    // current depth through the `uint` key we've traversed.
+                    let child_id = chunk(key, it.length);
                     let (slice_idx, ret) = match children[child_id] {
                         Internal(ref $($mut_)* n) => {
                             node = addr!(& $($mut_)* **n as * $($mut_)* TrieNode<T>);
@@ -202,9 +209,10 @@ macro_rules! bound {
                             (child_id + 1, true)
                         }
                     };
-                    it.stack.push(children.$slice_from(slice_idx).$iter());
+                    // push to the stack.
+                    it.stack[it.length] = children.$slice_from(slice_idx).$iter();
+                    it.length += 1;
                     if ret { return it }
-                    idx += 1;
                 })
         }
     }
@@ -467,15 +475,17 @@ fn remove<T>(count: &mut uint, child: &mut Child<T>, key: uint,
 
 /// Forward iterator over a map
 pub struct TrieMapIterator<'a, T> {
-    priv stack: ~[vec::VecIterator<'a, Child<T>>],
+    priv stack: [vec::VecIterator<'a, Child<T>>, .. NUM_CHUNKS],
+    priv length: uint,
     priv remaining_min: uint,
     priv remaining_max: uint
 }
 
 /// Forward iterator over the key-value pairs of a map, with the
 /// values being mutable.
 pub struct TrieMapMutIterator<'a, T> {
-    priv stack: ~[vec::VecMutIterator<'a, Child<T>>],
+    priv stack: [vec::VecMutIterator<'a, Child<T>>, .. NUM_CHUNKS],
+    priv length: uint,
     priv remaining_min: uint,
     priv remaining_max: uint
 }
@@ -487,27 +497,96 @@ macro_rules! iterator_impl {
     ($name:ident,
      iter = $iter:ident,
      mutability = $($mut_:tt)*) => {
+        impl<'a, T> $name<'a, T> {
+            // Create new zero'd iterator. We have a thin gilding of safety by
+            // using init rather than uninit, so that the worst that can happen
+            // from failing to initialise correctly after calling these is a
+            // segfault.
+            #[cfg(target_word_size="32")]
+            unsafe fn new() -> $name<'a, T> {
+                $name {
+                    remaining_min: 0,
+                    remaining_max: 0,
+                    length: 0,
+                    // ick :( ... at least the compiler will tell us if we screwed up.
+                    stack: [init(), init(), init(), init(), init(), init(), init(), init()]
+                }
+            }
+
+            #[cfg(target_word_size="64")]
+            unsafe fn new() -> $name<'a, T> {
+                $name {
+                    remaining_min: 0,
+                    remaining_max: 0,
+                    length: 0,
+                    stack: [init(), init(), init(), init(), init(), init(), init(), init(),
+                            init(), init(), init(), init(), init(), init(), init(), init()]
+                }
+            }
+        }
+
         item!(impl<'a, T> Iterator<(uint, &'a $($mut_)* T)> for $name<'a, T> {
+                // you might wonder why we're not even trying to act within the
+                // rules, and are just manipulating raw pointers like there's no
+                // such thing as invalid pointers and memory unsafety. The
+                // reason is performance, without doing this we can get the
+                // bench_iter_large microbenchmark down to about 30000 ns/iter
+                // (using .unsafe_ref to index self.stack directly, 38000
+                // ns/iter with [] checked indexing), but this smashes that down
+                // to 13500 ns/iter.
+                //
+                // Fortunately, it's still safe...
+                //
+                // We have an invariant that every Internal node
+                // corresponds to one push to self.stack, and one pop,
+                // nested appropriately. self.stack has enough storage
+                // to store the maximum depth of Internal nodes in the
+                // trie (8 on 32-bit platforms, 16 on 64-bit).
                 fn next(&mut self) -> Option<(uint, &'a $($mut_)* T)> {
-                    while !self.stack.is_empty() {
-                        match self.stack[self.stack.len() - 1].next() {
-                            None => {
-                                self.stack.pop();
-                            }
-                            Some(child) => {
-                                addr!(match *child {
-                                        Internal(ref $($mut_)* node) => {
-                                            self.stack.push(node.children.$iter());
-                                        }
-                                        External(key, ref $($mut_)* value) => {
-                                            self.remaining_max -= 1;
-                                            if self.remaining_min > 0 {
-                                                self.remaining_min -= 1;
+                    let start_ptr = self.stack.as_mut_ptr();
+
+                    unsafe {
+                        // write_ptr is the next place to write to the stack.
+                        // invariant: start_ptr <= write_ptr < end of the
+                        // vector.
+                        let mut write_ptr = start_ptr.offset(self.length as int);
+                        while write_ptr != start_ptr {
+                            // indexing back one is safe, since write_ptr >
+                            // start_ptr now.
+                            match (*write_ptr.offset(-1)).next() {
+                                // exhausted this iterator (i.e. finished this
+                                // Internal node), so pop from the stack.
+                                //
+                                // don't bother clearing the memory, because the
+                                // next time we use it we'll've written to it
+                                // first.
+                                None => write_ptr = write_ptr.offset(-1),
+                                Some(child) => {
+                                    addr!(match *child {
+                                            Internal(ref $($mut_)* node) => {
+                                                // going down a level, so push
+                                                // to the stack (this is the
+                                                // write referenced above)
+                                                *write_ptr = node.children.$iter();
+                                                write_ptr = write_ptr.offset(1);
+                                            }
+                                            External(key, ref $($mut_)* value) => {
+                                                self.remaining_max -= 1;
+                                                if self.remaining_min > 0 {
+                                                    self.remaining_min -= 1;
+                                                }
+                                                // store the new length of the
+                                                // stack, based on our current
+                                                // position.
+                                                self.length = (write_ptr as uint
+                                                               - start_ptr as uint) /
+                                                    mem::size_of_val(&*write_ptr);
+
+                                                return Some((key, value));
                                             }
-                                            return Some((key, value));
-                                        }
-                                        Nothing => {}
-                                    })
+                                            Nothing => {}
+                                        })
+                                }
                             }
                         }
                     }