auto merge of #13539 : Aatch/rust/vector-copy-faster, r=thestinger

bors · bors · commit f39ba69aaa0b · 2014-04-16T03:36:27.000-07:00
LLVM wasn't recognising the loops as memcpy loops and was therefore failing to optimise them properly. While improving LLVM is the "proper" way to fix this, I think that these cases are important enough to warrant a little low-level optimisation.

Fixes #13472

r? @thestinger

---

Benchmark Results:

```
--- Before ---
test clone_owned ... bench: 6126104 ns/iter (+/- 285962) = 170 MB/s
test clone_owned_to_owned ... bench: 6125054 ns/iter (+/- 271197) = 170 MB/s
test clone_str ... bench: 80586 ns/iter (+/- 11489) = 13011 MB/s
test clone_vec ... bench: 3903220 ns/iter (+/- 658556) = 268 MB/s
test test_memcpy ... bench: 69401 ns/iter (+/- 2168) = 15108 MB/s

--- After ---
test clone_owned ... bench: 70839 ns/iter (+/- 4931) = 14801 MB/s
test clone_owned_to_owned ... bench: 70286 ns/iter (+/- 4836) = 14918 MB/s
test clone_str ... bench: 78519 ns/iter (+/- 5511) = 13353 MB/s
test clone_vec ... bench: 71415 ns/iter (+/- 1999) = 14682 MB/s
test test_memcpy ... bench: 70980 ns/iter (+/- 2126) = 14772 MB/s
```
diff --git a/src/libstd/slice.rs b/src/libstd/slice.rs
@@ -760,9 +760,25 @@ impl<'a, T: Clone> CloneableVector<T> for &'a [T] {
     /// Returns a copy of `v`.
     #[inline]
     fn to_owned(&self) -> ~[T] {
-        let mut result = with_capacity(self.len());
-        for e in self.iter() {
-            result.push((*e).clone());
+        let len = self.len();
+        let mut result = with_capacity(len);
+        // Unsafe code so this can be optimised to a memcpy (or something
+        // similarly fast) when T is Copy. LLVM is easily confused, so any
+        // extra operations during the loop can prevent this optimisation
+        unsafe {
+            let mut i = 0;
+            let p = result.as_mut_ptr();
+            // Use try_finally here otherwise the write to length
+            // inside the loop stops LLVM from optimising this.
+            try_finally(
+                &mut i, (),
+                |i, ()| while *i < len {
+                    mem::move_val_init(
+                        &mut(*p.offset(*i as int)),
+                        self.unsafe_ref(*i).clone());
+                    *i += 1;
+                },
+                |i| result.set_len(*i));
         }
         result
     }
@@ -2584,7 +2600,8 @@ pub mod bytes {
 impl<A: Clone> Clone for ~[A] {
     #[inline]
     fn clone(&self) -> ~[A] {
-        self.iter().map(|item| item.clone()).collect()
+        // Use the fast to_owned on &[A] for cloning
+        self.as_slice().to_owned()
     }
 
     fn clone_from(&mut self, source: &~[A]) {
diff --git a/src/libstd/vec.rs b/src/libstd/vec.rs
@@ -311,7 +311,23 @@ impl<T: Clone> Vec<T> {
 
 impl<T:Clone> Clone for Vec<T> {
     fn clone(&self) -> Vec<T> {
-        self.iter().map(|x| x.clone()).collect()
+        let len = self.len;
+        let mut vector = Vec::with_capacity(len);
+        // Unsafe code so this can be optimised to a memcpy (or something
+        // similarly fast) when T is Copy. LLVM is easily confused, so any
+        // extra operations during the loop can prevent this optimisation
+        {
+            let this_slice = self.as_slice();
+            while vector.len < len {
+                unsafe {
+                    mem::move_val_init(
+                        vector.as_mut_slice().unsafe_mut_ref(vector.len),
+                        this_slice.unsafe_ref(vector.len).clone());
+                }
+                vector.len += 1;
+            }
+        }
+        vector
     }
 
     fn clone_from(&mut self, other: &Vec<T>) {