@@ -621,100 +621,41 @@ impl<T> [T] {
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
     pub fn reverse(&mut self) {
-        let mut i: usize = 0;
-        let ln = self.len();
-
-        // For very small types, all the individual reads in the normal
-        // path perform poorly. We can do better, given efficient unaligned
-        // load/store, by loading a larger chunk and reversing a register.
-
-        // Ideally LLVM would do this for us, as it knows better than we do
-        // whether unaligned reads are efficient (since that changes between
-        // different ARM versions, for example) and what the best chunk size
-        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
-        // the loop, so we need to do this ourselves. (Hypothesis: reverse
-        // is troublesome because the sides can be aligned differently --
-        // will be, when the length is odd -- so there's no way of emitting
-        // pre- and postludes to use fully-aligned SIMD in the middle.)
-
-        let fast_unaligned = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
-
-        if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Use the llvm.bswap intrinsic to reverse u8s in a usize
-            let chunk = mem::size_of::<usize>();
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: There are several things to check here:
-                //
-                // - Note that `chunk` is either 4 or 8 due to the cfg check
-                //   above. So `chunk - 1` is positive.
-                // - Indexing with index `i` is fine as the loop check guarantees
-                //   `i + chunk - 1 < ln / 2`
-                //   <=> `i < ln / 2 - (chunk - 1) < ln / 2 < ln`.
-                // - Indexing with index `ln - i - chunk = ln - (i + chunk)` is fine:
-                //   - `i + chunk > 0` is trivially true.
-                //   - The loop check guarantees:
-                //     `i + chunk - 1 < ln / 2`
-                //     <=> `i + chunk ≤ ln / 2 ≤ ln`, thus subtraction does not underflow.
-                // - The `read_unaligned` and `write_unaligned` calls are fine:
-                //   - `pa` points to index `i` where `i < ln / 2 - (chunk - 1)`
-                //     (see above) and `pb` points to index `ln - i - chunk`, so
-                //     both are at least `chunk`
-                //     many bytes away from the end of `self`.
-                //   - Any initialized memory is valid `usize`.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut usize);
-                    let vb = ptr::read_unaligned(pb as *mut usize);
-                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
-                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
-                }
-                i += chunk;
-            }
-        }
+        let half_len = self.len() / 2;
+        let Range { start, end } = self.as_mut_ptr_range();
+
+        // These slices will skip the middle item for an odd length,
+        // since that one doesn't need to move.
+        let (front_half, back_half) =
+            // SAFETY: Both are subparts of the original slice, so the memory
+            // range is valid, and they don't overlap because they're each only
+            // half (or less) of the original slice.
+            unsafe {
+                (
+                    slice::from_raw_parts_mut(start, half_len),
+                    slice::from_raw_parts_mut(end.sub(half_len), half_len),
+                )
+            };

-        if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Use rotate-by-16 to reverse u16s in a u32
-            let chunk = mem::size_of::<u32>() / 2;
-            while i + chunk - 1 < ln / 2 {
-                // SAFETY: An unaligned u32 can be read from `i` if `i + 1 < ln`
-                // (and obviously `i < ln`), because each element is 2 bytes and
-                // we're reading 4.
-                //
-                // `i + chunk - 1 < ln / 2` # while condition
-                // `i + 2 - 1 < ln / 2`
-                // `i + 1 < ln / 2`
-                //
-                // Since it's less than the length divided by 2, then it must be
-                // in bounds.
-                //
-                // This also means that the condition `0 < i + chunk <= ln` is
-                // always respected, ensuring the `pb` pointer can be used
-                // safely.
-                unsafe {
-                    let ptr = self.as_mut_ptr();
-                    let pa = ptr.add(i);
-                    let pb = ptr.add(ln - i - chunk);
-                    let va = ptr::read_unaligned(pa as *mut u32);
-                    let vb = ptr::read_unaligned(pb as *mut u32);
-                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
-                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
-                }
-                i += chunk;
-            }
-        }
+        // Introducing a function boundary here means that the two halves
+        // get `noalias` markers, allowing better optimization as LLVM
+        // knows that they're disjoint, unlike in the original slice.
+        revswap(front_half, back_half, half_len);

-        while i < ln / 2 {
-            // SAFETY: `i` is inferior to half the length of the slice so
-            // accessing `i` and `ln - i - 1` is safe (`i` starts at 0 and
-            // will not go further than `ln / 2 - 1`).
-            // The resulting pointers `pa` and `pb` are therefore valid and
-            // aligned, and can be read from and written to.
-            unsafe {
-                self.swap_unchecked(i, ln - i - 1);
+        #[inline]
+        fn revswap<T>(a: &mut [T], b: &mut [T], n: usize) {
+            debug_assert_eq!(a.len(), n);
+            debug_assert_eq!(b.len(), n);
+
+            // Because this function is first compiled in isolation,
+            // this check tells LLVM that the indexing below is
+            // in-bounds. Then after inlining -- once the actual
+            // lengths of the slices are known -- it's removed.
+            let (a, b) = (&mut a[..n], &mut b[..n]);
+
+            for i in 0..n {
+                mem::swap(&mut a[i], &mut b[n - 1 - i]);
             }
-            i += 1;
         }
     }

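For readers skimming the removed half of the diff: the old fast paths loaded a whole `usize` (for `u8` elements) or a `u32` (for `u16` elements) from each end of the slice and reversed it in a register with `swap_bytes` or `rotate_left(16)`. The snippet below is a minimal, safe sketch of that chunked idea for byte slices only; `reverse_bytes_chunked` is a name invented here for illustration, not the removed code itself, which used unaligned raw-pointer reads and writes instead of bounds-checked slicing.

```rust
fn reverse_bytes_chunked(s: &mut [u8]) {
    const CHUNK: usize = std::mem::size_of::<usize>();
    let len = s.len();
    let mut i = 0;
    // Swap whole chunks from the two ends while a full chunk fits in each half.
    while i + CHUNK - 1 < len / 2 {
        let mut a = [0u8; CHUNK];
        let mut b = [0u8; CHUNK];
        a.copy_from_slice(&s[i..i + CHUNK]);
        b.copy_from_slice(&s[len - i - CHUNK..len - i]);
        // `swap_bytes` reverses the bytes inside the register; the removed
        // code reached the same `llvm.bswap` intrinsic through it.
        s[i..i + CHUNK].copy_from_slice(&usize::from_ne_bytes(b).swap_bytes().to_ne_bytes());
        s[len - i - CHUNK..len - i].copy_from_slice(&usize::from_ne_bytes(a).swap_bytes().to_ne_bytes());
        i += CHUNK;
    }
    // Whatever is left around the middle is reversed element by element.
    while i < len / 2 {
        s.swap(i, len - i - 1);
    }
}

fn main() {
    let mut v: Vec<u8> = (0u8..20).collect();
    reverse_bytes_chunked(&mut v);
    assert!(v.iter().copied().eq((0u8..20).rev()));
}
```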
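The shape of the new implementation can also be written in entirely safe code. The sketch below, with a hypothetical `reverse_by_halves` helper rather than the actual library method, uses `split_at_mut` to obtain the same two disjoint mutable halves; because they end up as two separate `&mut` slices, the optimizer is allowed to treat them as non-overlapping, which is the `noalias` effect the patch gets by routing the raw-pointer halves through `revswap`.

```rust
fn reverse_by_halves<T>(s: &mut [T]) {
    let half_len = s.len() / 2;
    let (front, rest) = s.split_at_mut(half_len);
    // For an odd length, `rest` still starts with the middle element;
    // take only its final `half_len` elements as the back half.
    let back_start = rest.len() - half_len;
    let back = &mut rest[back_start..];
    // `front` and `back` are distinct `&mut [T]`, so the compiler may assume
    // they don't alias, much like the two arguments of `revswap` in the diff.
    for i in 0..half_len {
        std::mem::swap(&mut front[i], &mut back[half_len - 1 - i]);
    }
}

fn main() {
    let mut v = vec![1, 2, 3, 4, 5];
    reverse_by_halves(&mut v);
    assert_eq!(v, [5, 4, 3, 2, 1]);

    // The standard method does the same job; the sketch only mirrors its shape.
    let mut w = vec!['a', 'b', 'c', 'd'];
    w.reverse();
    assert_eq!(w, ['d', 'c', 'b', 'a']);
}
```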