@@ -383,15 +383,23 @@ impl str {
383
383
without modifying the original"]
384
384
#[ stable( feature = "unicode_case_mapping" , since = "1.2.0" ) ]
385
385
pub fn to_lowercase ( & self ) -> String {
386
- let mut s = String :: with_capacity ( self . len ( ) ) ;
387
- for ( i, c) in self [ ..] . char_indices ( ) {
386
+ let out = convert_while_ascii ( self . as_bytes ( ) , u8:: to_ascii_lowercase) ;
387
+
388
+ // Safety: we know this is a valid char boundary since
389
+ // out.len() is only progressed if ascii bytes are found
390
+ let rest = unsafe { self . get_unchecked ( out. len ( ) ..) } ;
391
+
392
+ // Safety: We have written only valid ASCII to our vec
393
+ let mut s = unsafe { String :: from_utf8_unchecked ( out) } ;
394
+
395
+ for ( i, c) in rest[ ..] . char_indices ( ) {
388
396
if c == 'Σ' {
389
397
// Σ maps to σ, except at the end of a word where it maps to ς.
390
398
// This is the only conditional (contextual) but language-independent mapping
391
399
// in `SpecialCasing.txt`,
392
400
// so hard-code it rather than have a generic "condition" mechanism.
393
401
// See https://github.com/rust-lang/rust/issues/26035
394
- map_uppercase_sigma ( self , i, & mut s)
402
+ map_uppercase_sigma ( rest , i, & mut s)
395
403
} else {
396
404
match conversions:: to_lower ( c) {
397
405
[ a, '\0' , _] => s. push ( a) ,
@@ -466,8 +474,16 @@ impl str {
466
474
without modifying the original"]
467
475
#[ stable( feature = "unicode_case_mapping" , since = "1.2.0" ) ]
468
476
pub fn to_uppercase ( & self ) -> String {
469
- let mut s = String :: with_capacity ( self . len ( ) ) ;
470
- for c in self [ ..] . chars ( ) {
477
+ let out = convert_while_ascii ( self . as_bytes ( ) , u8:: to_ascii_uppercase) ;
478
+
479
+ // Safety: we know this is a valid char boundary since
480
+ // out.len() is only progressed if ascii bytes are found
481
+ let rest = unsafe { self . get_unchecked ( out. len ( ) ..) } ;
482
+
483
+ // Safety: We have written only valid ASCII to our vec
484
+ let mut s = unsafe { String :: from_utf8_unchecked ( out) } ;
485
+
486
+ for c in rest. chars ( ) {
471
487
match conversions:: to_upper ( c) {
472
488
[ a, '\0' , _] => s. push ( a) ,
473
489
[ a, b, '\0' ] => {
@@ -619,3 +635,51 @@ impl str {
619
635
pub unsafe fn from_boxed_utf8_unchecked ( v : Box < [ u8 ] > ) -> Box < str > { // Reinterprets the boxed byte slice `v` as a boxed `str`; the bytes are neither copied nor validated here.
620
636
unsafe { Box :: from_raw ( Box :: into_raw ( v) as * mut str ) } // `into_raw` gives up ownership as a raw pointer, the cast relabels it as `*mut str`, and `from_raw` takes ownership back. NOTE(review): presumably the caller must guarantee `v` is valid UTF-8 -- confirm against the item's docs.
621
637
}
638
+
639
+ /// Applies `convert` to the leading bytes of `b` for as long as whole chunks of `b` are pure ASCII.
640
+ /// For better average performance, this happens in chunks of `2*size_of::<usize>()` bytes; conversion stops at the first chunk containing a non-ASCII byte, and a trailing partial chunk is never converted.
641
+ /// Returns a vec holding only the converted prefix; its length (a multiple of the chunk size) is the offset in `b` where conversion stopped.
642
+ #[ inline]
643
+ #[ cfg( not( test) ) ]
644
+ #[ cfg( not( no_global_oom_handling) ) ]
645
+ fn convert_while_ascii ( b : & [ u8 ] , convert : fn ( & u8 ) -> u8 ) -> Vec < u8 > {
646
+ let mut out = Vec :: with_capacity ( b. len ( ) ) ;
647
+
648
+ const USIZE_SIZE : usize = mem:: size_of :: < usize > ( ) ;
649
+ const MAGIC_UNROLL : usize = 2 ;
650
+ const N : usize = USIZE_SIZE * MAGIC_UNROLL ;
651
+ const NONASCII_MASK : usize = usize:: from_ne_bytes ( [ 0x80 ; USIZE_SIZE ] ) ;
652
+
653
+ let mut i = 0 ;
654
+ unsafe {
655
+ while i + N <= b. len ( ) {
656
+ // Safety: the loop condition guarantees `i + N <= b.len()`, and `out` was allocated with capacity for `b.len()` bytes while its length is still 0, so both ranges below are in bounds
657
+ let in_chunk = b. get_unchecked ( i..i + N ) ;
658
+ let out_chunk = out. spare_capacity_mut ( ) . get_unchecked_mut ( i..i + N ) ;
659
+
660
+ let mut bits = 0 ;
661
+ for j in 0 ..MAGIC_UNROLL {
662
+ // read the bytes 1 usize at a time (unaligned since we haven't checked the alignment) and OR them together
663
+ // Safety: `in_chunk` is exactly N = MAGIC_UNROLL * USIZE_SIZE bytes long, so each of the MAGIC_UNROLL usize-sized reads stays inside it
664
+ bits |= in_chunk. as_ptr ( ) . cast :: < usize > ( ) . add ( j) . read_unaligned ( ) ;
665
+ }
666
+ // if any byte of the chunk has its high bit (0x80) set the chunk isn't pure ascii: stop, so that only the prior chunks are returned as initialised output
667
+ if bits & NONASCII_MASK != 0 {
668
+ break ;
669
+ }
670
+
671
+ // perform the case conversions on N bytes (gets heavily autovec'd)
672
+ for j in 0 ..N {
673
+ // Safety: `j < N`, and `in_chunk` and `out_chunk` are both exactly N elements long
674
+ let out = out_chunk. get_unchecked_mut ( j) ;
675
+ out. write ( convert ( in_chunk. get_unchecked ( j) ) ) ;
676
+ }
677
+
678
+ // advance past the chunk just written; these bytes only count as initialised once `set_len(i)` runs after the loop
679
+ i += N ;
680
+ }
681
+ out. set_len ( i) ;
682
+ }
683
+
684
+ out
685
+ }
0 commit comments