Skip to content

Commit 1851f08

Browse files
committed
Auto merge of #97046 - conradludgate:faster-ascii-case-conv-path, r=thomcc
improve case conversion happy path. Someone shared the source code for [Go's string case conversion](https://github.com/golang/go/blob/19156a54741d4f353c9e8e0860197ca95a6ee6ca/src/strings/strings.go#L558-L616). It features a hot path for ASCII-only strings (although I assume that, for reasons specific to Go, they've opted for a read-safe hot loop). I've borrowed these ideas and also kept our existing code to provide a fast path plus a seamless, UTF-8-correct fallback path. (Naive) benchmarks can be found [here](https://github.com/conradludgate/case-conv). For the cases where non-ASCII is found near the start, the performance of this algorithm falls back to the original speed, with no measurable speed loss.
2 parents 1ab9893 + d0f9930 commit 1851f08

File tree

2 files changed

+83
-5
lines changed

2 files changed

+83
-5
lines changed

library/alloc/src/str.rs

+69-5
Original file line numberDiff line numberDiff line change
@@ -383,15 +383,23 @@ impl str {
383383
without modifying the original"]
384384
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
385385
pub fn to_lowercase(&self) -> String {
386-
let mut s = String::with_capacity(self.len());
387-
for (i, c) in self[..].char_indices() {
386+
let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_lowercase);
387+
388+
// Safety: we know this is a valid char boundary since
389+
// out.len() is only progressed if ascii bytes are found
390+
let rest = unsafe { self.get_unchecked(out.len()..) };
391+
392+
// Safety: We have written only valid ASCII to our vec
393+
let mut s = unsafe { String::from_utf8_unchecked(out) };
394+
395+
for (i, c) in rest[..].char_indices() {
388396
if c == 'Σ' {
389397
// Σ maps to σ, except at the end of a word where it maps to ς.
390398
// This is the only conditional (contextual) but language-independent mapping
391399
// in `SpecialCasing.txt`,
392400
// so hard-code it rather than have a generic "condition" mechanism.
393401
// See https://github.com/rust-lang/rust/issues/26035
394-
map_uppercase_sigma(self, i, &mut s)
402+
map_uppercase_sigma(rest, i, &mut s)
395403
} else {
396404
match conversions::to_lower(c) {
397405
[a, '\0', _] => s.push(a),
@@ -466,8 +474,16 @@ impl str {
466474
without modifying the original"]
467475
#[stable(feature = "unicode_case_mapping", since = "1.2.0")]
468476
pub fn to_uppercase(&self) -> String {
469-
let mut s = String::with_capacity(self.len());
470-
for c in self[..].chars() {
477+
let out = convert_while_ascii(self.as_bytes(), u8::to_ascii_uppercase);
478+
479+
// Safety: we know this is a valid char boundary since
480+
// out.len() is only progressed if ascii bytes are found
481+
let rest = unsafe { self.get_unchecked(out.len()..) };
482+
483+
// Safety: We have written only valid ASCII to our vec
484+
let mut s = unsafe { String::from_utf8_unchecked(out) };
485+
486+
for c in rest.chars() {
471487
match conversions::to_upper(c) {
472488
[a, '\0', _] => s.push(a),
473489
[a, b, '\0'] => {
@@ -619,3 +635,51 @@ impl str {
619635
pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box<str> {
620636
unsafe { Box::from_raw(Box::into_raw(v) as *mut str) }
621637
}
638+
639+
/// Converts a leading run of ASCII bytes of `b` with `convert`, stopping at the first chunk that contains a non-ASCII byte.
640+
/// For better average performance, this happens in chunks of `2*size_of::<usize>()` bytes; a trailing partial chunk is left unconverted.
641+
/// Returns a vec with the converted bytes; its length is the number of input bytes that were converted.
642+
#[inline]
643+
#[cfg(not(test))]
644+
#[cfg(not(no_global_oom_handling))]
645+
fn convert_while_ascii(b: &[u8], convert: fn(&u8) -> u8) -> Vec<u8> {
646+
let mut out = Vec::with_capacity(b.len());
647+
648+
const USIZE_SIZE: usize = mem::size_of::<usize>();
649+
const MAGIC_UNROLL: usize = 2;
650+
const N: usize = USIZE_SIZE * MAGIC_UNROLL;
651+
// 0x80 in every byte of a usize: selects the high bit of each byte, which is set only for non-ascii bytes
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);
652+
653+
let mut i = 0;
654+
unsafe {
655+
while i + N <= b.len() {
656+
// Safety: the loop condition guarantees `i + N <= b.len()`, and `out` was allocated with capacity for `b.len()` bytes and still has length 0, so both ranges are in bounds
657+
let in_chunk = b.get_unchecked(i..i + N);
658+
let out_chunk = out.spare_capacity_mut().get_unchecked_mut(i..i + N);
659+
660+
let mut bits = 0;
661+
for j in 0..MAGIC_UNROLL {
662+
// read the bytes 1 usize at a time (unaligned since we haven't checked the alignment)
663+
// safety: in_chunk holds N = MAGIC_UNROLL * USIZE_SIZE bytes, so the usize-sized read at offset j stays inside it
664+
bits |= in_chunk.as_ptr().cast::<usize>().add(j).read_unaligned();
665+
}
666+
// if any byte of this chunk has its high bit set (non-ascii), stop and return only the prior bytes as init
667+
if bits & NONASCII_MASK != 0 {
668+
break;
669+
}
670+
671+
// perform the case conversions on N bytes (gets heavily autovec'd)
672+
for j in 0..N {
673+
// safety: in_chunk and out_chunk both have length N, and j < N
674+
let out = out_chunk.get_unchecked_mut(j);
675+
out.write(convert(in_chunk.get_unchecked(j)));
676+
}
677+
678+
// count these N bytes as initialised; the vec length itself is only committed by `set_len` after the loop
679+
i += N;
680+
}
681+
// Safety: the first `i` bytes of `out` were written by the loop above, and `i <= b.len()` which is within the requested capacity
out.set_len(i);
682+
}
683+
684+
out
685+
}

library/alloc/tests/str.rs

+14
Original file line numberDiff line numberDiff line change
@@ -1772,6 +1772,20 @@ fn to_lowercase() {
17721772
assert_eq!("ΑΣΑ".to_lowercase(), "ασα");
17731773
assert_eq!("ΑΣ'Α".to_lowercase(), "ασ'α");
17741774
assert_eq!("ΑΣ''Α".to_lowercase(), "ασ''α");
1775+
1776+
// a really long string that has its lowercase form
1777+
// even longer. this tests that implementations don't assume
1778+
// an incorrect upper bound on allocations
1779+
let upper = str::repeat("İ", 512);
1780+
let lower = str::repeat("i̇", 512);
1781+
assert_eq!(upper.to_lowercase(), lower);
1782+
1783+
// a really long ascii-only string.
1784+
// This tests that the ascii hot-path
1785+
// functions correctly
1786+
let upper = str::repeat("A", 511);
1787+
let lower = str::repeat("a", 511);
1788+
assert_eq!(upper.to_lowercase(), lower);
17751789
}
17761790

17771791
#[test]

0 commit comments

Comments
 (0)