rust-lang · bors · Apr 9, 2025 · May 6, 2024 · Apr 8, 2025 · tgross35
diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs
@@ -104,6 +104,7 @@
 #![feature(async_iterator)]
 #![feature(bstr)]
 #![feature(bstr_internals)]
+#![feature(char_internals)]
 #![feature(char_max_len)]
 #![feature(clone_to_uninit)]
 #![feature(coerce_unsized)]

diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -1401,11 +1401,14 @@ impl String {
     #[inline]
     #[stable(feature = "rust1", since = "1.0.0")]
     pub fn push(&mut self, ch: char) {
-        match ch.len_utf8() {
-            1 => self.vec.push(ch as u8),
-            _ => {
-                self.vec.extend_from_slice(ch.encode_utf8(&mut [0; char::MAX_LEN_UTF8]).as_bytes())
-            }
+        let len = self.len();
+        let ch_len = ch.len_utf8();
+        self.reserve(ch_len);
+
+        // SAFETY: Just reserved capacity for at least the length needed to encode `ch`.
+        unsafe {
+            core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(self.len()));
+            self.vec.set_len(len + ch_len);
         }
     }
 
@@ -1702,24 +1705,31 @@ impl String {
     #[rustc_confusables("set")]
     pub fn insert(&mut self, idx: usize, ch: char) {
         assert!(self.is_char_boundary(idx));
-        let mut bits = [0; char::MAX_LEN_UTF8];
-        let bits = ch.encode_utf8(&mut bits).as_bytes();
 
+        let len = self.len();
+        let ch_len = ch.len_utf8();
+        self.reserve(ch_len);
+
+        // SAFETY: Move the bytes starting from `idx` to their new location `ch_len`
+        // bytes ahead. This is safe because sufficient capacity was reserved, and `idx`
+        // is a char boundary.
         unsafe {
-            self.insert_bytes(idx, bits);
+            ptr::copy(
+                self.vec.as_ptr().add(idx),
+                self.vec.as_mut_ptr().add(idx + ch_len),
+                len - idx,
+            );
         }
-    }
 
-    #[cfg(not(no_global_oom_handling))]
-    unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
-        let len = self.len();
-        let amt = bytes.len();
-        self.vec.reserve(amt);
+        // SAFETY: Encode the character into the vacated region if `idx != len`,
+        // or into the uninitialized spare capacity otherwise.
+        unsafe {
+            core::char::encode_utf8_raw_unchecked(ch as u32, self.vec.as_mut_ptr().add(idx));
+        }
 
+        // SAFETY: Update the length to include the newly added bytes.
         unsafe {
-            ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
-            ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
-            self.vec.set_len(len + amt);
+            self.vec.set_len(len + ch_len);
         }
     }
 
@@ -1749,8 +1759,27 @@ impl String {
     pub fn insert_str(&mut self, idx: usize, string: &str) {
         assert!(self.is_char_boundary(idx));
 
+        let len = self.len();
+        let amt = string.len();
+        self.reserve(amt);
+
+        // SAFETY: Move the bytes starting from `idx` to their new location `amt` bytes
+        // ahead. This is safe because sufficient capacity was just reserved, and `idx`
+        // is a char boundary.
+        unsafe {
+            ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx);
+        }
+
+        // SAFETY: Copy the new string slice into the vacated region if `idx != len`,
+        // or into the uninitialized spare capacity otherwise. The borrow checker
+        // ensures that the source and destination do not overlap.
+        unsafe {
+            ptr::copy_nonoverlapping(string.as_ptr(), self.vec.as_mut_ptr().add(idx), amt);
+        }
+
+        // SAFETY: Update the length to include the newly added bytes.
         unsafe {
-            self.insert_bytes(idx, string.as_bytes());
+            self.vec.set_len(len + amt);
         }
     }
 

diff --git a/library/alloctests/benches/string.rs b/library/alloctests/benches/string.rs
@@ -4,15 +4,16 @@ use test::{Bencher, black_box};
 
 #[bench]
 fn bench_with_capacity(b: &mut Bencher) {
-    b.iter(|| String::with_capacity(100));
+    b.iter(|| String::with_capacity(black_box(100)));
 }
 
 #[bench]
 fn bench_push_str(b: &mut Bencher) {
     let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
     b.iter(|| {
         let mut r = String::new();
-        r.push_str(s);
+        black_box(&mut r).push_str(black_box(s));
+        r
     });
 }
 
@@ -24,8 +25,9 @@ fn bench_push_str_one_byte(b: &mut Bencher) {
     b.iter(|| {
         let mut r = String::new();
         for _ in 0..REPETITIONS {
-            r.push_str("a")
+            black_box(&mut r).push_str(black_box("a"));
         }
+        r
     });
 }
 
@@ -35,8 +37,9 @@ fn bench_push_char_one_byte(b: &mut Bencher) {
     b.iter(|| {
         let mut r = String::new();
         for _ in 0..REPETITIONS {
-            r.push('a')
+            black_box(&mut r).push(black_box('a'));
         }
+        r
     });
 }
 
@@ -46,8 +49,9 @@ fn bench_push_char_two_bytes(b: &mut Bencher) {
     b.iter(|| {
         let mut r = String::new();
         for _ in 0..REPETITIONS {
-            r.push('â')
+            black_box(&mut r).push(black_box('â'));
         }
+        r
     });
 }
 
@@ -57,34 +61,26 @@ fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
               Lorem ipsum dolor sit amet, consectetur. ";
 
     assert_eq!(100, s.len());
-    b.iter(|| {
-        let _ = String::from_utf8_lossy(s);
-    });
+    b.iter(|| String::from_utf8_lossy(black_box(s)));
 }
 
 #[bench]
 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
     let s = "𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰".as_bytes();
     assert_eq!(100, s.len());
-    b.iter(|| {
-        let _ = String::from_utf8_lossy(s);
-    });
+    b.iter(|| String::from_utf8_lossy(black_box(s)));
 }
 
 #[bench]
 fn from_utf8_lossy_invalid(b: &mut Bencher) {
     let s = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
-    b.iter(|| {
-        let _ = String::from_utf8_lossy(s);
-    });
+    b.iter(|| String::from_utf8_lossy(black_box(s)));
 }
 
 #[bench]
 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
     let s = repeat(0xf5).take(100).collect::<Vec<_>>();
-    b.iter(|| {
-        let _ = String::from_utf8_lossy(&s);
-    });
+    b.iter(|| String::from_utf8_lossy(black_box(&s)));
 }
 
 #[bench]
@@ -96,8 +92,8 @@ fn bench_exact_size_shrink_to_fit(b: &mut Bencher) {
     r.push_str(s);
     assert_eq!(r.len(), r.capacity());
     b.iter(|| {
-        let mut r = String::with_capacity(s.len());
-        r.push_str(s);
+        let mut r = String::with_capacity(black_box(s.len()));
+        r.push_str(black_box(s));
         r.shrink_to_fit();
         r
     });
@@ -107,29 +103,29 @@ fn bench_exact_size_shrink_to_fit(b: &mut Bencher) {
 fn bench_from_str(b: &mut Bencher) {
     let s = "Hello there, the quick brown fox jumped over the lazy dog! \
              Lorem ipsum dolor sit amet, consectetur. ";
-    b.iter(|| String::from(s))
+    b.iter(|| String::from(black_box(s)))
 }
 
 #[bench]
 fn bench_from(b: &mut Bencher) {
     let s = "Hello there, the quick brown fox jumped over the lazy dog! \
              Lorem ipsum dolor sit amet, consectetur. ";
-    b.iter(|| String::from(s))
+    b.iter(|| String::from(black_box(s)))
 }
 
 #[bench]
 fn bench_to_string(b: &mut Bencher) {
     let s = "Hello there, the quick brown fox jumped over the lazy dog! \
              Lorem ipsum dolor sit amet, consectetur. ";
-    b.iter(|| s.to_string())
+    b.iter(|| black_box(s).to_string())
 }
 
 #[bench]
 fn bench_insert_char_short(b: &mut Bencher) {
     let s = "Hello, World!";
     b.iter(|| {
         let mut x = String::from(s);
-        black_box(&mut x).insert(6, black_box(' '));
+        black_box(&mut x).insert(black_box(6), black_box(' '));
         x
     })
 }
@@ -139,7 +135,7 @@ fn bench_insert_char_long(b: &mut Bencher) {
     let s = "Hello, World!";
     b.iter(|| {
         let mut x = String::from(s);
-        black_box(&mut x).insert(6, black_box('❤'));
+        black_box(&mut x).insert(black_box(6), black_box('❤'));
         x
     })
 }
@@ -149,7 +145,7 @@ fn bench_insert_str_short(b: &mut Bencher) {
     let s = "Hello, World!";
     b.iter(|| {
         let mut x = String::from(s);
-        black_box(&mut x).insert_str(6, black_box(" "));
+        black_box(&mut x).insert_str(black_box(6), black_box(" "));
         x
     })
 }
@@ -159,7 +155,7 @@ fn bench_insert_str_long(b: &mut Bencher) {
     let s = "Hello, World!";
     b.iter(|| {
         let mut x = String::from(s);
-        black_box(&mut x).insert_str(6, black_box(" rustic "));
+        black_box(&mut x).insert_str(black_box(6), black_box(" rustic "));
         x
     })
 }
diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs
@@ -1806,39 +1806,71 @@ const fn len_utf16(code: u32) -> usize {
 #[inline]
 pub const fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] {
     let len = len_utf8(code);
-    match (len, &mut *dst) {
-        (1, [a, ..]) => {
-            *a = code as u8;
-        }
-        (2, [a, b, ..]) => {
-            *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
-            *b = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        (3, [a, b, c, ..]) => {
-            *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
-            *b = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-            *c = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        (4, [a, b, c, d, ..]) => {
-            *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
-            *b = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-            *c = (code >> 6 & 0x3F) as u8 | TAG_CONT;
-            *d = (code & 0x3F) as u8 | TAG_CONT;
-        }
-        _ => {
-            const_panic!(
-                "encode_utf8: buffer does not have enough bytes to encode code point",
-                "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}",
-                code: u32 = code,
-                len: usize = len,
-                dst_len: usize = dst.len(),
-            )
-        }
-    };
+    if dst.len() < len {
+        const_panic!(
+            "encode_utf8: buffer does not have enough bytes to encode code point",
+            "encode_utf8: need {len} bytes to encode U+{code:04X} but buffer has just {dst_len}",
+            code: u32 = code,
+            len: usize = len,
+            dst_len: usize = dst.len(),
+        );
+    }
+
+    // SAFETY: `dst` is checked to be at least the length needed to encode the codepoint.
+    unsafe { encode_utf8_raw_unchecked(code, dst.as_mut_ptr()) };
+
     // SAFETY: `<&mut [u8]>::as_mut_ptr` is guaranteed to return a valid pointer and `len` has been tested to be within bounds.
     unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
 }
 
+/// Encodes a raw `u32` value as UTF-8 into the byte buffer pointed to by `dst`.
+///
+/// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
+/// (Creating a `char` in the surrogate range is UB.)
+/// The result is valid [generalized UTF-8] but not valid UTF-8.
+///
+/// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
+///
+/// # Safety
+///
+/// The behavior is undefined if the buffer pointed to by `dst` is not
+/// large enough to hold the encoded codepoint. A buffer of length four
+/// is large enough to encode any `char`.
+///
+/// For a safe version of this function, see the [`encode_utf8_raw`] function.
+#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
+#[doc(hidden)]
+#[inline]
+pub const unsafe fn encode_utf8_raw_unchecked(code: u32, dst: *mut u8) {
+    let len = len_utf8(code);
+    // SAFETY: The caller must guarantee that the buffer pointed to by `dst`
+    // is at least `len` bytes long.
+    unsafe {
+        match len {
+            1 => {
+                *dst = code as u8;
+            }
+            2 => {
+                *dst = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
+                *dst.add(1) = (code & 0x3F) as u8 | TAG_CONT;
+            }
+            3 => {
+                *dst = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
+                *dst.add(1) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+                *dst.add(2) = (code & 0x3F) as u8 | TAG_CONT;
+            }
+            4 => {
+                *dst = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+                *dst.add(1) = (code >> 12 & 0x3F) as u8 | TAG_CONT;
+                *dst.add(2) = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+                *dst.add(3) = (code & 0x3F) as u8 | TAG_CONT;
+            }
+            // SAFETY: `char` always takes between 1 and 4 bytes to encode in UTF-8.
+            _ => crate::hint::unreachable_unchecked(),
+        }
+    }
+}
+
 /// Encodes a raw `u32` value as native endian UTF-16 into the provided `u16` buffer,
 /// and then returns the subslice of the buffer that contains the encoded character.
 ///

diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs
@@ -38,7 +38,7 @@ pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
 pub use self::methods::encode_utf16_raw; // perma-unstable
 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
-pub use self::methods::encode_utf8_raw; // perma-unstable
+pub use self::methods::{encode_utf8_raw, encode_utf8_raw_unchecked}; // perma-unstable
 
 #[rustfmt::skip]
 use crate::ascii;

diff --git a/tests/codegen/string-push.rs b/tests/codegen/string-push.rs
@@ -0,0 +1,11 @@
+//! Check that `String::push` is optimized enough not to call `memcpy`.
+
+//@ compile-flags: -O
+#![crate_type = "lib"]
+
+// CHECK-LABEL: @string_push_does_not_call_memcpy
+#[no_mangle]
+pub fn string_push_does_not_call_memcpy(s: &mut String, ch: char) {
+    // CHECK-NOT: call void @llvm.memcpy
+    s.push(ch);
+}