Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stabilize Utf8Chunks #123909

Merged
merged 1 commit into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion library/alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@
#![feature(tuple_trait)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(utf8_chunks)]
#![feature(vec_pop_if)]
// tidy-alphabetical-end
//
Expand Down
2 changes: 1 addition & 1 deletion library/alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub use core::str::{RSplit, Split};
pub use core::str::{RSplitN, SplitN};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{RSplitTerminator, SplitTerminator};
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use core::str::{Utf8Chunk, Utf8Chunks};

/// Note: `str` in `Concat<str>` is not meaningful here.
Expand Down
4 changes: 1 addition & 3 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ use core::ops::{self, Range, RangeBounds};
use core::ptr;
use core::slice;
use core::str::pattern::Pattern;
#[cfg(not(no_global_oom_handling))]
use core::str::Utf8Chunks;

#[cfg(not(no_global_oom_handling))]
use crate::borrow::{Cow, ToOwned};
Expand Down Expand Up @@ -633,7 +631,7 @@ impl String {
#[cfg(not(no_global_oom_handling))]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
let mut iter = Utf8Chunks::new(v);
let mut iter = v.utf8_chunks();

let first_valid = if let Some(chunk) = iter.next() {
let valid = chunk.valid();
Expand Down
74 changes: 50 additions & 24 deletions library/core/src/str/lossy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,46 @@ use crate::iter::FusedIterator;
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;

impl [u8] {
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
/// slice, and the non-UTF-8 fragments in between.
///
/// The returned [`Utf8Chunks`] borrows the slice and yields [`Utf8Chunk`]
/// items. Each item exposes a run of valid UTF-8 as a `&str` through
/// [`Utf8Chunk::valid`], and the invalid bytes that follow that run as a
/// `&[u8]` through [`Utf8Chunk::invalid`].
///
/// # Examples
///
/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
/// code in the form of a C-string literal (`c"..."`).
///
/// ```
/// use std::fmt::Write as _;
///
/// pub fn cstr_literal(bytes: &[u8]) -> String {
///     let mut repr = String::new();
///     repr.push_str("c\"");
///     for chunk in bytes.utf8_chunks() {
///         for ch in chunk.valid().chars() {
///             // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
///             write!(repr, "{}", ch.escape_debug()).unwrap();
///         }
///         for byte in chunk.invalid() {
///             write!(repr, "\\x{:02X}", byte).unwrap();
///         }
///     }
///     repr.push('"');
///     repr
/// }
///
/// fn main() {
///     let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
///     let expected = stringify!(c"\xFErris the 🦀\u{7}");
///     assert_eq!(lit, expected);
/// }
/// ```
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
// Only the borrowed bytes are stored here; this constructor does no decoding itself.
Utf8Chunks { source: self }
}
}

/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
Expand All @@ -14,23 +54,19 @@ use super::validations::utf8_char_width;
/// # Examples
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
/// let chunk = bytes.utf8_chunks().next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
///
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
valid: &'a str,
Expand All @@ -43,7 +79,7 @@ impl<'a> Utf8Chunk<'a> {
/// This substring can be empty at the start of the string or between
/// broken UTF-8 characters.
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn valid(&self) -> &'a str {
self.valid
}
Expand All @@ -63,7 +99,7 @@ impl<'a> Utf8Chunk<'a> {
/// [`valid`]: Self::valid
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn invalid(&self) -> &'a [u8] {
self.invalid
}
Expand All @@ -78,7 +114,7 @@ impl fmt::Debug for Debug<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_char('"')?;

for chunk in Utf8Chunks::new(self.0) {
for chunk in self.0.utf8_chunks() {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
{
Expand Down Expand Up @@ -123,12 +159,8 @@ impl fmt::Debug for Debug<'_> {
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
/// for chunk in Utf8Chunks::new(input) {
/// for chunk in input.utf8_chunks() {
/// push(chunk.valid());
///
/// if !chunk.invalid().is_empty() {
Expand All @@ -140,27 +172,21 @@ impl fmt::Debug for Debug<'_> {
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
source: &'a [u8],
}

impl<'a> Utf8Chunks<'a> {
/// Creates a new iterator to decode the bytes.
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn new(bytes: &'a [u8]) -> Self {
Self { source: bytes }
}

// Internal helper: wraps the iterator's current `source` bytes in the
// `Debug` adapter from this module, so they can be printed with `{:?}`.
// The accompanying `debug` test shows the output as a double-quoted
// string with invalid bytes written as `\xNN` escapes. It is hidden from
// the rendered docs and gated behind the `str_internals` feature.
#[doc(hidden)]
#[unstable(feature = "str_internals", issue = "none")]
pub fn debug(&self) -> Debug<'_> {
Debug(self.source)
}
}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl<'a> Iterator for Utf8Chunks<'a> {
type Item = Utf8Chunk<'a>;

Expand Down Expand Up @@ -259,10 +285,10 @@ impl<'a> Iterator for Utf8Chunks<'a> {
}
}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl FusedIterator for Utf8Chunks<'_> {}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl fmt::Debug for Utf8Chunks<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
Expand Down
2 changes: 1 addition & 1 deletion library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::slice::{self, SliceIndex};
pub mod pattern;

mod lossy;
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use lossy::{Utf8Chunk, Utf8Chunks};

#[stable(feature = "rust1", since = "1.0.0")]
Expand Down
1 change: 0 additions & 1 deletion library/core/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,6 @@
#![feature(error_generic_member_access)]
#![feature(error_in_core)]
#![feature(trait_upcasting)]
#![feature(utf8_chunks)]
#![feature(is_ascii_octdigit)]
#![feature(get_many_mut)]
#![feature(iter_map_windows)]
Expand Down
6 changes: 2 additions & 4 deletions library/core/tests/str_lossy.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
use core::str::Utf8Chunks;

#[test]
fn chunks() {
macro_rules! assert_chunks {
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
let mut iter = Utf8Chunks::new($string);
let mut iter = $string.utf8_chunks();
$(
let chunk = iter.next().expect("missing chunk");
assert_eq!($valid, chunk.valid());
Expand Down Expand Up @@ -79,7 +77,7 @@ fn debug() {
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
&format!(
"{:?}",
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),
b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa".utf8_chunks().debug(),
),
);
}
1 change: 0 additions & 1 deletion library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,6 @@
#![feature(thread_local)]
#![feature(try_blocks)]
#![feature(type_alias_impl_trait)]
#![feature(utf8_chunks)]
// tidy-alphabetical-end
//
// Library features (core):
Expand Down
6 changes: 2 additions & 4 deletions library/std/src/sys/os_str/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ use crate::str;
use crate::sync::Arc;
use crate::sys_common::{AsInner, IntoInner};

use core::str::Utf8Chunks;

#[cfg(test)]
mod tests;

Expand All @@ -29,7 +27,7 @@ pub struct Slice {

impl fmt::Debug for Slice {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
}
}

Expand All @@ -41,7 +39,7 @@ impl fmt::Display for Slice {
return "".fmt(f);
}

for chunk in Utf8Chunks::new(&self.inner) {
for chunk in self.inner.utf8_chunks() {
let valid = chunk.valid();
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
Expand Down
Loading