Skip to content

Commit d499065

Browse files
authored
Rollup merge of #99544 - dylni:expose-utf8lossy, r=Mark-Simulacrum
Expose `Utf8Lossy` as `Utf8Chunks`. This PR changes the feature for `Utf8Lossy` from `str_internals` to `utf8_chunks` and improves the API. This is done to eventually expose the API as stable. Proposal: rust-lang/libs-team#54. Tracking Issue: #99543.
2 parents 2be85b0 + e8ee0b7 commit d499065

File tree

10 files changed

+273
-184
lines changed

10 files changed

+273
-184
lines changed

library/alloc/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@
145145
#![feature(unchecked_math)]
146146
#![feature(unicode_internals)]
147147
#![feature(unsize)]
148+
#![feature(utf8_chunks)]
148149
#![feature(std_internals)]
149150
//
150151
// Language features:

library/alloc/src/str.rs

+2
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ pub use core::str::{RSplit, Split};
7171
pub use core::str::{RSplitN, SplitN};
7272
#[stable(feature = "rust1", since = "1.0.0")]
7373
pub use core::str::{RSplitTerminator, SplitTerminator};
74+
#[unstable(feature = "utf8_chunks", issue = "99543")]
75+
pub use core::str::{Utf8Chunk, Utf8Chunks};
7476

7577
/// Note: `str` in `Concat<str>` is not meaningful here.
7678
/// This type parameter of the trait only exists to enable another impl.

library/alloc/src/string.rs

+8-8
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ use core::ops::Bound::{Excluded, Included, Unbounded};
5858
use core::ops::{self, Index, IndexMut, Range, RangeBounds};
5959
use core::ptr;
6060
use core::slice;
61-
#[cfg(not(no_global_oom_handling))]
62-
use core::str::lossy;
6361
use core::str::pattern::Pattern;
62+
#[cfg(not(no_global_oom_handling))]
63+
use core::str::Utf8Chunks;
6464

6565
#[cfg(not(no_global_oom_handling))]
6666
use crate::borrow::{Cow, ToOwned};
@@ -628,11 +628,11 @@ impl String {
628628
#[cfg(not(no_global_oom_handling))]
629629
#[stable(feature = "rust1", since = "1.0.0")]
630630
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
631-
let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks();
631+
let mut iter = Utf8Chunks::new(v);
632632

633633
let first_valid = if let Some(chunk) = iter.next() {
634-
let lossy::Utf8LossyChunk { valid, broken } = chunk;
635-
if broken.is_empty() {
634+
let valid = chunk.valid();
635+
if chunk.invalid().is_empty() {
636636
debug_assert_eq!(valid.len(), v.len());
637637
return Cow::Borrowed(valid);
638638
}
@@ -647,9 +647,9 @@ impl String {
647647
res.push_str(first_valid);
648648
res.push_str(REPLACEMENT);
649649

650-
for lossy::Utf8LossyChunk { valid, broken } in iter {
651-
res.push_str(valid);
652-
if !broken.is_empty() {
650+
for chunk in iter {
651+
res.push_str(chunk.valid());
652+
if !chunk.invalid().is_empty() {
653653
res.push_str(REPLACEMENT);
654654
}
655655
}

library/core/src/str/lossy.rs

+157-87
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,170 @@
1-
use crate::char;
2-
use crate::fmt::{self, Write};
3-
use crate::mem;
1+
use crate::fmt;
2+
use crate::fmt::Formatter;
3+
use crate::fmt::Write;
4+
use crate::iter::FusedIterator;
45

56
use super::from_utf8_unchecked;
67
use super::validations::utf8_char_width;
78

8-
/// Lossy UTF-8 string.
9-
#[unstable(feature = "str_internals", issue = "none")]
10-
pub struct Utf8Lossy {
11-
bytes: [u8],
9+
/// An item returned by the [`Utf8Chunks`] iterator.
10+
///
11+
/// A `Utf8Chunk` stores a sequence of [`u8`] up to and including the first
12+
/// broken character when decoding a UTF-8 string.
13+
///
14+
/// # Examples
15+
///
16+
/// ```
17+
/// #![feature(utf8_chunks)]
18+
///
19+
/// use std::str::Utf8Chunks;
20+
///
21+
/// // An invalid UTF-8 string
22+
/// let bytes = b"foo\xF1\x80bar";
23+
///
24+
/// // Decode the first `Utf8Chunk`
25+
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
26+
///
27+
/// // The first three characters are valid UTF-8
28+
/// assert_eq!("foo", chunk.valid());
29+
///
30+
/// // The fourth character is broken
31+
/// assert_eq!(b"\xF1\x80", chunk.invalid());
32+
/// ```
33+
#[unstable(feature = "utf8_chunks", issue = "99543")]
34+
#[derive(Clone, Debug, PartialEq, Eq)]
35+
pub struct Utf8Chunk<'a> {
36+
valid: &'a str,
37+
invalid: &'a [u8],
1238
}
1339

14-
impl Utf8Lossy {
40+
impl<'a> Utf8Chunk<'a> {
41+
/// Returns the next validated UTF-8 substring.
42+
///
43+
/// This substring can be empty at the start of the string or between
44+
/// broken UTF-8 characters.
1545
#[must_use]
16-
pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
17-
// SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
18-
unsafe { mem::transmute(bytes) }
46+
#[unstable(feature = "utf8_chunks", issue = "99543")]
47+
pub fn valid(&self) -> &'a str {
48+
self.valid
1949
}
2050

21-
pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
22-
Utf8LossyChunksIter { source: &self.bytes }
51+
/// Returns the invalid sequence that caused a failure.
52+
///
53+
/// The returned slice has a maximum length of 3 and starts after the
54+
/// substring given by [`valid`]. Decoding will resume after this sequence.
55+
///
56+
/// If empty, this is the last chunk in the string. If non-empty, an
57+
/// unexpected byte was encountered or the end of the input was reached
58+
/// unexpectedly.
59+
///
60+
/// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
61+
/// CHARACTER`].
62+
///
63+
/// [`valid`]: Self::valid
64+
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
65+
#[must_use]
66+
#[unstable(feature = "utf8_chunks", issue = "99543")]
67+
pub fn invalid(&self) -> &'a [u8] {
68+
self.invalid
2369
}
2470
}
2571

26-
/// Iterator over lossy UTF-8 string
27-
#[must_use = "iterators are lazy and do nothing unless consumed"]
72+
#[must_use]
73+
#[unstable(feature = "str_internals", issue = "none")]
74+
pub struct Debug<'a>(&'a [u8]);
75+
2876
#[unstable(feature = "str_internals", issue = "none")]
29-
#[allow(missing_debug_implementations)]
30-
pub struct Utf8LossyChunksIter<'a> {
77+
impl fmt::Debug for Debug<'_> {
78+
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
79+
f.write_char('"')?;
80+
81+
for chunk in Utf8Chunks::new(self.0) {
82+
// Valid part.
83+
// Here we partially parse UTF-8 again which is suboptimal.
84+
{
85+
let valid = chunk.valid();
86+
let mut from = 0;
87+
for (i, c) in valid.char_indices() {
88+
let esc = c.escape_debug();
89+
// If char needs escaping, flush backlog so far and write, else skip
90+
if esc.len() != 1 {
91+
f.write_str(&valid[from..i])?;
92+
for c in esc {
93+
f.write_char(c)?;
94+
}
95+
from = i + c.len_utf8();
96+
}
97+
}
98+
f.write_str(&valid[from..])?;
99+
}
100+
101+
// Broken parts of string as hex escape.
102+
for &b in chunk.invalid() {
103+
write!(f, "\\x{:02X}", b)?;
104+
}
105+
}
106+
107+
f.write_char('"')
108+
}
109+
}
110+
111+
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
112+
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
113+
///
114+
/// If you want a simple conversion from UTF-8 byte slices to string slices,
115+
/// [`from_utf8`] is easier to use.
116+
///
117+
/// [byteslice]: slice
118+
/// [`from_utf8`]: super::from_utf8
119+
///
120+
/// # Examples
121+
///
122+
/// This can be used to create functionality similar to
123+
/// [`String::from_utf8_lossy`] without allocating heap memory:
124+
///
125+
/// ```
126+
/// #![feature(utf8_chunks)]
127+
///
128+
/// use std::str::Utf8Chunks;
129+
///
130+
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
131+
/// for chunk in Utf8Chunks::new(input) {
132+
/// push(chunk.valid());
133+
///
134+
/// if !chunk.invalid().is_empty() {
135+
/// push("\u{FFFD}");
136+
/// }
137+
/// }
138+
/// }
139+
/// ```
140+
///
141+
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
142+
#[must_use = "iterators are lazy and do nothing unless consumed"]
143+
#[unstable(feature = "utf8_chunks", issue = "99543")]
144+
#[derive(Clone)]
145+
pub struct Utf8Chunks<'a> {
31146
source: &'a [u8],
32147
}
33148

34-
#[unstable(feature = "str_internals", issue = "none")]
35-
#[derive(PartialEq, Eq, Debug)]
36-
pub struct Utf8LossyChunk<'a> {
37-
/// Sequence of valid chars.
38-
/// Can be empty between broken UTF-8 chars.
39-
pub valid: &'a str,
40-
/// Single broken char, empty if none.
41-
/// Empty iff iterator item is last.
42-
pub broken: &'a [u8],
149+
impl<'a> Utf8Chunks<'a> {
150+
/// Creates a new iterator to decode the bytes.
151+
#[unstable(feature = "utf8_chunks", issue = "99543")]
152+
pub fn new(bytes: &'a [u8]) -> Self {
153+
Self { source: bytes }
154+
}
155+
156+
#[doc(hidden)]
157+
#[unstable(feature = "str_internals", issue = "none")]
158+
pub fn debug(&self) -> Debug<'_> {
159+
Debug(self.source)
160+
}
43161
}
44162

45-
impl<'a> Iterator for Utf8LossyChunksIter<'a> {
46-
type Item = Utf8LossyChunk<'a>;
163+
#[unstable(feature = "utf8_chunks", issue = "99543")]
164+
impl<'a> Iterator for Utf8Chunks<'a> {
165+
type Item = Utf8Chunk<'a>;
47166

48-
fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
167+
fn next(&mut self) -> Option<Utf8Chunk<'a>> {
49168
if self.source.is_empty() {
50169
return None;
51170
}
@@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
130249

131250
// SAFETY: `valid_up_to <= i` because it is only ever assigned via
132251
// `valid_up_to = i` and `i` only increases.
133-
let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
252+
let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };
134253

135-
Some(Utf8LossyChunk {
254+
Some(Utf8Chunk {
136255
// SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
137256
valid: unsafe { from_utf8_unchecked(valid) },
138-
broken,
257+
invalid,
139258
})
140259
}
141260
}
142261

143-
impl fmt::Display for Utf8Lossy {
144-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
145-
// If we're the empty string then our iterator won't actually yield
146-
// anything, so perform the formatting manually
147-
if self.bytes.is_empty() {
148-
return "".fmt(f);
149-
}
150-
151-
for Utf8LossyChunk { valid, broken } in self.chunks() {
152-
// If we successfully decoded the whole chunk as a valid string then
153-
// we can return a direct formatting of the string which will also
154-
// respect various formatting flags if possible.
155-
if valid.len() == self.bytes.len() {
156-
assert!(broken.is_empty());
157-
return valid.fmt(f);
158-
}
159-
160-
f.write_str(valid)?;
161-
if !broken.is_empty() {
162-
f.write_char(char::REPLACEMENT_CHARACTER)?;
163-
}
164-
}
165-
Ok(())
166-
}
167-
}
168-
169-
impl fmt::Debug for Utf8Lossy {
170-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171-
f.write_char('"')?;
262+
#[unstable(feature = "utf8_chunks", issue = "99543")]
263+
impl FusedIterator for Utf8Chunks<'_> {}
172264

173-
for Utf8LossyChunk { valid, broken } in self.chunks() {
174-
// Valid part.
175-
// Here we partially parse UTF-8 again which is suboptimal.
176-
{
177-
let mut from = 0;
178-
for (i, c) in valid.char_indices() {
179-
let esc = c.escape_debug();
180-
// If char needs escaping, flush backlog so far and write, else skip
181-
if esc.len() != 1 {
182-
f.write_str(&valid[from..i])?;
183-
for c in esc {
184-
f.write_char(c)?;
185-
}
186-
from = i + c.len_utf8();
187-
}
188-
}
189-
f.write_str(&valid[from..])?;
190-
}
191-
192-
// Broken parts of string as hex escape.
193-
for &b in broken {
194-
write!(f, "\\x{:02x}", b)?;
195-
}
196-
}
197-
198-
f.write_char('"')
265+
#[unstable(feature = "utf8_chunks", issue = "99543")]
266+
impl fmt::Debug for Utf8Chunks<'_> {
267+
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
268+
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
199269
}
200270
}

library/core/src/str/mod.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ use crate::slice::{self, SliceIndex};
2222

2323
pub mod pattern;
2424

25-
#[unstable(feature = "str_internals", issue = "none")]
26-
#[allow(missing_docs)]
27-
pub mod lossy;
25+
mod lossy;
26+
#[unstable(feature = "utf8_chunks", issue = "99543")]
27+
pub use lossy::{Utf8Chunk, Utf8Chunks};
2828

2929
#[stable(feature = "rust1", since = "1.0.0")]
3030
pub use converts::{from_utf8, from_utf8_unchecked};

library/core/tests/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
#![feature(waker_getters)]
100100
#![feature(slice_flatten)]
101101
#![feature(provide_any)]
102+
#![feature(utf8_chunks)]
102103
#![deny(unsafe_op_in_unsafe_fn)]
103104

104105
extern crate test;

0 commit comments

Comments
 (0)