|
1 |
| -use crate::char; |
2 |
| -use crate::fmt::{self, Write}; |
3 |
| -use crate::mem; |
| 1 | +use crate::fmt; |
| 2 | +use crate::fmt::Formatter; |
| 3 | +use crate::fmt::Write; |
| 4 | +use crate::iter::FusedIterator; |
4 | 5 |
|
5 | 6 | use super::from_utf8_unchecked;
|
6 | 7 | use super::validations::utf8_char_width;
|
7 | 8 |
|
8 |
| -/// Lossy UTF-8 string. |
9 |
| -#[unstable(feature = "str_internals", issue = "none")] |
10 |
| -pub struct Utf8Lossy { |
11 |
| - bytes: [u8], |
| 9 | +/// An item returned by the [`Utf8Chunks`] iterator. |
| 10 | +/// |
| 11 | +/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character |
| 12 | +/// when decoding a UTF-8 string. |
| 13 | +/// |
| 14 | +/// # Examples |
| 15 | +/// |
| 16 | +/// ``` |
| 17 | +/// #![feature(utf8_chunks)] |
| 18 | +/// |
| 19 | +/// use std::str::Utf8Chunks; |
| 20 | +/// |
| 21 | +/// // An invalid UTF-8 string |
| 22 | +/// let bytes = b"foo\xF1\x80bar"; |
| 23 | +/// |
| 24 | +/// // Decode the first `Utf8Chunk` |
| 25 | +/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); |
| 26 | +/// |
| 27 | +/// // The first three characters are valid UTF-8 |
| 28 | +/// assert_eq!("foo", chunk.valid()); |
| 29 | +/// |
| 30 | +/// // The fourth character is broken |
| 31 | +/// assert_eq!(b"\xF1\x80", chunk.invalid()); |
| 32 | +/// ``` |
| 33 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 34 | +#[derive(Clone, Debug, PartialEq, Eq)] |
| 35 | +pub struct Utf8Chunk<'a> { |
| 36 | + valid: &'a str, |
| 37 | + invalid: &'a [u8], |
12 | 38 | }
|
13 | 39 |
|
14 |
| -impl Utf8Lossy { |
| 40 | +impl<'a> Utf8Chunk<'a> { |
| 41 | + /// Returns the next validated UTF-8 substring. |
| 42 | + /// |
| 43 | + /// This substring can be empty at the start of the string or between |
| 44 | + /// broken UTF-8 characters. |
15 | 45 | #[must_use]
|
16 |
| - pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy { |
17 |
| - // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required. |
18 |
| - unsafe { mem::transmute(bytes) } |
| 46 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 47 | + pub fn valid(&self) -> &'a str { |
| 48 | + self.valid |
19 | 49 | }
|
20 | 50 |
|
21 |
| - pub fn chunks(&self) -> Utf8LossyChunksIter<'_> { |
22 |
| - Utf8LossyChunksIter { source: &self.bytes } |
| 51 | + /// Returns the invalid sequence that caused a failure. |
| 52 | + /// |
| 53 | + /// The returned slice will have a maximum length of 3 and starts after the |
| 54 | + /// substring given by [`valid`]. Decoding will resume after this sequence. |
| 55 | + /// |
| 56 | + /// If empty, this is the last chunk in the string. If non-empty, an |
| 57 | + /// unexpected byte was encountered or the end of the input was reached |
| 58 | + /// unexpectedly. |
| 59 | + /// |
| 60 | + /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT |
| 61 | + /// CHARACTER`]. |
| 62 | + /// |
| 63 | + /// [`valid`]: Self::valid |
| 64 | + /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER |
| 65 | + #[must_use] |
| 66 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 67 | + pub fn invalid(&self) -> &'a [u8] { |
| 68 | + self.invalid |
23 | 69 | }
|
24 | 70 | }
|
25 | 71 |
|
26 |
| -/// Iterator over lossy UTF-8 string |
27 |
| -#[must_use = "iterators are lazy and do nothing unless consumed"] |
| 72 | +#[must_use] |
| 73 | +#[unstable(feature = "str_internals", issue = "none")] |
| 74 | +pub struct Debug<'a>(&'a [u8]); |
| 75 | + |
28 | 76 | #[unstable(feature = "str_internals", issue = "none")]
|
29 |
| -#[allow(missing_debug_implementations)] |
30 |
| -pub struct Utf8LossyChunksIter<'a> { |
| 77 | +impl fmt::Debug for Debug<'_> { |
| 78 | + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 79 | + f.write_char('"')?; |
| 80 | + |
| 81 | + for chunk in Utf8Chunks::new(self.0) { |
| 82 | + // Valid part. |
| 83 | + // Here we partially parse UTF-8 again which is suboptimal. |
| 84 | + { |
| 85 | + let valid = chunk.valid(); |
| 86 | + let mut from = 0; |
| 87 | + for (i, c) in valid.char_indices() { |
| 88 | + let esc = c.escape_debug(); |
| 89 | + // If char needs escaping, flush backlog so far and write, else skip |
| 90 | + if esc.len() != 1 { |
| 91 | + f.write_str(&valid[from..i])?; |
| 92 | + for c in esc { |
| 93 | + f.write_char(c)?; |
| 94 | + } |
| 95 | + from = i + c.len_utf8(); |
| 96 | + } |
| 97 | + } |
| 98 | + f.write_str(&valid[from..])?; |
| 99 | + } |
| 100 | + |
| 101 | + // Broken parts of string as hex escape. |
| 102 | + for &b in chunk.invalid() { |
| 103 | + write!(f, "\\x{:02X}", b)?; |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + f.write_char('"') |
| 108 | + } |
| 109 | +} |
| 110 | + |
| 111 | +/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices |
| 112 | +/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]). |
| 113 | +/// |
| 114 | +/// If you want a simple conversion from UTF-8 byte slices to string slices, |
| 115 | +/// [`from_utf8`] is easier to use. |
| 116 | +/// |
| 117 | +/// [byteslice]: slice |
| 118 | +/// [`from_utf8`]: super::from_utf8 |
| 119 | +/// |
| 120 | +/// # Examples |
| 121 | +/// |
| 122 | +/// This can be used to create functionality similar to |
| 123 | +/// [`String::from_utf8_lossy`] without allocating heap memory: |
| 124 | +/// |
| 125 | +/// ``` |
| 126 | +/// #![feature(utf8_chunks)] |
| 127 | +/// |
| 128 | +/// use std::str::Utf8Chunks; |
| 129 | +/// |
| 130 | +/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { |
| 131 | +/// for chunk in Utf8Chunks::new(input) { |
| 132 | +/// push(chunk.valid()); |
| 133 | +/// |
| 134 | +/// if !chunk.invalid().is_empty() { |
| 135 | +/// push("\u{FFFD}"); |
| 136 | +/// } |
| 137 | +/// } |
| 138 | +/// } |
| 139 | +/// ``` |
| 140 | +/// |
| 141 | +/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy |
| 142 | +#[must_use = "iterators are lazy and do nothing unless consumed"] |
| 143 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 144 | +#[derive(Clone)] |
| 145 | +pub struct Utf8Chunks<'a> { |
31 | 146 | source: &'a [u8],
|
32 | 147 | }
|
33 | 148 |
|
34 |
| -#[unstable(feature = "str_internals", issue = "none")] |
35 |
| -#[derive(PartialEq, Eq, Debug)] |
36 |
| -pub struct Utf8LossyChunk<'a> { |
37 |
| - /// Sequence of valid chars. |
38 |
| - /// Can be empty between broken UTF-8 chars. |
39 |
| - pub valid: &'a str, |
40 |
| - /// Single broken char, empty if none. |
41 |
| - /// Empty iff iterator item is last. |
42 |
| - pub broken: &'a [u8], |
| 149 | +impl<'a> Utf8Chunks<'a> { |
| 150 | + /// Creates a new iterator to decode the bytes. |
| 151 | + #[unstable(feature = "utf8_chunks", issue = "99543")] |
| 152 | + pub fn new(bytes: &'a [u8]) -> Self { |
| 153 | + Self { source: bytes } |
| 154 | + } |
| 155 | + |
| 156 | + #[doc(hidden)] |
| 157 | + #[unstable(feature = "str_internals", issue = "none")] |
| 158 | + pub fn debug(&self) -> Debug<'_> { |
| 159 | + Debug(self.source) |
| 160 | + } |
43 | 161 | }
|
44 | 162 |
|
45 |
| -impl<'a> Iterator for Utf8LossyChunksIter<'a> { |
46 |
| - type Item = Utf8LossyChunk<'a>; |
| 163 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 164 | +impl<'a> Iterator for Utf8Chunks<'a> { |
| 165 | + type Item = Utf8Chunk<'a>; |
47 | 166 |
|
48 |
| - fn next(&mut self) -> Option<Utf8LossyChunk<'a>> { |
| 167 | + fn next(&mut self) -> Option<Utf8Chunk<'a>> { |
49 | 168 | if self.source.is_empty() {
|
50 | 169 | return None;
|
51 | 170 | }
|
@@ -130,71 +249,22 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
|
130 | 249 |
|
131 | 250 | // SAFETY: `valid_up_to <= i` because it is only ever assigned via
|
132 | 251 | // `valid_up_to = i` and `i` only increases.
|
133 |
| - let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
| 252 | + let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) }; |
134 | 253 |
|
135 |
| - Some(Utf8LossyChunk { |
| 254 | + Some(Utf8Chunk { |
136 | 255 | // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
|
137 | 256 | valid: unsafe { from_utf8_unchecked(valid) },
|
138 |
| - broken, |
| 257 | + invalid, |
139 | 258 | })
|
140 | 259 | }
|
141 | 260 | }
|
142 | 261 |
|
143 |
| -impl fmt::Display for Utf8Lossy { |
144 |
| - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
145 |
| - // If we're the empty string then our iterator won't actually yield |
146 |
| - // anything, so perform the formatting manually |
147 |
| - if self.bytes.is_empty() { |
148 |
| - return "".fmt(f); |
149 |
| - } |
150 |
| - |
151 |
| - for Utf8LossyChunk { valid, broken } in self.chunks() { |
152 |
| - // If we successfully decoded the whole chunk as a valid string then |
153 |
| - // we can return a direct formatting of the string which will also |
154 |
| - // respect various formatting flags if possible. |
155 |
| - if valid.len() == self.bytes.len() { |
156 |
| - assert!(broken.is_empty()); |
157 |
| - return valid.fmt(f); |
158 |
| - } |
159 |
| - |
160 |
| - f.write_str(valid)?; |
161 |
| - if !broken.is_empty() { |
162 |
| - f.write_char(char::REPLACEMENT_CHARACTER)?; |
163 |
| - } |
164 |
| - } |
165 |
| - Ok(()) |
166 |
| - } |
167 |
| -} |
168 |
| - |
169 |
| -impl fmt::Debug for Utf8Lossy { |
170 |
| - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
171 |
| - f.write_char('"')?; |
| 262 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 263 | +impl FusedIterator for Utf8Chunks<'_> {} |
172 | 264 |
|
173 |
| - for Utf8LossyChunk { valid, broken } in self.chunks() { |
174 |
| - // Valid part. |
175 |
| - // Here we partially parse UTF-8 again which is suboptimal. |
176 |
| - { |
177 |
| - let mut from = 0; |
178 |
| - for (i, c) in valid.char_indices() { |
179 |
| - let esc = c.escape_debug(); |
180 |
| - // If char needs escaping, flush backlog so far and write, else skip |
181 |
| - if esc.len() != 1 { |
182 |
| - f.write_str(&valid[from..i])?; |
183 |
| - for c in esc { |
184 |
| - f.write_char(c)?; |
185 |
| - } |
186 |
| - from = i + c.len_utf8(); |
187 |
| - } |
188 |
| - } |
189 |
| - f.write_str(&valid[from..])?; |
190 |
| - } |
191 |
| - |
192 |
| - // Broken parts of string as hex escape. |
193 |
| - for &b in broken { |
194 |
| - write!(f, "\\x{:02x}", b)?; |
195 |
| - } |
196 |
| - } |
197 |
| - |
198 |
| - f.write_char('"') |
| 265 | +#[unstable(feature = "utf8_chunks", issue = "99543")] |
| 266 | +impl fmt::Debug for Utf8Chunks<'_> { |
| 267 | + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
| 268 | + f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() |
199 | 269 | }
|
200 | 270 | }
|
0 commit comments