Skip to content

Commit 11a3376

Browse files
committed
Auto merge of #40212 - SimonSapin:utf8error-resume-from, r=aturon
Add Utf8Error::error_len, to help incremental and/or lossy decoding. Without this, code outside of the standard library needs to reimplement most of the logic of `from_utf8` to interpret the bytes after `valid_up_to()`.
2 parents 71d7b29 + 73370c5 commit 11a3376

File tree

3 files changed

+89
-22
lines changed

3 files changed

+89
-22
lines changed

src/libcollectionstest/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#![feature(test)]
2929
#![feature(unboxed_closures)]
3030
#![feature(unicode)]
31+
#![feature(utf8_error_error_len)]
3132

3233
extern crate collections;
3334
extern crate test;

src/libcollectionstest/str.rs

+30
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() {
540540
}
541541
}
542542

543+
#[test]
544+
fn from_utf8_error() {
545+
macro_rules! test {
546+
($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => {
547+
let error = from_utf8($input).unwrap_err();
548+
assert_eq!(error.valid_up_to(), $expected_valid_up_to);
549+
assert_eq!(error.error_len(), $expected_error_len);
550+
}
551+
}
552+
test!(b"A\xC3\xA9 \xFF ", 4, Some(1));
553+
test!(b"A\xC3\xA9 \x80 ", 4, Some(1));
554+
test!(b"A\xC3\xA9 \xC1 ", 4, Some(1));
555+
test!(b"A\xC3\xA9 \xC1", 4, Some(1));
556+
test!(b"A\xC3\xA9 \xC2", 4, None);
557+
test!(b"A\xC3\xA9 \xC2 ", 4, Some(1));
558+
test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1));
559+
test!(b"A\xC3\xA9 \xE0", 4, None);
560+
test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1));
561+
test!(b"A\xC3\xA9 \xE0\xA0", 4, None);
562+
test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2));
563+
test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2));
564+
test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1));
565+
test!(b"A\xC3\xA9 \xF1", 4, None);
566+
test!(b"A\xC3\xA9 \xF1\x80", 4, None);
567+
test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
568+
test!(b"A\xC3\xA9 \xF1 ", 4, Some(1));
569+
test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2));
570+
test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3));
571+
}
572+
543573
#[test]
544574
fn test_as_bytes() {
545575
// no null

src/libcore/str/mod.rs

+58-22
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,14 @@ Section: Creating a string
125125
#[stable(feature = "rust1", since = "1.0.0")]
126126
pub struct Utf8Error {
127127
valid_up_to: usize,
128+
error_len: Option<u8>,
128129
}
129130

130131
impl Utf8Error {
131132
/// Returns the index in the given string up to which valid UTF-8 was
132133
/// verified.
133134
///
134-
/// It is the maximum index such that `from_utf8(input[..index])`
135+
/// It is the maximum index such that `from_utf8(&input[..index])`
135136
/// would return `Ok(_)`.
136137
///
137138
/// # Examples
@@ -152,6 +153,23 @@ impl Utf8Error {
152153
/// ```
153154
#[stable(feature = "utf8_error", since = "1.5.0")]
154155
pub fn valid_up_to(&self) -> usize { self.valid_up_to }
156+
157+
/// Provides more information about the failure:
158+
///
159+
/// * `None`: the end of the input was reached unexpectedly.
160+
/// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161+
/// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162+
/// this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
163+
///
164+
/// * `Some(len)`: an unexpected byte was encountered.
165+
/// The length provided is that of the invalid byte sequence
166+
/// that starts at the index given by `valid_up_to()`.
167+
/// Decoding should resume after that sequence
168+
/// (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
169+
#[unstable(feature = "utf8_error_error_len", reason ="new", issue = "40494")]
170+
pub fn error_len(&self) -> Option<usize> {
171+
self.error_len.map(|len| len as usize)
172+
}
155173
}
156174

157175
/// Converts a slice of bytes to a string slice.
@@ -300,7 +318,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300318
#[stable(feature = "rust1", since = "1.0.0")]
301319
impl fmt::Display for Utf8Error {
302320
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
303-
write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
321+
if let Some(error_len) = self.error_len {
322+
write!(f, "invalid utf-8 sequence of {} bytes from index {}",
323+
error_len, self.valid_up_to)
324+
} else {
325+
write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
326+
}
304327
}
305328
}
306329

@@ -1241,25 +1264,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12411264

12421265
while index < len {
12431266
let old_offset = index;
1244-
macro_rules! err { () => {{
1245-
return Err(Utf8Error {
1246-
valid_up_to: old_offset
1247-
})
1248-
}}}
1267+
macro_rules! err {
1268+
($error_len: expr) => {
1269+
return Err(Utf8Error {
1270+
valid_up_to: old_offset,
1271+
error_len: $error_len,
1272+
})
1273+
}
1274+
}
12491275

12501276
macro_rules! next { () => {{
12511277
index += 1;
12521278
// we needed data, but there was none: error!
12531279
if index >= len {
1254-
err!()
1280+
err!(None)
12551281
}
12561282
v[index]
12571283
}}}
12581284

12591285
let first = v[index];
12601286
if first >= 128 {
12611287
let w = UTF8_CHAR_WIDTH[first as usize];
1262-
let second = next!();
12631288
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
12641289
// first C2 80 last DF BF
12651290
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1304,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
12791304
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
12801305
// %xF4 %x80-8F 2( UTF8-tail )
12811306
match w {
1282-
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
1307+
2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
1308+
err!(Some(1))
1309+
},
12831310
3 => {
1284-
match (first, second, next!() & !CONT_MASK) {
1285-
(0xE0 , 0xA0 ... 0xBF, TAG_CONT_U8) |
1286-
(0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
1287-
(0xED , 0x80 ... 0x9F, TAG_CONT_U8) |
1288-
(0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
1289-
_ => err!()
1311+
match (first, next!()) {
1312+
(0xE0 , 0xA0 ... 0xBF) |
1313+
(0xE1 ... 0xEC, 0x80 ... 0xBF) |
1314+
(0xED , 0x80 ... 0x9F) |
1315+
(0xEE ... 0xEF, 0x80 ... 0xBF) => {}
1316+
_ => err!(Some(1))
1317+
}
1318+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1319+
err!(Some(2))
12901320
}
12911321
}
12921322
4 => {
1293-
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
1294-
(0xF0 , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1295-
(0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1296-
(0xF4 , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
1297-
_ => err!()
1323+
match (first, next!()) {
1324+
(0xF0 , 0x90 ... 0xBF) |
1325+
(0xF1 ... 0xF3, 0x80 ... 0xBF) |
1326+
(0xF4 , 0x80 ... 0x8F) => {}
1327+
_ => err!(Some(1))
1328+
}
1329+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1330+
err!(Some(2))
1331+
}
1332+
if next!() & !CONT_MASK != TAG_CONT_U8 {
1333+
err!(Some(3))
12981334
}
12991335
}
1300-
_ => err!()
1336+
_ => err!(Some(1))
13011337
}
13021338
index += 1;
13031339
} else {

0 commit comments

Comments
 (0)