@@ -125,13 +125,14 @@ Section: Creating a string
125
125
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
126
126
pub struct Utf8Error {
127
127
// Byte index up to which the input was verified to be valid UTF-8;
// returned unchanged by `valid_up_to()`.
valid_up_to : usize ,
128
// `Some(len)`: length in bytes of the invalid sequence that starts at `valid_up_to`.
// `None`: the input ended in the middle of a byte sequence.
// Exposed through `error_len()`, which widens the value to `usize`.
+ error_len : Option < u8 > ,
128
129
}
129
130
130
131
impl Utf8Error {
131
132
/// Returns the index in the given string up to which valid UTF-8 was
132
133
/// verified.
133
134
///
134
- /// It is the maximum index such that `from_utf8(input[..index])`
135
+ /// It is the maximum index such that `from_utf8(& input[..index])`
135
136
/// would return `Ok(_)`.
136
137
///
137
138
/// # Examples
@@ -152,6 +153,23 @@ impl Utf8Error {
152
153
/// ```
153
154
#[ stable( feature = "utf8_error" , since = "1.5.0" ) ]
154
155
pub fn valid_up_to ( & self ) -> usize { self . valid_up_to }
156
+
157
+ /// Provides more information about the failure:
158
+ ///
159
+ /// * `None`: the end of the input was reached unexpectedly.
160
+ /// `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
161
+ /// If a byte stream (such as a file or a network socket) is being decoded incrementally,
162
+ /// this could be a valid `char` whose UTF-8 byte sequence spans multiple chunks.
163
+ ///
164
+ /// * `Some(len)`: an unexpected byte was encountered.
165
+ /// The length provided is that of the invalid byte sequence
166
+ /// that starts at the index given by `valid_up_to()`.
167
+ /// In case of lossy decoding, decoding should resume after that sequence
168
+ /// (after inserting a U+FFFD REPLACEMENT CHARACTER).
169
+ #[ unstable( feature = "utf8_error_error_len" , reason ="new" , issue = "40494" ) ]
170
+ pub fn error_len ( & self ) -> Option < usize > {
171
// The length is stored as a `u8`; widening it to `usize` is lossless.
+ self . error_len . map ( |len| len as usize )
172
+ }
155
173
}
156
174
157
175
/// Converts a slice of bytes to a string slice.
@@ -300,7 +318,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
300
318
// Formats a `Utf8Error` as a human-readable message. The message depends on
// whether the length of the offending byte sequence was recorded.
#[ stable( feature = "rust1" , since = "1.0.0" ) ]
301
319
impl fmt:: Display for Utf8Error {
302
320
fn fmt ( & self , f : & mut fmt:: Formatter ) -> fmt:: Result {
303
- write ! ( f, "invalid utf-8: invalid byte near index {}" , self . valid_up_to)
321
// A recorded length means an invalid sequence was found: report its
// size in bytes and the index at which it starts.
+ if let Some ( error_len) = self . error_len {
322
+ write ! ( f, "invalid utf-8 sequence of {} bytes from index {}" ,
323
+ error_len, self . valid_up_to)
324
+ } else {
// No recorded length: the input ended part-way through a sequence
// that begins at `valid_up_to`.
325
+ write ! ( f, "incomplete utf-8 byte sequence from index {}" , self . valid_up_to)
326
+ }
304
327
}
305
328
}
306
329
@@ -1241,25 +1264,27 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1241
1264
1242
1265
while index < len {
1243
1266
let old_offset = index;
1244
- macro_rules! err { ( ) => { {
1245
- return Err ( Utf8Error {
1246
- valid_up_to: old_offset
1247
- } )
1248
- } } }
1267
+ macro_rules! err {
1268
+ ( $error_len: expr) => {
1269
+ return Err ( Utf8Error {
1270
+ valid_up_to: old_offset,
1271
+ error_len: $error_len,
1272
+ } )
1273
+ }
1274
+ }
1249
1275
1250
1276
macro_rules! next { ( ) => { {
1251
1277
index += 1 ;
1252
1278
// we needed data, but there was none: error!
1253
1279
if index >= len {
1254
- err!( )
1280
+ err!( None )
1255
1281
}
1256
1282
v[ index]
1257
1283
} } }
1258
1284
1259
1285
let first = v[ index] ;
1260
1286
if first >= 128 {
1261
1287
let w = UTF8_CHAR_WIDTH [ first as usize ] ;
1262
- let second = next ! ( ) ;
1263
1288
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
1264
1289
// first C2 80 last DF BF
1265
1290
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
@@ -1279,25 +1304,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1279
1304
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
1280
1305
// %xF4 %x80-8F 2( UTF8-tail )
1281
1306
match w {
1282
- 2 => if second & !CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
1307
+ 2 => if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1308
+ err ! ( Some ( 1 ) )
1309
+ } ,
1283
1310
3 => {
1284
- match ( first, second, next ! ( ) & !CONT_MASK ) {
1285
- ( 0xE0 , 0xA0 ... 0xBF , TAG_CONT_U8 ) |
1286
- ( 0xE1 ... 0xEC , 0x80 ... 0xBF , TAG_CONT_U8 ) |
1287
- ( 0xED , 0x80 ... 0x9F , TAG_CONT_U8 ) |
1288
- ( 0xEE ... 0xEF , 0x80 ... 0xBF , TAG_CONT_U8 ) => { }
1289
- _ => err ! ( )
1311
+ match ( first, next ! ( ) ) {
1312
+ ( 0xE0 , 0xA0 ... 0xBF ) |
1313
+ ( 0xE1 ... 0xEC , 0x80 ... 0xBF ) |
1314
+ ( 0xED , 0x80 ... 0x9F ) |
1315
+ ( 0xEE ... 0xEF , 0x80 ... 0xBF ) => { }
1316
+ _ => err ! ( Some ( 1 ) )
1317
+ }
1318
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1319
+ err ! ( Some ( 2 ) )
1290
1320
}
1291
1321
}
1292
1322
4 => {
1293
- match ( first, second, next ! ( ) & !CONT_MASK , next ! ( ) & !CONT_MASK ) {
1294
- ( 0xF0 , 0x90 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1295
- ( 0xF1 ... 0xF3 , 0x80 ... 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
1296
- ( 0xF4 , 0x80 ... 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
1297
- _ => err ! ( )
1323
+ match ( first, next ! ( ) ) {
1324
+ ( 0xF0 , 0x90 ... 0xBF ) |
1325
+ ( 0xF1 ... 0xF3 , 0x80 ... 0xBF ) |
1326
+ ( 0xF4 , 0x80 ... 0x8F ) => { }
1327
+ _ => err ! ( Some ( 1 ) )
1328
+ }
1329
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1330
+ err ! ( Some ( 2 ) )
1331
+ }
1332
+ if next ! ( ) & !CONT_MASK != TAG_CONT_U8 {
1333
+ err ! ( Some ( 3 ) )
1298
1334
}
1299
1335
}
1300
- _ => err ! ( )
1336
+ _ => err ! ( Some ( 1 ) )
1301
1337
}
1302
1338
index += 1 ;
1303
1339
} else {
0 commit comments