Skip to content

Commit 46226a7

Browse files
committed
Yield Err in char::decode_utf8 per Unicode, like String::from_utf8_lossy
1 parent 892bf3d commit 46226a7

File tree

2 files changed

+89
-14
lines changed

2 files changed

+89
-14
lines changed

src/libcore/char.rs

+69-13
Original file line numberDiff line numberDiff line change
@@ -737,25 +737,81 @@ pub struct InvalidSequence(());
737737
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
738738
type Item = Result<char, InvalidSequence>;
739739
#[inline]
740+
740741
fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
741-
self.0.next().map(|b| {
742-
if b & 0x80 == 0 { Ok(b as char) } else {
743-
let l = (!b).leading_zeros() as usize; // number of bytes in UTF-8 representation
744-
if l < 2 || l > 6 { return Err(InvalidSequence(())) };
745-
let mut x = (b as u32) & (0x7F >> l);
746-
for _ in 0..l-1 {
742+
self.0.next().map(|first_byte| {
743+
// Emit InvalidSequence according to
744+
// Unicode §5.22 Best Practice for U+FFFD Substitution
745+
// http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630
746+
747+
// Roughly: consume at least one byte,
748+
// then validate one byte at a time and stop before the first unexpected byte
749+
// (which might be the valid start of the next byte sequence).
750+
751+
let mut code_point;
752+
macro_rules! first_byte {
753+
($mask: expr) => {
754+
code_point = u32::from(first_byte & $mask)
755+
}
756+
}
757+
macro_rules! continuation_byte {
758+
() => { continuation_byte!(0x80...0xBF) };
759+
($range: pat) => {
747760
match self.0.peek() {
748-
Some(&b) if b & 0xC0 == 0x80 => {
761+
Some(&byte @ $range) => {
762+
code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
749763
self.0.next();
750-
x = (x << 6) | (b as u32) & 0x3F;
751-
},
752-
_ => return Err(InvalidSequence(())),
764+
}
765+
_ => return Err(InvalidSequence(()))
753766
}
754767
}
755-
match from_u32(x) {
756-
Some(x) if l == x.len_utf8() => Ok(x),
757-
_ => Err(InvalidSequence(())),
768+
}
769+
770+
match first_byte {
771+
0x00...0x7F => {
772+
first_byte!(0b1111_1111);
773+
}
774+
0xC2...0xDF => {
775+
first_byte!(0b0001_1111);
776+
continuation_byte!();
777+
}
778+
0xE0 => {
779+
first_byte!(0b0000_1111);
780+
continuation_byte!(0xA0...0xBF); // 0x80...0x9F here are overlong
781+
continuation_byte!();
758782
}
783+
0xE1...0xEC | 0xEE...0xEF => {
784+
first_byte!(0b0000_1111);
785+
continuation_byte!();
786+
continuation_byte!();
787+
}
788+
0xED => {
789+
first_byte!(0b0000_1111);
790+
continuation_byte!(0x80...0x9F); // 0xA0..0xBF here are surrogates
791+
continuation_byte!();
792+
}
793+
0xF0 => {
794+
first_byte!(0b0000_0111);
795+
continuation_byte!(0x90...0xBF); // 0x80..0x8F here are overlong
796+
continuation_byte!();
797+
continuation_byte!();
798+
}
799+
0xF1...0xF3 => {
800+
first_byte!(0b0000_0111);
801+
continuation_byte!();
802+
continuation_byte!();
803+
continuation_byte!();
804+
}
805+
0xF4 => {
806+
first_byte!(0b0000_0111);
807+
continuation_byte!(0x80...0x8F); // 0x90..0xBF here are beyond char::MAX
808+
continuation_byte!();
809+
continuation_byte!();
810+
}
811+
_ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX
812+
}
813+
unsafe {
814+
Ok(from_u32_unchecked(code_point))
759815
}
760816
})
761817
}

src/libcoretest/char.rs

+20-1
Original file line numberDiff line numberDiff line change
@@ -367,12 +367,13 @@ fn test_decode_utf8() {
367367
assert_eq!(s, $expected_str,
368368
"input bytes: {:?}, expected str: {:?}, result: {:?}",
369369
input_bytes, $expected_str, s);
370+
assert_eq!(String::from_utf8_lossy(&$input_bytes), $expected_str);
370371
}
371372
}
372373

373374
assert_decode_utf8!([], "");
374375
assert_decode_utf8!([0x41], "A");
375-
assert_decode_utf8!([0xC1, 0x81], "�");
376+
assert_decode_utf8!([0xC1, 0x81], "��");
376377
assert_decode_utf8!([0xE2, 0x99, 0xA5], "♥");
377378
assert_decode_utf8!([0xE2, 0x99, 0xA5, 0x41], "♥A");
378379
assert_decode_utf8!([0xE2, 0x99], "�");
@@ -385,4 +386,22 @@ fn test_decode_utf8() {
385386
assert_decode_utf8!([0xFE, 0x41], "�A");
386387
assert_decode_utf8!([0xFF], "�");
387388
assert_decode_utf8!([0xFF, 0x41], "�A");
389+
assert_decode_utf8!([0xC0, 0x80], "��");
390+
391+
// Surrogates
392+
assert_decode_utf8!([0xED, 0x9F, 0xBF], "\u{D7FF}");
393+
assert_decode_utf8!([0xED, 0xA0, 0x80], "���");
394+
assert_decode_utf8!([0xED, 0xBF, 0x80], "���");
395+
assert_decode_utf8!([0xEE, 0x80, 0x80], "\u{E000}");
396+
397+
// char::MAX
398+
assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0xBF], "\u{10FFFF}");
399+
assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0x41], "�A");
400+
assert_decode_utf8!([0xF4, 0x90, 0x80, 0x80], "����");
401+
402+
// 5- and 6-byte sequences
403+
// Part of the original design of UTF-8,
404+
// but invalid now that UTF-8 is artificially restricted to match the range of UTF-16.
405+
assert_decode_utf8!([0xF8, 0x80, 0x80, 0x80, 0x80], "�����");
406+
assert_decode_utf8!([0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], "������");
388407
}

0 commit comments

Comments
 (0)