Skip to content

Commit b153219

Browse files
blake2-ppcthestinger
blake2-ppc
authored and committed
std::str: Deny surrogates in is_utf8
Reject codepoints \uD800 to \uDFFF, which are the surrogates (reserved/unused codepoints that are invalid to encode into UTF-8). The surrogates are the only hole of invalid codepoints in the range from \u0 to \u10FFFF.
1 parent b49e9fa commit b153219

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

src/libstd/str.rs

+12-3
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,8 @@ pub fn is_utf8(v: &[u8]) -> bool {
799799
// first C2 80 last DF BF
800800
// 3-byte encoding is for codepoints \u0800 to \uffff
801801
// first E0 A0 80 last EF BF BF
802+
// excluding surrogates codepoints \ud800 to \udfff
803+
// ED A0 80 to ED BF BF
802804
// 4-byte encoding is for codepoints \u10000 to \u10ffff
803805
// first F0 90 80 80 last F4 8F BF BF
804806
//
@@ -812,8 +814,6 @@ pub fn is_utf8(v: &[u8]) -> bool {
812814
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
813815
// %xF4 %x80-8F 2( UTF8-tail )
814816
// UTF8-tail = %x80-BF
815-
// --
816-
// This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
817817
match w {
818818
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
819819
return false
@@ -822,7 +822,9 @@ pub fn is_utf8(v: &[u8]) -> bool {
822822
unsafe_get(v, i + 1),
823823
unsafe_get(v, i + 2) & 192u8) {
824824
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
825-
(0xE1 .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
825+
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826+
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
827+
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
826828
_ => return false,
827829
},
828830
_ => match (v_i,
@@ -3012,6 +3014,7 @@ mod tests {
30123014
30133015
#[test]
30143016
fn test_is_utf8() {
3017+
// deny overlong encodings
30153018
assert!(!is_utf8([0xc0, 0x80]));
30163019
assert!(!is_utf8([0xc0, 0xae]));
30173020
assert!(!is_utf8([0xe0, 0x80, 0x80]));
@@ -3020,9 +3023,15 @@ mod tests {
30203023
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
30213024
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
30223025
3026+
// deny surrogates
3027+
assert!(!is_utf8([0xED, 0xA0, 0x80]));
3028+
assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3029+
30233030
assert!(is_utf8([0xC2, 0x80]));
30243031
assert!(is_utf8([0xDF, 0xBF]));
30253032
assert!(is_utf8([0xE0, 0xA0, 0x80]));
3033+
assert!(is_utf8([0xED, 0x9F, 0xBF]));
3034+
assert!(is_utf8([0xEE, 0x80, 0x80]));
30263035
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
30273036
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
30283037
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));

0 commit comments

Comments
 (0)