@@ -799,6 +799,8 @@ pub fn is_utf8(v: &[u8]) -> bool {
799
799
// first C2 80 last DF BF
800
800
// 3-byte encoding is for codepoints \u0800 to \uffff
801
801
// first E0 A0 80 last EF BF BF
802
+ // excluding surrogate codepoints \ud800 to \udfff
803
+ // ED A0 80 to ED BF BF
802
804
// 4-byte encoding is for codepoints \u10000 to \u10ffff
803
805
// first F0 90 80 80 last F4 8F BF BF
804
806
//
@@ -812,8 +814,6 @@ pub fn is_utf8(v: &[u8]) -> bool {
812
814
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
813
815
// %xF4 %x80-8F 2( UTF8-tail )
814
816
// UTF8-tail = %x80-BF
815
- // --
816
- // This code allows surrogate pairs: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
817
817
match w {
818
818
2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
819
819
return false
@@ -822,7 +822,9 @@ pub fn is_utf8(v: &[u8]) -> bool {
822
822
unsafe_get ( v, i + 1 ) ,
823
823
unsafe_get ( v, i + 2 ) & 192u8 ) {
824
824
( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
825
- ( 0xE1 .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
825
+ ( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
826
+ ( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) => ( ) ,
827
+ ( 0xEE .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
826
828
_ => return false ,
827
829
} ,
828
830
_ => match ( v_i,
@@ -3012,6 +3014,7 @@ mod tests {
3012
3014
3013
3015
#[test]
3014
3016
fn test_is_utf8() {
3017
+ // deny overlong encodings
3015
3018
assert!(!is_utf8([0xc0, 0x80]));
3016
3019
assert!(!is_utf8([0xc0, 0xae]));
3017
3020
assert!(!is_utf8([0xe0, 0x80, 0x80]));
@@ -3020,9 +3023,15 @@ mod tests {
3020
3023
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3021
3024
assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3022
3025
3026
+ // deny surrogates
3027
+ assert!(!is_utf8([0xED, 0xA0, 0x80]));
3028
+ assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3029
+
3023
3030
assert!(is_utf8([0xC2, 0x80]));
3024
3031
assert!(is_utf8([0xDF, 0xBF]));
3025
3032
assert!(is_utf8([0xE0, 0xA0, 0x80]));
3033
+ assert!(is_utf8([0xED, 0x9F, 0xBF]));
3034
+ assert!(is_utf8([0xEE, 0x80, 0x80]));
3026
3035
assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3027
3036
assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3028
3037
assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
0 commit comments