@@ -564,51 +564,63 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
564
564
Section: Misc
565
565
*/
566
566
567
- // Return the initial codepoint accumulator for the first byte.
568
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
569
- // for width 3, and 3 bits for width 4
570
- macro_rules! utf8_first_byte(
571
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint)
572
- )
573
-
574
- // return the value of $ch updated with continuation byte $byte
575
- macro_rules! utf8_acc_cont_byte(
576
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint)
577
- )
578
-
579
567
/// Determines if a vector of bytes contains valid UTF-8
580
568
pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
581
569
let mut i = 0 u;
582
570
let total = v. len ( ) ;
571
+ fn unsafe_get ( xs : & [ u8 ] , i : uint ) -> u8 {
572
+ unsafe { * xs. unsafe_ref ( i) }
573
+ }
583
574
while i < total {
584
- if v[ i] < 128u8 {
575
+ let v_i = unsafe_get ( v, i) ;
576
+ if v_i < 128u8 {
585
577
i += 1 u;
586
578
} else {
587
- let w = utf8_char_width ( v [ i ] ) ;
579
+ let w = utf8_char_width ( v_i ) ;
588
580
if w == 0 u { return false ; }
589
581
590
582
let nexti = i + w;
591
583
if nexti > total { return false ; }
592
- // 1. Make sure the correct number of continuation bytes are present
593
- // 2. Check codepoint ranges (deny overlong encodings)
594
- // 2-byte encoding is for codepoints \u0080 to \u07ff
595
- // 3-byte encoding is for codepoints \u0800 to \uffff
596
- // 4-byte encoding is for codepoints \u10000 to \u10ffff
597
-
598
- // 2-byte encodings are correct if the width and continuation match up
599
- if v[ i + 1 ] & 192u8 != TAG_CONT_U8 { return false ; }
600
- if w > 2 {
601
- let mut ch;
602
- ch = utf8_first_byte ! ( v[ i] , w) ;
603
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 1 ] ) ;
604
- if v[ i + 2 ] & 192u8 != TAG_CONT_U8 { return false ; }
605
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 2 ] ) ;
606
- if w == 3 && ch < MAX_TWO_B { return false ; }
607
- if w > 3 {
608
- if v[ i + 3 ] & 192u8 != TAG_CONT_U8 { return false ; }
609
- ch = utf8_acc_cont_byte ! ( ch, v[ i + 3 ] ) ;
610
- if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false ; }
611
- }
584
+
585
+ // 2-byte encoding is for codepoints \u0080 to \u07ff
586
+ // first C2 80 last DF BF
587
+ // 3-byte encoding is for codepoints \u0800 to \uffff
588
+ // first E0 A0 80 last EF BF BF
589
+ // 4-byte encoding is for codepoints \u10000 to \u10ffff
590
+ // first F0 90 80 80 last F4 8F BF BF
591
+ //
592
+ // Use the UTF-8 syntax from the RFC
593
+ //
594
+ // https://tools.ietf.org/html/rfc3629
595
+ // UTF8-1 = %x00-7F
596
+ // UTF8-2 = %xC2-DF UTF8-tail
597
+ // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
598
+ // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
599
+ // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
600
+ // %xF4 %x80-8F 2( UTF8-tail )
601
+ // UTF8-tail = %x80-BF
602
+ // --
603
+ // Unlike the RFC grammar above, this code allows surrogate code points: \uD800 to \uDFFF -> ED A0 80 to ED BF BF
604
+ match w {
605
+ 2 => if unsafe_get ( v, i + 1 ) & 192u8 != TAG_CONT_U8 {
606
+ return false
607
+ } ,
608
+ 3 => match ( v_i,
609
+ unsafe_get ( v, i + 1 ) ,
610
+ unsafe_get ( v, i + 2 ) & 192u8 ) {
611
+ ( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
612
+ ( 0xE1 .. 0xEF , 0x80 .. 0xBF , TAG_CONT_U8 ) => ( ) ,
613
+ _ => return false ,
614
+ } ,
615
+ _ => match ( v_i,
616
+ unsafe_get ( v, i + 1 ) ,
617
+ unsafe_get ( v, i + 2 ) & 192u8 ,
618
+ unsafe_get ( v, i + 3 ) & 192u8 ) {
619
+ ( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
620
+ ( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
621
+ ( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => ( ) ,
622
+ _ => return false ,
623
+ } ,
612
624
}
613
625
614
626
i = nexti;
@@ -756,6 +768,18 @@ pub struct CharRange {
756
768
next : uint
757
769
}
758
770
771
+ // Return the initial codepoint accumulator for the first byte of a multi-byte sequence.
772
+ // The first byte is special: only the bottom 5 bits are wanted for width 2, 4 bits
773
+ // for width 3, and 3 bits for width 4; the mask used is 0x7F >> $width, and the result is cast to uint.
774
+ macro_rules! utf8_first_byte(
775
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint)
776
+ )
777
+
778
+ // Return the value of $ch shifted left by 6 bits and OR-ed with the low 6 bits (mask 63) of continuation byte $byte.
779
+ macro_rules! utf8_acc_cont_byte(
780
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint)
781
+ )
782
+
759
783
// UTF-8 tags and ranges
760
784
priv static TAG_CONT_U8 : u8 = 128u8 ;
761
785
priv static TAG_CONT : uint = 128 u;
@@ -2845,13 +2869,21 @@ mod tests {
2845
2869
}
2846
2870
2847
2871
#[test]
2848
- fn test_is_utf8_deny_overlong () {
2872
+ fn test_is_utf8 () {
2849
2873
assert!(!is_utf8([0xc0, 0x80]));
2850
2874
assert!(!is_utf8([0xc0, 0xae]));
2851
2875
assert!(!is_utf8([0xe0, 0x80, 0x80]));
2852
2876
assert!(!is_utf8([0xe0, 0x80, 0xaf]));
2853
2877
assert!(!is_utf8([0xe0, 0x81, 0x81]));
2854
2878
assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
2879
+ assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
2880
+
2881
+ assert!(is_utf8([0xC2, 0x80]));
2882
+ assert!(is_utf8([0xDF, 0xBF]));
2883
+ assert!(is_utf8([0xE0, 0xA0, 0x80]));
2884
+ assert!(is_utf8([0xEF, 0xBF, 0xBF]));
2885
+ assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
2886
+ assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
2855
2887
}
2856
2888
2857
2889
0 commit comments