@@ -564,6 +564,18 @@ fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
564
564
Section: Misc
565
565
*/
566
566
567
+ // Return the initial codepoint accumulator for the first byte.
568
+ // The first byte is special: keep only the bottom 5 bits for width 2, 4 bits
569
+ // for width 3, and 3 bits for width 4
570
+ macro_rules! utf8_first_byte( // (lead byte, sequence width) -> lead byte's payload bits, cast to uint
571
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as uint) // mask is 0x1F / 0x0F / 0x07 for width 2 / 3 / 4
572
+ )
573
+
574
+ // Return the value of $ch shifted left by 6 bits and combined with the low 6 bits of continuation byte $byte
575
+ macro_rules! utf8_acc_cont_byte( // (accumulator, continuation byte) -> updated accumulator
576
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as uint) // 63 == 0b0011_1111: OR the byte's low 6 bits into the shifted accumulator
577
+ )
578
+
567
579
/// Determines if a vector of bytes contains valid UTF-8
568
580
pub fn is_utf8 ( v : & [ u8 ] ) -> bool {
569
581
let mut i = 0 u;
@@ -577,11 +589,26 @@ pub fn is_utf8(v: &[u8]) -> bool {
577
589
578
590
let nexti = i + w;
579
591
if nexti > total { return false ; }
592
+ // 1. Make sure the correct number of continuation bytes are present
593
+ // 2. Check codepoint ranges (deny overlong encodings)
594
+ // 2-byte encoding is for codepoints \u0080 to \u07ff
595
+ // 3-byte encoding is for codepoints \u0800 to \uffff
596
+ // 4-byte encoding is for codepoints \u10000 to \u10ffff
580
597
598
+ // 2-byte encodings are correct if the width and continuation match up
581
599
if v[ i + 1 ] & 192u8 != TAG_CONT_U8 { return false ; }
582
600
if w > 2 {
601
+ let mut ch;
602
+ ch = utf8_first_byte ! ( v[ i] , w) ;
603
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 1 ] ) ;
583
604
if v[ i + 2 ] & 192u8 != TAG_CONT_U8 { return false ; }
584
- if w > 3 && ( v[ i + 3 ] & 192u8 != TAG_CONT_U8 ) { return false ; }
605
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 2 ] ) ;
606
+ if w == 3 && ch < MAX_TWO_B { return false ; }
607
+ if w > 3 {
608
+ if v[ i + 3 ] & 192u8 != TAG_CONT_U8 { return false ; }
609
+ ch = utf8_acc_cont_byte ! ( ch, v[ i + 3 ] ) ;
610
+ if ch < MAX_THREE_B || ch >= MAX_UNICODE { return false ; }
611
+ }
585
612
}
586
613
587
614
i = nexti;
@@ -699,7 +726,7 @@ pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
699
726
}
700
727
701
728
// https://tools.ietf.org/html/rfc3629
702
- static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
729
+ priv static UTF8_CHAR_WIDTH : [ u8 , ..256 ] = [
703
730
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
704
731
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , // 0x1F
705
732
1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
@@ -712,7 +739,7 @@ static UTF8_CHAR_WIDTH: [u8, ..256] = [
712
739
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0x9F
713
740
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
714
741
0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xBF
715
- 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
742
+ 0 , 0 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 ,
716
743
2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , // 0xDF
717
744
3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , // 0xEF
718
745
4 , 4 , 4 , 4 , 4 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , // 0xFF
@@ -730,14 +757,15 @@ pub struct CharRange {
730
757
}
731
758
732
759
// UTF-8 tags and ranges
733
- static TAG_CONT_U8 : u8 = 128u8 ;
734
- static TAG_CONT : uint = 128 u;
735
- static MAX_ONE_B : uint = 128 u;
736
- static TAG_TWO_B : uint = 192 u;
737
- static MAX_TWO_B : uint = 2048 u;
738
- static TAG_THREE_B : uint = 224 u;
739
- static MAX_THREE_B : uint = 65536 u;
740
- static TAG_FOUR_B : uint = 240 u;
760
+ priv static TAG_CONT_U8 : u8 = 128u8 ; // 0b1000_0000; is_utf8 compares (byte & 192u8) against this
761
+ priv static TAG_CONT : uint = 128 u; // 0b1000_0000, same value as TAG_CONT_U8 but typed uint
762
+ priv static MAX_ONE_B : uint = 128 u; // 0x80; push_char picks a 1-byte encoding when code < MAX_ONE_B
763
+ priv static TAG_TWO_B : uint = 192 u; // 0b1100_0000
764
+ priv static MAX_TWO_B : uint = 2048 u; // 0x800; is_utf8 rejects a 3-byte sequence that decodes below this
765
+ priv static TAG_THREE_B : uint = 224 u; // 0b1110_0000
766
+ priv static MAX_THREE_B : uint = 65536 u; // 0x10000; is_utf8 rejects a 4-byte sequence that decodes below this
767
+ priv static TAG_FOUR_B : uint = 240 u; // 0b1111_0000
768
+ priv static MAX_UNICODE : uint = 1114112 u; // 0x110000, one past 0x10ffff; used as an exclusive upper bound
741
769
742
770
/// Unsafe operations
743
771
pub mod raw {
@@ -1665,12 +1693,10 @@ impl<'self> StrSlice<'self> for &'self str {
1665
1693
let w = UTF8_CHAR_WIDTH[val] as uint;
1666
1694
assert!((w != 0));
1667
1695
1668
- // First byte is special, only want bottom 5 bits for width 2, 4 bits
1669
- // for width 3, and 3 bits for width 4
1670
- val &= 0x7Fu >> w;
1671
- val = (val << 6) | (s[i + 1] & 63u8) as uint;
1672
- if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1673
- if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1696
+ val = utf8_first_byte!(val, w);
1697
+ val = utf8_acc_cont_byte!(val, s[i + 1]);
1698
+ if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
1699
+ if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
1674
1700
1675
1701
return CharRange {ch: val as char, next: i + w};
1676
1702
}
@@ -2035,7 +2061,7 @@ impl OwnedStr for ~str {
2035
2061
/// Appends a character to the back of a string
2036
2062
#[inline]
2037
2063
fn push_char(&mut self, c: char) {
2038
- assert!(c as uint <= 0x10ffff ); // FIXME: #7609: should be enforced on all `char`
2064
+ assert!(( c as uint) < MAX_UNICODE ); // FIXME: #7609: should be enforced on all `char`
2039
2065
unsafe {
2040
2066
let code = c as uint;
2041
2067
let nb = if code < MAX_ONE_B { 1u }
@@ -2799,9 +2825,23 @@ mod tests {
2799
2825
0x20_u8, 0x4e_u8, 0x61_u8,
2800
2826
0x6d_u8];
2801
2827
2828
+
2802
2829
assert_eq!(ss, from_bytes(bb));
2830
+ assert_eq!(~" 𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰",
2831
+ from_bytes(bytes!(" 𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰")));
2803
2832
}
2804
2833
2834
+ #[test]
2835
+ fn test_is_utf8_deny_overlong() { // each input encodes a code point in more bytes than needed; is_utf8 must return false
2836
+ assert!(!is_utf8([0xc0, 0x80])); // 2-byte form of U+0000
2837
+ assert!(!is_utf8([0xc0, 0xae])); // 2-byte form of U+002E ('.')
2838
+ assert!(!is_utf8([0xe0, 0x80, 0x80])); // 3-byte form of U+0000
2839
+ assert!(!is_utf8([0xe0, 0x80, 0xaf])); // 3-byte form of U+002F ('/')
2840
+ assert!(!is_utf8([0xe0, 0x81, 0x81])); // 3-byte form of U+0041 ('A')
2841
+ assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac])); // 4-byte form of U+20AC
2842
+ }
2843
+
2844
+
2805
2845
#[test]
2806
2846
#[ignore(cfg(windows))]
2807
2847
fn test_from_bytes_fail() {
0 commit comments