auto merge of #13469 : kmcallister/rust/utf16, r=huonw

This fixes two separate issues related to character encoding.

* Add `encode_utf16` to the `Char` trait, analogous to `encode_utf8`. `&str` already supports UTF-16 encoding but only with a heap allocation. Also fix `encode_utf8` docs and add tests.
* Correctly decode non-BMP hex escapes in JSON (#13064).
rust-lang · Apr 13, 2014 · 4c62ab1 · 4c62ab1
2 parents 770b2fe + cee9a83
commit 4c62ab1
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 44 deletions.
diff --git a/src/libserialize/json.rs b/src/libserialize/json.rs
@@ -239,6 +239,7 @@ use std::io::MemWriter;
 use std::io;
 use std::num;
 use std::str;
+use std::str::ScalarValue;
 use std::strbuf::StrBuf;
 
 use Encodable;
@@ -1129,6 +1130,35 @@ impl<T : Iterator<char>> Parser<T> {
         Ok(res)
     }
 
+    fn decode_hex_escape(&mut self) -> DecodeResult<u16> {
+        let mut i = 0u;
+        let mut n = 0u16;
+        while i < 4u && !self.eof() {
+            self.bump();
+            n = match self.ch_or_null() {
+                c @ '0' .. '9' => n * 16_u16 + ((c as u16) - ('0' as u16)),
+                'a' | 'A' => n * 16_u16 + 10_u16,
+                'b' | 'B' => n * 16_u16 + 11_u16,
+                'c' | 'C' => n * 16_u16 + 12_u16,
+                'd' | 'D' => n * 16_u16 + 13_u16,
+                'e' | 'E' => n * 16_u16 + 14_u16,
+                'f' | 'F' => n * 16_u16 + 15_u16,
+                _ => return self.error(
+                    ~"invalid \\u escape (unrecognized hex)")
+            };
+
+            i += 1u;
+        }
+
+        // Error out if we didn't parse 4 digits.
+        if i != 4u {
+            return self.error(
+                ~"invalid \\u escape (not four digits)");
+        }
+
+        Ok(n)
+    }
+
     fn parse_str(&mut self) -> DecodeResult<~str> {
         let mut escape = false;
         let mut res = StrBuf::new();
@@ -1149,35 +1179,35 @@ impl<T : Iterator<char>> Parser<T> {
                     'n' => res.push_char('\n'),
                     'r' => res.push_char('\r'),
                     't' => res.push_char('\t'),
-                    'u' => {
-                        // Parse \u1234.
-                        let mut i = 0u;
-                        let mut n = 0u;
-                        while i < 4u && !self.eof() {
-                            self.bump();
-                            n = match self.ch_or_null() {
-                                c @ '0' .. '9' => n * 16u + (c as uint) - ('0' as uint),
-                                'a' | 'A' => n * 16u + 10u,
-                                'b' | 'B' => n * 16u + 11u,
-                                'c' | 'C' => n * 16u + 12u,
-                                'd' | 'D' => n * 16u + 13u,
-                                'e' | 'E' => n * 16u + 14u,
-                                'f' | 'F' => n * 16u + 15u,
+                    'u' => match try!(self.decode_hex_escape()) {
+                        0xDC00 .. 0xDFFF => return self.error(
+                                ~"lone trailing surrogate in hex escape"),
+
+                        // Non-BMP characters are encoded as a sequence of
+                        // two hex escapes, representing UTF-16 surrogates.
+                        n1 @ 0xD800 .. 0xDBFF => {
+                            let c1 = self.next_char();
+                            let c2 = self.next_char();
+                            match (c1, c2) {
+                                (Some('\\'), Some('u')) => (),
                                 _ => return self.error(
-                                    ~"invalid \\u escape (unrecognized hex)")
-                            };
-
-                            i += 1u;
-                        }
+                                    ~"unexpected end of non-BMP hex escape"),
+                            }
 
-                        // Error out if we didn't parse 4 digits.
-                        if i != 4u {
-                            return self.error(
-                                ~"invalid \\u escape (not four digits)");
+                            let buf = [n1, try!(self.decode_hex_escape())];
+                            match str::utf16_items(buf.as_slice()).next() {
+                                Some(ScalarValue(c)) => res.push_char(c),
+                                _ => return self.error(
+                                    ~"lone leading surrogate in hex escape"),
+                            }
                         }
 
-                        res.push_char(char::from_u32(n as u32).unwrap());
-                    }
+                        n => match char::from_u32(n as u32) {
+                            Some(c) => res.push_char(c),
+                            None => return self.error(
+                                format!("invalid Unicode codepoint {:u}", n)),
+                        },
+                    },
                     _ => return self.error(~"invalid escape"),
                 }
                 escape = false;
@@ -2139,6 +2169,16 @@ mod tests {
         assert_eq!(from_str(" \"foo\" "), Ok(String(~"foo")));
         assert_eq!(from_str("\"\\u12ab\""), Ok(String(~"\u12ab")));
         assert_eq!(from_str("\"\\uAB12\""), Ok(String(~"\uAB12")));
+
+        // Non-BMP escapes.  The exact error messages and positions are kind of
+        // arbitrary.
+        assert_eq!(from_str("\"\\ud83d\\udca9\""), Ok(String(~"\U0001F4A9")));
+        assert!(from_str("\"\\ud83d\"").is_err());
+        assert!(from_str("\"\\udca9\"").is_err());
+        assert!(from_str("\"\\ud83d\\ud83d\"").is_err());
+        assert!(from_str("\"\\ud83dx\"").is_err());
+        assert!(from_str("\"\\udca9\\udca9\"").is_err());
+        assert!(from_str("\"\\udca9x\"").is_err());
     }
 
     #[test]

diff --git a/src/libstd/char.rs b/src/libstd/char.rs
@@ -32,6 +32,7 @@ use unicode::{derived_property, property, general_category, decompose, conversio
 
 #[cfg(test)] use str::Str;
 #[cfg(test)] use strbuf::StrBuf;
+#[cfg(test)] use slice::ImmutableVector;
 
 #[cfg(not(test))] use cmp::{Eq, Ord};
 #[cfg(not(test))] use default::Default;
@@ -560,11 +561,19 @@ pub trait Char {
 
     /// Encodes this character as UTF-8 into the provided byte buffer.
     ///
-    /// The buffer must be at least 4 bytes long or a runtime failure will
+    /// The buffer must be at least 4 bytes long or a runtime failure may
     /// occur.
     ///
-    /// This will then return the number of characters written to the slice.
+    /// This will then return the number of bytes written to the slice.
     fn encode_utf8(&self, dst: &mut [u8]) -> uint;
+
+    /// Encodes this character as UTF-16 into the provided `u16` buffer.
+    ///
+    /// The buffer must be at least 2 elements long or a runtime failure may
+    /// occur.
+    ///
+    /// This will then return the number of `u16`s written to the slice.
+    fn encode_utf16(&self, dst: &mut [u16]) -> uint;
 }
 
 impl Char for char {
@@ -602,7 +611,7 @@ impl Char for char {
 
     fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
 
-    fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
+    fn encode_utf8(&self, dst: &mut [u8]) -> uint {
         let code = *self as uint;
         if code < MAX_ONE_B {
             dst[0] = code as u8;
@@ -624,6 +633,24 @@ impl Char for char {
             return 4;
         }
     }
+
+    fn encode_utf16(&self, dst: &mut [u16]) -> uint {
+        let mut ch = *self as uint;
+        if (ch & 0xFFFF_u) == ch {
+            // The BMP falls through (assuming non-surrogate, as it
+            // should)
+            assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
+            dst[0] = ch as u16;
+            1
+        } else {
+            // Supplementary planes break into surrogates.
+            assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
+            ch -= 0x1_0000_u;
+            dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
+            dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
+            2
+        }
+    }
 }
 
 #[cfg(not(test))]
@@ -788,3 +815,31 @@ fn test_to_str() {
     let s = 't'.to_str();
     assert_eq!(s, ~"t");
 }
+
+#[test]
+fn test_encode_utf8() {
+    fn check(input: char, expect: &[u8]) {
+        let mut buf = [0u8, ..4];
+        let n = input.encode_utf8(buf /* as mut slice! */);
+        assert_eq!(buf.slice_to(n), expect);
+    }
+
+    check('x', [0x78]);
+    check('\u00e9', [0xc3, 0xa9]);
+    check('\ua66e', [0xea, 0x99, 0xae]);
+    check('\U0001f4a9', [0xf0, 0x9f, 0x92, 0xa9]);
+}
+
+#[test]
+fn test_encode_utf16() {
+    fn check(input: char, expect: &[u16]) {
+        let mut buf = [0u16, ..2];
+        let n = input.encode_utf16(buf /* as mut slice! */);
+        assert_eq!(buf.slice_to(n), expect);
+    }
+
+    check('x', [0x0078]);
+    check('\u00e9', [0x00e9]);
+    check('\ua66e', [0xa66e]);
+    check('\U0001f4a9', [0xd83d, 0xdca9]);
+}
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
@@ -2555,22 +2555,9 @@ impl<'a> StrSlice<'a> for &'a str {
     fn to_utf16(&self) -> ~[u16] {
         let mut u = ~[];
         for ch in self.chars() {
-            // Arithmetic with u32 literals is easier on the eyes than chars.
-            let mut ch = ch as u32;
-
-            if (ch & 0xFFFF_u32) == ch {
-                // The BMP falls through (assuming non-surrogate, as it
-                // should)
-                assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
-                u.push(ch as u16)
-            } else {
-                // Supplementary planes break into surrogates.
-                assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
-                ch -= 0x1_0000_u32;
-                let w1 = 0xD800_u16 | ((ch >> 10) as u16);
-                let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
-                u.push_all([w1, w2])
-            }
+            let mut buf = [0u16, ..2];
+            let n = ch.encode_utf16(buf /* as mut slice! */);
+            u.push_all(buf.slice_to(n));
         }
         u
     }