From 99d52796d0537488e9d2b9162f05c0a9a628e145 Mon Sep 17 00:00:00 2001
From: Florian Zeitz <florob@babelmonkeys.de>
Date: Mon, 12 May 2014 19:56:41 +0200
Subject: [PATCH 1/4] std, core: Generate unicode.rs using unicode.py

---
 src/etc/unicode.py     | 131 ++++++++++++++++++++++++-----------------
 src/libcore/unicode.rs |  11 ++--
 src/libstd/unicode.rs  |   4 +-
 3 files changed, 85 insertions(+), 61 deletions(-)

diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index d5c74e367340e..e98c65ca50eee 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -169,7 +169,7 @@ def emit_bsearch_range_table(f):
         else if hi < c { Less }
         else { Greater }
     }) != None
-}\n\n
+}\n
 """);
 
 def emit_property_module(f, mod, tbl):
@@ -193,11 +193,11 @@ def emit_property_module(f, mod, tbl):
         f.write("    pub fn %s(c: char) -> bool {\n" % cat)
         f.write("        super::bsearch_range_table(c, %s_table)\n" % cat)
         f.write("    }\n\n")
-    f.write("}\n")
+    f.write("}\n\n")
 
 
 def emit_conversions_module(f, lowerupper, upperlower):
-    f.write("pub mod conversions {\n")
+    f.write("pub mod conversions {")
     f.write("""
     use cmp::{Equal, Less, Greater};
     use slice::ImmutableVector;
@@ -225,13 +225,14 @@ def emit_conversions_module(f, lowerupper, upperlower):
             else { Greater }
         })
     }
+
 """);
     emit_caseconversion_table(f, "LuLl", upperlower)
     emit_caseconversion_table(f, "LlLu", lowerupper)
     f.write("}\n")
 
 def emit_caseconversion_table(f, name, table):
-    f.write("   static %s_table : &'static [(char, char)] = &[\n" % name)
+    f.write("    static %s_table : &'static [(char, char)] = &[\n" % name)
     sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0))
     ix = 0
     for key, value in sorted_table:
@@ -255,7 +256,7 @@ def format_table_content(f, content, indent):
             line = " "*indent + chunk
     f.write(line)
 
-def emit_decomp_module(f, canon, compat, combine):
+def emit_core_decomp_module(f, canon, compat):
     canon_keys = canon.keys()
     canon_keys.sort()
 
@@ -279,23 +280,6 @@ def emit_decomp_module(f, canon, compat, combine):
             }
             None => None
         }
-    }\n
-""")
-
-    f.write("""
-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
-        use cmp::{Equal, Less, Greater};
-        match r.bsearch(|&(lo, hi, _)| {
-            if lo <= c && c <= hi { Equal }
-            else if hi < c { Less }
-            else { Greater }
-        }) {
-            Some(idx) => {
-                let (_, _, result) = r[idx];
-                result
-            }
-            None => 0
-        }
     }\n\n
 """)
 
@@ -337,21 +321,10 @@ def emit_decomp_module(f, canon, compat, combine):
     format_table_content(f, data, 8)
     f.write("\n    ];\n\n")
 
-    f.write("    static combining_class_table : &'static [(char, char, u8)] = &[\n")
-    ix = 0
-    for pair in combine:
-        f.write(ch_prefix(ix))
-        f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
-        ix += 1
-    f.write("\n    ];\n")
-
     f.write("    pub fn canonical(c: char, i: |char|) "
         + "{ d(c, i, false); }\n\n")
     f.write("    pub fn compatibility(c: char, i: |char|) "
             +"{ d(c, i, true); }\n\n")
-    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
-        + "        bsearch_range_value_table(c, combining_class_table)\n"
-        + "    }\n\n")
     f.write("    fn d(c: char, i: |char|, k: bool) {\n")
     f.write("        use iter::Iterator;\n");
 
@@ -389,17 +362,43 @@ def emit_decomp_module(f, canon, compat, combine):
     f.write("    }\n")
     f.write("}\n\n")
 
-r = "unicode.rs"
-for i in [r]:
-    if os.path.exists(i):
-        os.remove(i);
-rf = open(r, "w")
+def emit_std_decomp_module(f, combine):
+    f.write("pub mod decompose {\n");
+    f.write("    use option::{Some, None};\n");
+    f.write("    use slice::ImmutableVector;\n");
 
-(canon_decomp, compat_decomp, gencats,
- combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
+    f.write("""
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 {
+        use cmp::{Equal, Less, Greater};
+        match r.bsearch(|&(lo, hi, _)| {
+            if lo <= c && c <= hi { Equal }
+            else if hi < c { Less }
+            else { Greater }
+        }) {
+            Some(idx) => {
+                let (_, _, result) = r[idx];
+                result
+            }
+            None => 0
+        }
+    }\n\n
+""")
+
+    f.write("    static combining_class_table : &'static [(char, char, u8)] = &[\n")
+    ix = 0
+    for pair in combine:
+        f.write(ch_prefix(ix))
+        f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2]))
+        ix += 1
+    f.write("\n    ];\n\n")
 
-# Preamble
-rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
+    f.write("    pub fn canonical_combining_class(c: char) -> u8 {\n"
+        + "        bsearch_range_value_table(c, combining_class_table)\n"
+        + "    }\n")
+    f.write("}\n")
+
+
+preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -409,23 +408,45 @@ def emit_decomp_module(f, canon, compat, combine):
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-// The following code was generated by "src/etc/unicode.py"
+// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
+
+#![allow(missing_doc, non_uppercase_statics)]
+
+'''
+
+(canon_decomp, compat_decomp, gencats,
+ combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
+
+def gen_core_unicode():
+    r = "core_unicode.rs"
+    if os.path.exists(r):
+        os.remove(r);
+    with open(r, "w") as rf:
+        # Preamble
+        rf.write(preamble)
 
-#![allow(missing_doc)]
-#![allow(non_uppercase_statics)]
+        emit_bsearch_range_table(rf);
+        emit_property_module(rf, "general_category", gencats)
 
-''')
+        emit_core_decomp_module(rf, canon_decomp, compat_decomp)
 
-emit_bsearch_range_table(rf);
-emit_property_module(rf, "general_category", gencats)
+        derived = load_properties("DerivedCoreProperties.txt",
+                ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
 
-emit_decomp_module(rf, canon_decomp, compat_decomp, combines)
+        emit_property_module(rf, "derived_property", derived)
 
-derived = load_properties("DerivedCoreProperties.txt",
-        ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
+        props = load_properties("PropList.txt", ["White_Space"])
+        emit_property_module(rf, "property", props)
+        emit_conversions_module(rf, lowerupper, upperlower)
 
-emit_property_module(rf, "derived_property", derived)
+def gen_std_unicode():
+    r = "std_unicode.rs"
+    if os.path.exists(r):
+        os.remove(r);
+    with open(r, "w") as rf:
+        # Preamble
+        rf.write(preamble)
+        emit_std_decomp_module(rf, combines)
 
-props = load_properties("PropList.txt", ["White_Space"])
-emit_property_module(rf, "property", props)
-emit_conversions_module(rf, lowerupper, upperlower)
+gen_core_unicode()
+gen_std_unicode()
diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs
index db016ad880741..b3298bde05547 100644
--- a/src/libcore/unicode.rs
+++ b/src/libcore/unicode.rs
@@ -1,4 +1,4 @@
-// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -8,10 +8,11 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-// The following code was generated by "src/etc/unicode.py"
+// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
 
 #![allow(missing_doc, non_uppercase_statics)]
 
+
 fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
     use cmp::{Equal, Less, Greater};
     use slice::ImmutableVector;
@@ -102,6 +103,7 @@ pub mod general_category {
     }
 
 }
+
 pub mod decompose {
     use option::Option;
     use option::{Some, None};
@@ -123,7 +125,6 @@ pub mod decompose {
     }
 
 
-
     // Canonical decompositions
     static canonical_table : &'static [(char, &'static [char])] = &[
         ('\xc0', &['\x41', '\u0300']), ('\xc1', &['\x41', '\u0301']), ('\xc2', &['\x41', '\u0302']),
@@ -3968,6 +3969,7 @@ pub mod derived_property {
     pub fn XID_Start(c: char) -> bool {
         super::bsearch_range_table(c, XID_Start_table)
     }
+
 }
 
 pub mod property {
@@ -3983,6 +3985,7 @@ pub mod property {
     pub fn White_Space(c: char) -> bool {
         super::bsearch_range_table(c, White_Space_table)
     }
+
 }
 
 pub mod conversions {
@@ -4501,7 +4504,7 @@ pub mod conversions {
         ('\U00010426', '\U0001044e'), ('\U00010427', '\U0001044f')
     ];
 
-   static LlLu_table : &'static [(char, char)] = &[
+    static LlLu_table : &'static [(char, char)] = &[
         ('\x61', '\x41'), ('\x62', '\x42'),
         ('\x63', '\x43'), ('\x64', '\x44'),
         ('\x65', '\x45'), ('\x66', '\x46'),
diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs
index be6e5d040a7c9..d534b30221b4a 100644
--- a/src/libstd/unicode.rs
+++ b/src/libstd/unicode.rs
@@ -1,4 +1,4 @@
-// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
+// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-// The following code was generated by "src/etc/unicode.py"
+// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly
 
 #![allow(missing_doc, non_uppercase_statics)]
 

From fb59b7195768689f837a5fdaac04ffc63ae43d31 Mon Sep 17 00:00:00 2001
From: Florian Zeitz <florob@babelmonkeys.de>
Date: Mon, 12 May 2014 21:53:53 +0200
Subject: [PATCH 2/4] core: Use appropriately sized integers for codepoints and
 bytes

---
 src/libcore/char.rs | 95 +++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 50 deletions(-)

diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index 7f2deb81f8c90..ca5e56f0649cc 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -33,13 +33,14 @@ use unicode::{derived_property, property, general_category, decompose, conversio
 #[cfg(not(test))] use default::Default;
 
 // UTF-8 ranges and tags for encoding characters
-static TAG_CONT: uint = 128u;
-static MAX_ONE_B: uint = 128u;
-static TAG_TWO_B: uint = 192u;
-static MAX_TWO_B: uint = 2048u;
-static TAG_THREE_B: uint = 224u;
-static MAX_THREE_B: uint = 65536u;
-static TAG_FOUR_B: uint = 240u;
+static TAG_CONT: u8    = 0b1000_0000u8;
+static TAG_TWO_B: u8   = 0b1100_0000u8;
+static TAG_THREE_B: u8 = 0b1110_0000u8;
+static TAG_FOUR_B: u8  = 0b1111_0000u8;
+static MAX_ONE_B: u32   =     0x80u32;
+static MAX_TWO_B: u32   =    0x800u32;
+static MAX_THREE_B: u32 =  0x10000u32;
+static MAX_FOUR_B:  u32 = 0x200000u32;
 
 /*
     Lu  Uppercase_Letter        an uppercase letter
@@ -285,37 +286,37 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
 }
 
 // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
-static S_BASE: uint = 0xAC00;
-static L_BASE: uint = 0x1100;
-static V_BASE: uint = 0x1161;
-static T_BASE: uint = 0x11A7;
-static L_COUNT: uint = 19;
-static V_COUNT: uint = 21;
-static T_COUNT: uint = 28;
-static N_COUNT: uint = (V_COUNT * T_COUNT);
-static S_COUNT: uint = (L_COUNT * N_COUNT);
+static S_BASE: u32 = 0xAC00;
+static L_BASE: u32 = 0x1100;
+static V_BASE: u32 = 0x1161;
+static T_BASE: u32 = 0x11A7;
+static L_COUNT: u32 = 19;
+static V_COUNT: u32 = 21;
+static T_COUNT: u32 = 28;
+static N_COUNT: u32 = (V_COUNT * T_COUNT);
+static S_COUNT: u32 = (L_COUNT * N_COUNT);
 
 // Decompose a precomposed Hangul syllable
 fn decompose_hangul(s: char, f: |char|) {
-    let si = s as uint - S_BASE;
+    let si = s as u32 - S_BASE;
 
     let li = si / N_COUNT;
     unsafe {
-        f(transmute((L_BASE + li) as u32));
+        f(transmute(L_BASE + li));
 
         let vi = (si % N_COUNT) / T_COUNT;
-        f(transmute((V_BASE + vi) as u32));
+        f(transmute(V_BASE + vi));
 
         let ti = si % T_COUNT;
         if ti > 0 {
-            f(transmute((T_BASE + ti) as u32));
+            f(transmute(T_BASE + ti));
         }
     }
 }
 
 /// Returns the canonical decomposition of a character
 pub fn decompose_canonical(c: char, f: |char|) {
-    if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
+    if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) {
         decompose::canonical(c, f);
     } else {
         decompose_hangul(c, f);
@@ -324,7 +325,7 @@ pub fn decompose_canonical(c: char, f: |char|) {
 
 /// Returns the compatibility decomposition of a character
 pub fn decompose_compatible(c: char, f: |char|) {
-    if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) {
+    if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) {
         decompose::compatibility(c, f);
     } else {
         decompose_hangul(c, f);
@@ -386,12 +387,7 @@ pub fn escape_default(c: char, f: |char|) {
 
 /// Returns the amount of bytes this `char` would need if encoded in UTF-8
 pub fn len_utf8_bytes(c: char) -> uint {
-    static MAX_ONE_B:   uint = 128u;
-    static MAX_TWO_B:   uint = 2048u;
-    static MAX_THREE_B: uint = 65536u;
-    static MAX_FOUR_B:  uint = 2097152u;
-
-    let code = c as uint;
+    let code = c as u32;
     match () {
         _ if code < MAX_ONE_B   => 1u,
         _ if code < MAX_TWO_B   => 2u,
@@ -606,41 +602,40 @@ impl Char for char {
 
     fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
 
-    fn encode_utf8(&self, dst: &mut [u8]) -> uint {
-        let code = *self as uint;
+    fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint {
+        let code = *self as u32;
         if code < MAX_ONE_B {
             dst[0] = code as u8;
-            return 1;
+            1
         } else if code < MAX_TWO_B {
-            dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8;
-            dst[1] = (code & 63u | TAG_CONT) as u8;
-            return 2;
+            dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
+            dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
+            2
         } else if code < MAX_THREE_B {
-            dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8;
-            dst[1] = (code >> 6u & 63u | TAG_CONT) as u8;
-            dst[2] = (code & 63u | TAG_CONT) as u8;
-            return 3;
+            dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
+            dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+            dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
+            3
         } else {
-            dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8;
-            dst[1] = (code >> 12u & 63u | TAG_CONT) as u8;
-            dst[2] = (code >> 6u & 63u | TAG_CONT) as u8;
-            dst[3] = (code & 63u | TAG_CONT) as u8;
-            return 4;
+            dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
+            dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
+            dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
+            dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
+            4
         }
     }
 
     fn encode_utf16(&self, dst: &mut [u16]) -> uint {
-        let mut ch = *self as uint;
-        if (ch & 0xFFFF_u) == ch {
-            // The BMP falls through (assuming non-surrogate, as it
-            // should)
-            assert!(ch <= 0xD7FF_u || ch >= 0xE000_u);
+        let mut ch = *self as u32;
+        if (ch & 0xFFFF_u32) == ch {
+            // The BMP falls through (assuming non-surrogate, as it should)
+            assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
             dst[0] = ch as u16;
             1
         } else {
             // Supplementary planes break into surrogates.
-            assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u);
-            ch -= 0x1_0000_u;
+            assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
+            ch -= 0x1_0000_u32;
             dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
             dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
             2

From 32a20bfa5f467b4713436c5c890b0a92b26da8a5 Mon Sep 17 00:00:00 2001
From: Florian Zeitz <florob@babelmonkeys.de>
Date: Mon, 12 May 2014 22:25:38 +0200
Subject: [PATCH 3/4] core: Move Hangul decomposition into unicode.rs

---
 src/etc/unicode.py     | 77 +++++++++++++++++++++++++++++++-----------
 src/libcore/char.rs    | 54 ++++-------------------------
 src/libcore/unicode.rs | 48 ++++++++++++++++++++++++--
 3 files changed, 110 insertions(+), 69 deletions(-)

diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index e98c65ca50eee..f079ef73cd8e2 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -321,17 +321,24 @@ def emit_core_decomp_module(f, canon, compat):
     format_table_content(f, data, 8)
     f.write("\n    ];\n\n")
 
-    f.write("    pub fn canonical(c: char, i: |char|) "
-        + "{ d(c, i, false); }\n\n")
-    f.write("    pub fn compatibility(c: char, i: |char|) "
-            +"{ d(c, i, true); }\n\n")
-    f.write("    fn d(c: char, i: |char|, k: bool) {\n")
-    f.write("        use iter::Iterator;\n");
+    f.write("""
+    pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
 
-    f.write("        if c <= '\\x7f' { i(c); return; }\n")
+    pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
 
-    # First check the canonical decompositions
-    f.write("""
+    fn d(c: char, i: |char|, k: bool) {
+        use iter::Iterator;
+
+        // 7-bit ASCII never decomposes
+        if c <= '\\x7f' { i(c); return; }
+
+        // Perform decomposition for Hangul
+        if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
+            decompose_hangul(c, i);
+            return;
+        }
+
+        // First check the canonical decompositions
         match bsearch_table(c, canonical_table) {
             Some(canon) => {
                 for x in canon.iter() {
@@ -340,13 +347,12 @@ def emit_core_decomp_module(f, canon, compat):
                 return;
             }
             None => ()
-        }\n\n""")
+        }
 
-    # Bottom out if we're not doing compat.
-    f.write("        if !k { i(c); return; }\n")
+        // Bottom out if we're not doing compat.
+        if !k { i(c); return; }
 
-    # Then check the compatibility decompositions
-    f.write("""
+        // Then check the compatibility decompositions
         match bsearch_table(c, compatibility_table) {
             Some(compat) => {
                 for x in compat.iter() {
@@ -355,12 +361,45 @@ def emit_core_decomp_module(f, canon, compat):
                 return;
             }
             None => ()
-        }\n\n""")
+        }
 
-    # Finally bottom out.
-    f.write("        i(c);\n")
-    f.write("    }\n")
-    f.write("}\n\n")
+        // Finally bottom out.
+        i(c);
+    }
+
+    // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
+    static S_BASE: u32 = 0xAC00;
+    static L_BASE: u32 = 0x1100;
+    static V_BASE: u32 = 0x1161;
+    static T_BASE: u32 = 0x11A7;
+    static L_COUNT: u32 = 19;
+    static V_COUNT: u32 = 21;
+    static T_COUNT: u32 = 28;
+    static N_COUNT: u32 = (V_COUNT * T_COUNT);
+    static S_COUNT: u32 = (L_COUNT * N_COUNT);
+
+    // Decompose a precomposed Hangul syllable
+    fn decompose_hangul(s: char, f: |char|) {
+        use cast::transmute;
+
+        let si = s as u32 - S_BASE;
+
+        let li = si / N_COUNT;
+        unsafe {
+            f(transmute(L_BASE + li));
+
+            let vi = (si % N_COUNT) / T_COUNT;
+            f(transmute(V_BASE + vi));
+
+            let ti = si % T_COUNT;
+            if ti > 0 {
+                f(transmute(T_BASE + ti));
+            }
+        }
+    }
+}
+
+""")
 
 def emit_std_decomp_module(f, combine):
     f.write("pub mod decompose {\n");
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index ca5e56f0649cc..71a2d75715b5a 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -27,7 +27,12 @@
 use mem::transmute;
 use option::{None, Option, Some};
 use iter::{Iterator, range_step};
-use unicode::{derived_property, property, general_category, decompose, conversions};
+use unicode::{derived_property, property, general_category, conversions};
+
+/// Returns the canonical decomposition of a character.
+pub use unicode::decompose::decompose_canonical;
+/// Returns the compatibility decomposition of a character.
+pub use unicode::decompose::decompose_compatible;
 
 #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering};
 #[cfg(not(test))] use default::Default;
@@ -285,53 +290,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
     }
 }
 
-// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
-static S_BASE: u32 = 0xAC00;
-static L_BASE: u32 = 0x1100;
-static V_BASE: u32 = 0x1161;
-static T_BASE: u32 = 0x11A7;
-static L_COUNT: u32 = 19;
-static V_COUNT: u32 = 21;
-static T_COUNT: u32 = 28;
-static N_COUNT: u32 = (V_COUNT * T_COUNT);
-static S_COUNT: u32 = (L_COUNT * N_COUNT);
-
-// Decompose a precomposed Hangul syllable
-fn decompose_hangul(s: char, f: |char|) {
-    let si = s as u32 - S_BASE;
-
-    let li = si / N_COUNT;
-    unsafe {
-        f(transmute(L_BASE + li));
-
-        let vi = (si % N_COUNT) / T_COUNT;
-        f(transmute(V_BASE + vi));
-
-        let ti = si % T_COUNT;
-        if ti > 0 {
-            f(transmute(T_BASE + ti));
-        }
-    }
-}
-
-/// Returns the canonical decomposition of a character
-pub fn decompose_canonical(c: char, f: |char|) {
-    if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) {
-        decompose::canonical(c, f);
-    } else {
-        decompose_hangul(c, f);
-    }
-}
-
-/// Returns the compatibility decomposition of a character
-pub fn decompose_compatible(c: char, f: |char|) {
-    if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) {
-        decompose::compatibility(c, f);
-    } else {
-        decompose_hangul(c, f);
-    }
-}
-
 ///
 /// Returns the hexadecimal Unicode escape of a `char`
 ///
diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs
index b3298bde05547..bffde2323bf94 100644
--- a/src/libcore/unicode.rs
+++ b/src/libcore/unicode.rs
@@ -2121,14 +2121,24 @@ pub mod decompose {
         &['\u53ef'])
     ];
 
-    pub fn canonical(c: char, i: |char|) { d(c, i, false); }
 
-    pub fn compatibility(c: char, i: |char|) { d(c, i, true); }
+    pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }
+
+    pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }
 
     fn d(c: char, i: |char|, k: bool) {
         use iter::Iterator;
+
+        // 7-bit ASCII never decomposes
         if c <= '\x7f' { i(c); return; }
 
+        // Perform decomposition for Hangul
+        if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
+            decompose_hangul(c, i);
+            return;
+        }
+
+        // First check the canonical decompositions
         match bsearch_table(c, canonical_table) {
             Some(canon) => {
                 for x in canon.iter() {
@@ -2139,8 +2149,10 @@ pub mod decompose {
             None => ()
         }
 
+        // Bottom out if we're not doing compat.
         if !k { i(c); return; }
 
+        // Then check the compatibility decompositions
         match bsearch_table(c, compatibility_table) {
             Some(compat) => {
                 for x in compat.iter() {
@@ -2151,8 +2163,40 @@ pub mod decompose {
             None => ()
         }
 
+        // Finally bottom out.
         i(c);
     }
+
+    // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior
+    static S_BASE: u32 = 0xAC00;
+    static L_BASE: u32 = 0x1100;
+    static V_BASE: u32 = 0x1161;
+    static T_BASE: u32 = 0x11A7;
+    static L_COUNT: u32 = 19;
+    static V_COUNT: u32 = 21;
+    static T_COUNT: u32 = 28;
+    static N_COUNT: u32 = (V_COUNT * T_COUNT);
+    static S_COUNT: u32 = (L_COUNT * N_COUNT);
+
+    // Decompose a precomposed Hangul syllable
+    fn decompose_hangul(s: char, f: |char|) {
+        use mem::transmute;
+
+        let si = s as u32 - S_BASE;
+
+        let li = si / N_COUNT;
+        unsafe {
+            f(transmute(L_BASE + li));
+
+            let vi = (si % N_COUNT) / T_COUNT;
+            f(transmute(V_BASE + vi));
+
+            let ti = si % T_COUNT;
+            if ti > 0 {
+                f(transmute(T_BASE + ti));
+            }
+        }
+    }
 }
 
 pub mod derived_property {

From 748061e08908fb4fca7e1373ccfc129e12c734ac Mon Sep 17 00:00:00 2001
From: Florian Zeitz <florob@babelmonkeys.de>
Date: Mon, 12 May 2014 22:44:21 +0200
Subject: [PATCH 4/4] std: Rename str::Normalizations to str::Decompositions

The Normalizations iterator has been renamed to Decompositions.
It does not currently cover all forms of Unicode normalization;
it only performs decompositions.
If recomposition is implemented, it would likely be a separate iterator
that works on the result of this one.

[breaking-change]
---
 src/etc/unicode.py     | 12 ++++++------
 src/libcore/char.rs    |  4 ++--
 src/libcore/unicode.rs |  2 +-
 src/libstd/str.rs      | 32 ++++++++++++++++----------------
 src/libstd/unicode.rs  |  2 +-
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/etc/unicode.py b/src/etc/unicode.py
index f079ef73cd8e2..586890ebe4c9a 100755
--- a/src/etc/unicode.py
+++ b/src/etc/unicode.py
@@ -256,13 +256,13 @@ def format_table_content(f, content, indent):
             line = " "*indent + chunk
     f.write(line)
 
-def emit_core_decomp_module(f, canon, compat):
+def emit_core_norm_module(f, canon, compat):
     canon_keys = canon.keys()
     canon_keys.sort()
 
     compat_keys = compat.keys()
     compat_keys.sort()
-    f.write("pub mod decompose {\n");
+    f.write("pub mod normalization {\n");
     f.write("    use option::Option;\n");
     f.write("    use option::{Some, None};\n");
     f.write("    use slice::ImmutableVector;\n");
@@ -401,8 +401,8 @@ def emit_core_decomp_module(f, canon, compat):
 
 """)
 
-def emit_std_decomp_module(f, combine):
-    f.write("pub mod decompose {\n");
+def emit_std_norm_module(f, combine):
+    f.write("pub mod normalization {\n");
     f.write("    use option::{Some, None};\n");
     f.write("    use slice::ImmutableVector;\n");
 
@@ -467,7 +467,7 @@ def gen_core_unicode():
         emit_bsearch_range_table(rf);
         emit_property_module(rf, "general_category", gencats)
 
-        emit_core_decomp_module(rf, canon_decomp, compat_decomp)
+        emit_core_norm_module(rf, canon_decomp, compat_decomp)
 
         derived = load_properties("DerivedCoreProperties.txt",
                 ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"])
@@ -485,7 +485,7 @@ def gen_std_unicode():
     with open(r, "w") as rf:
         # Preamble
         rf.write(preamble)
-        emit_std_decomp_module(rf, combines)
+        emit_std_norm_module(rf, combines)
 
 gen_core_unicode()
 gen_std_unicode()
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index 71a2d75715b5a..934483dbed423 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -30,9 +30,9 @@ use iter::{Iterator, range_step};
 use unicode::{derived_property, property, general_category, conversions};
 
 /// Returns the canonical decomposition of a character.
-pub use unicode::decompose::decompose_canonical;
+pub use unicode::normalization::decompose_canonical;
 /// Returns the compatibility decomposition of a character.
-pub use unicode::decompose::decompose_compatible;
+pub use unicode::normalization::decompose_compatible;
 
 #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering};
 #[cfg(not(test))] use default::Default;
diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs
index bffde2323bf94..242672de2967a 100644
--- a/src/libcore/unicode.rs
+++ b/src/libcore/unicode.rs
@@ -104,7 +104,7 @@ pub mod general_category {
 
 }
 
-pub mod decompose {
+pub mod normalization {
     use option::Option;
     use option::{Some, None};
     use slice::ImmutableVector;
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 24cf9681ca84d..fa4cf8e4427d0 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -228,25 +228,25 @@ fn canonical_sort(comb: &mut [(char, u8)]) {
 }
 
 #[deriving(Clone)]
-enum NormalizationForm {
-    NFD,
-    NFKD
+enum DecompositionType {
+    Canonical,
+    Compatible
 }
 
-/// External iterator for a string's normalization's characters.
+/// External iterator for the characters of a string's decomposition.
 /// Use with the `std::iter` module.
 #[deriving(Clone)]
-pub struct Normalizations<'a> {
-    kind: NormalizationForm,
+pub struct Decompositions<'a> {
+    kind: DecompositionType,
     iter: Chars<'a>,
     buffer: Vec<(char, u8)>,
     sorted: bool
 }
 
-impl<'a> Iterator<char> for Normalizations<'a> {
+impl<'a> Iterator<char> for Decompositions<'a> {
     #[inline]
     fn next(&mut self) -> Option<char> {
-        use unicode::decompose::canonical_combining_class;
+        use unicode::normalization::canonical_combining_class;
 
         match self.buffer.as_slice().head() {
             Some(&(c, 0)) => {
@@ -262,8 +262,8 @@ impl<'a> Iterator<char> for Normalizations<'a> {
         }
 
         let decomposer = match self.kind {
-            NFD => char::decompose_canonical,
-            NFKD => char::decompose_compatible
+            Canonical => char::decompose_canonical,
+            Compatible => char::decompose_compatible
         };
 
         if !self.sorted {
@@ -887,24 +887,24 @@ pub trait StrAllocating: Str {
     /// An Iterator over the string in Unicode Normalization Form D
     /// (canonical decomposition).
     #[inline]
-    fn nfd_chars<'a>(&'a self) -> Normalizations<'a> {
-        Normalizations {
+    fn nfd_chars<'a>(&'a self) -> Decompositions<'a> {
+        Decompositions {
             iter: self.as_slice().chars(),
             buffer: Vec::new(),
             sorted: false,
-            kind: NFD
+            kind: Canonical
         }
     }
 
     /// An Iterator over the string in Unicode Normalization Form KD
     /// (compatibility decomposition).
     #[inline]
-    fn nfkd_chars<'a>(&'a self) -> Normalizations<'a> {
-        Normalizations {
+    fn nfkd_chars<'a>(&'a self) -> Decompositions<'a> {
+        Decompositions {
             iter: self.as_slice().chars(),
             buffer: Vec::new(),
             sorted: false,
-            kind: NFKD
+            kind: Compatible
         }
     }
 }
diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs
index d534b30221b4a..03c960e96ffe1 100644
--- a/src/libstd/unicode.rs
+++ b/src/libstd/unicode.rs
@@ -12,7 +12,7 @@
 
 #![allow(missing_doc, non_uppercase_statics)]
 
-pub mod decompose {
+pub mod normalization {
     use option::{Some, None};
     use slice::ImmutableVector;