From 99d52796d0537488e9d2b9162f05c0a9a628e145 Mon Sep 17 00:00:00 2001 From: Florian Zeitz <florob@babelmonkeys.de> Date: Mon, 12 May 2014 19:56:41 +0200 Subject: [PATCH 1/4] std, core: Generate unicode.rs using unicode.py --- src/etc/unicode.py | 131 ++++++++++++++++++++++++----------------- src/libcore/unicode.rs | 11 ++-- src/libstd/unicode.rs | 4 +- 3 files changed, 85 insertions(+), 61 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index d5c74e367340e..e98c65ca50eee 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -169,7 +169,7 @@ def emit_bsearch_range_table(f): else if hi < c { Less } else { Greater } }) != None -}\n\n +}\n """); def emit_property_module(f, mod, tbl): @@ -193,11 +193,11 @@ def emit_property_module(f, mod, tbl): f.write(" pub fn %s(c: char) -> bool {\n" % cat) f.write(" super::bsearch_range_table(c, %s_table)\n" % cat) f.write(" }\n\n") - f.write("}\n") + f.write("}\n\n") def emit_conversions_module(f, lowerupper, upperlower): - f.write("pub mod conversions {\n") + f.write("pub mod conversions {") f.write(""" use cmp::{Equal, Less, Greater}; use slice::ImmutableVector; @@ -225,13 +225,14 @@ def emit_conversions_module(f, lowerupper, upperlower): else { Greater } }) } + """); emit_caseconversion_table(f, "LuLl", upperlower) emit_caseconversion_table(f, "LlLu", lowerupper) f.write("}\n") def emit_caseconversion_table(f, name, table): - f.write(" static %s_table : &'static [(char, char)] = &[\n" % name) + f.write(" static %s_table : &'static [(char, char)] = &[\n" % name) sorted_table = sorted(table.iteritems(), key=operator.itemgetter(0)) ix = 0 for key, value in sorted_table: @@ -255,7 +256,7 @@ def format_table_content(f, content, indent): line = " "*indent + chunk f.write(line) -def emit_decomp_module(f, canon, compat, combine): +def emit_core_decomp_module(f, canon, compat): canon_keys = canon.keys() canon_keys.sort() @@ -279,23 +280,6 @@ def emit_decomp_module(f, canon, compat, combine): } None => None } - }\n -""") - - f.write(""" - fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 { - use cmp::{Equal, Less, Greater}; - match r.bsearch(|&(lo, hi, _)| { - if lo <= c && c <= hi { Equal } - else if hi < c { Less } - else { Greater } - }) { - Some(idx) => { - let (_, _, result) = r[idx]; - result - } - None => 0 - } }\n\n """) @@ -337,21 +321,10 @@ def emit_decomp_module(f, canon, compat, combine): format_table_content(f, data, 8) f.write("\n ];\n\n") - f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n") - ix = 0 - for pair in combine: - f.write(ch_prefix(ix)) - f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2])) - ix += 1 - f.write("\n ];\n") - f.write(" pub fn canonical(c: char, i: |char|) " + "{ d(c, i, false); }\n\n") f.write(" pub fn compatibility(c: char, i: |char|) " +"{ d(c, i, true); }\n\n") - f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n" - + " bsearch_range_value_table(c, combining_class_table)\n" - + " }\n\n") f.write(" fn d(c: char, i: |char|, k: bool) {\n") f.write(" use iter::Iterator;\n"); @@ -389,17 +362,43 @@ def emit_decomp_module(f, canon, compat, combine): f.write(" }\n") f.write("}\n\n") -r = "unicode.rs" -for i in [r]: - if os.path.exists(i): - os.remove(i); -rf = open(r, "w") +def emit_std_decomp_module(f, combine): + f.write("pub mod decompose {\n"); + f.write(" use option::{Some, None};\n"); + f.write(" use slice::ImmutableVector;\n"); -(canon_decomp, compat_decomp, gencats, - combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt") + f.write(""" + fn bsearch_range_value_table(c: char, r: &'static [(char, char, u8)]) -> u8 { + use cmp::{Equal, Less, Greater}; + match r.bsearch(|&(lo, hi, _)| { + if lo <= c && c <= hi { Equal } + else if hi < c { Less } + else { Greater } + }) { + Some(idx) => { + let (_, _, result) = r[idx]; + result + } + None => 0 + } + }\n\n +""") + + f.write(" static combining_class_table : &'static [(char, char, u8)] = &[\n") + ix = 0 + for pair in combine: + f.write(ch_prefix(ix)) + f.write("(%s, %s, %s)" % (escape_char(pair[0]), escape_char(pair[1]), pair[2])) + ix += 1 + f.write("\n ];\n\n") -# Preamble -rf.write('''// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT + f.write(" pub fn canonical_combining_class(c: char) -> u8 {\n" + + " bsearch_range_value_table(c, combining_class_table)\n" + + " }\n") + f.write("}\n") + + +preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -409,23 +408,45 @@ def emit_decomp_module(f, canon, compat, combine): // option. This file may not be copied, modified, or distributed // except according to those terms. -// The following code was generated by "src/etc/unicode.py" +// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly + +#![allow(missing_doc, non_uppercase_statics)] + +''' + +(canon_decomp, compat_decomp, gencats, + combines, lowerupper, upperlower) = load_unicode_data("UnicodeData.txt") + +def gen_core_unicode(): + r = "core_unicode.rs" + if os.path.exists(r): + os.remove(r); + with open(r, "w") as rf: + # Preamble + rf.write(preamble) -#![allow(missing_doc)] -#![allow(non_uppercase_statics)] + emit_bsearch_range_table(rf); + emit_property_module(rf, "general_category", gencats) -''') + emit_core_decomp_module(rf, canon_decomp, compat_decomp) -emit_bsearch_range_table(rf); -emit_property_module(rf, "general_category", gencats) + derived = load_properties("DerivedCoreProperties.txt", + ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]) -emit_decomp_module(rf, canon_decomp, compat_decomp, combines) + emit_property_module(rf, "derived_property", derived) -derived = load_properties("DerivedCoreProperties.txt", - ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]) + props = load_properties("PropList.txt", ["White_Space"]) + emit_property_module(rf, "property", props) + emit_conversions_module(rf, lowerupper, upperlower) -emit_property_module(rf, "derived_property", derived) +def gen_std_unicode(): + r = "std_unicode.rs" + if os.path.exists(r): + os.remove(r); + with open(r, "w") as rf: + # Preamble + rf.write(preamble) + emit_std_decomp_module(rf, combines) -props = load_properties("PropList.txt", ["White_Space"]) -emit_property_module(rf, "property", props) -emit_conversions_module(rf, lowerupper, upperlower) +gen_core_unicode() +gen_std_unicode() diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs index db016ad880741..b3298bde05547 100644 --- a/src/libcore/unicode.rs +++ b/src/libcore/unicode.rs @@ -1,4 +1,4 @@ -// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -8,10 +8,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -// The following code was generated by "src/etc/unicode.py" +// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly #![allow(missing_doc, non_uppercase_statics)] + fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { use cmp::{Equal, Less, Greater}; use slice::ImmutableVector; @@ -102,6 +103,7 @@ pub mod general_category { } } + pub mod decompose { use option::Option; use option::{Some, None}; @@ -123,7 +125,6 @@ pub mod decompose { } - // Canonical decompositions static canonical_table : &'static [(char, &'static [char])] = &[ ('\xc0', &['\x41', '\u0300']), ('\xc1', &['\x41', '\u0301']), ('\xc2', &['\x41', '\u0302']), @@ -3968,6 +3969,7 @@ pub mod derived_property { pub fn XID_Start(c: char) -> bool { super::bsearch_range_table(c, XID_Start_table) } + } pub mod property { @@ -3983,6 +3985,7 @@ pub mod property { pub fn White_Space(c: char) -> bool { super::bsearch_range_table(c, White_Space_table) } + } pub mod conversions { @@ -4501,7 +4504,7 @@ pub mod conversions { ('\U00010426', '\U0001044e'), ('\U00010427', '\U0001044f') ]; - static LlLu_table : &'static [(char, char)] = &[ + static LlLu_table : &'static [(char, char)] = &[ ('\x61', '\x41'), ('\x62', '\x42'), ('\x63', '\x43'), ('\x64', '\x44'), ('\x65', '\x45'), ('\x66', '\x46'), diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index be6e5d040a7c9..d534b30221b4a 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -1,4 +1,4 @@ -// Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -// The following code was generated by "src/etc/unicode.py" +// NOTE: The following code was generated by "src/etc/unicode.py", do not edit directly #![allow(missing_doc, non_uppercase_statics)] From fb59b7195768689f837a5fdaac04ffc63ae43d31 Mon Sep 17 00:00:00 2001 From: Florian Zeitz <florob@babelmonkeys.de> Date: Mon, 12 May 2014 21:53:53 +0200 Subject: [PATCH 2/4] core: Use appropriately sized integers for codepoints and bytes --- src/libcore/char.rs | 95 +++++++++++++++++++++------------------------ 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 7f2deb81f8c90..ca5e56f0649cc 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -33,13 +33,14 @@ use unicode::{derived_property, property, general_category, decompose, conversio #[cfg(not(test))] use default::Default; // UTF-8 ranges and tags for encoding characters -static TAG_CONT: uint = 128u; -static MAX_ONE_B: uint = 128u; -static TAG_TWO_B: uint = 192u; -static MAX_TWO_B: uint = 2048u; -static TAG_THREE_B: uint = 224u; -static MAX_THREE_B: uint = 65536u; -static TAG_FOUR_B: uint = 240u; +static TAG_CONT: u8 = 0b1000_0000u8; +static TAG_TWO_B: u8 = 0b1100_0000u8; +static TAG_THREE_B: u8 = 0b1110_0000u8; +static TAG_FOUR_B: u8 = 0b1111_0000u8; +static MAX_ONE_B: u32 = 0x80u32; +static MAX_TWO_B: u32 = 0x800u32; +static MAX_THREE_B: u32 = 0x10000u32; +static MAX_FOUR_B: u32 = 0x200000u32; /* Lu Uppercase_Letter an uppercase letter @@ -285,37 +286,37 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> { } // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: uint = 0xAC00; -static L_BASE: uint = 0x1100; -static V_BASE: uint = 0x1161; -static T_BASE: uint = 0x11A7; -static L_COUNT: uint = 19; -static V_COUNT: uint = 21; -static T_COUNT: uint = 28; -static N_COUNT: uint = (V_COUNT * T_COUNT); -static S_COUNT: uint = (L_COUNT * N_COUNT); +static S_BASE: u32 = 0xAC00; +static L_BASE: u32 = 0x1100; +static V_BASE: u32 = 0x1161; +static T_BASE: u32 = 0x11A7; +static L_COUNT: u32 = 19; +static V_COUNT: u32 = 21; +static T_COUNT: u32 = 28; +static N_COUNT: u32 = (V_COUNT * T_COUNT); +static S_COUNT: u32 = (L_COUNT * N_COUNT); // Decompose a precomposed Hangul syllable fn decompose_hangul(s: char, f: |char|) { - let si = s as uint - S_BASE; + let si = s as u32 - S_BASE; let li = si / N_COUNT; unsafe { - f(transmute((L_BASE + li) as u32)); + f(transmute(L_BASE + li)); let vi = (si % N_COUNT) / T_COUNT; - f(transmute((V_BASE + vi) as u32)); + f(transmute(V_BASE + vi)); let ti = si % T_COUNT; if ti > 0 { - f(transmute((T_BASE + ti) as u32)); + f(transmute(T_BASE + ti)); } } } /// Returns the canonical decomposition of a character pub fn decompose_canonical(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { + if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { decompose::canonical(c, f); } else { decompose_hangul(c, f); @@ -324,7 +325,7 @@ pub fn decompose_canonical(c: char, f: |char|) { /// Returns the compatibility decomposition of a character pub fn decompose_compatible(c: char, f: |char|) { - if (c as uint) < S_BASE || (c as uint) >= (S_BASE + S_COUNT) { + if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { decompose::compatibility(c, f); } else { decompose_hangul(c, f); @@ -386,12 +387,7 @@ pub fn escape_default(c: char, f: |char|) { /// Returns the amount of bytes this `char` would need if encoded in UTF-8 pub fn len_utf8_bytes(c: char) -> uint { - static MAX_ONE_B: uint = 128u; - static MAX_TWO_B: uint = 2048u; - static MAX_THREE_B: uint = 65536u; - static MAX_FOUR_B: uint = 2097152u; - - let code = c as uint; + let code = c as u32; match () { _ if code < MAX_ONE_B => 1u, _ if code < MAX_TWO_B => 2u, @@ -606,41 +602,40 @@ impl Char for char { fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) } - fn encode_utf8(&self, dst: &mut [u8]) -> uint { - let code = *self as uint; + fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> uint { + let code = *self as u32; if code < MAX_ONE_B { dst[0] = code as u8; - return 1; + 1 } else if code < MAX_TWO_B { - dst[0] = (code >> 6u & 31u | TAG_TWO_B) as u8; - dst[1] = (code & 63u | TAG_CONT) as u8; - return 2; + dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B; + dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT; + 2 } else if code < MAX_THREE_B { - dst[0] = (code >> 12u & 15u | TAG_THREE_B) as u8; - dst[1] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[2] = (code & 63u | TAG_CONT) as u8; - return 3; + dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B; + dst[1] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT; + 3 } else { - dst[0] = (code >> 18u & 7u | TAG_FOUR_B) as u8; - dst[1] = (code >> 12u & 63u | TAG_CONT) as u8; - dst[2] = (code >> 6u & 63u | TAG_CONT) as u8; - dst[3] = (code & 63u | TAG_CONT) as u8; - return 4; + dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B; + dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT; + dst[2] = (code >> 6u & 0x3F_u32) as u8 | TAG_CONT; + dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT; + 4 } } fn encode_utf16(&self, dst: &mut [u16]) -> uint { - let mut ch = *self as uint; - if (ch & 0xFFFF_u) == ch { - // The BMP falls through (assuming non-surrogate, as it - // should) - assert!(ch <= 0xD7FF_u || ch >= 0xE000_u); + let mut ch = *self as u32; + if (ch & 0xFFFF_u32) == ch { + // The BMP falls through (assuming non-surrogate, as it should) + assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32); dst[0] = ch as u16; 1 } else { // Supplementary planes break into surrogates. - assert!(ch >= 0x1_0000_u && ch <= 0x10_FFFF_u); - ch -= 0x1_0000_u; + assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32); + ch -= 0x1_0000_u32; dst[0] = 0xD800_u16 | ((ch >> 10) as u16); dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16); 2 From 32a20bfa5f467b4713436c5c890b0a92b26da8a5 Mon Sep 17 00:00:00 2001 From: Florian Zeitz <florob@babelmonkeys.de> Date: Mon, 12 May 2014 22:25:38 +0200 Subject: [PATCH 3/4] core: Move Hangul decomposition into unicode.rs --- src/etc/unicode.py | 77 +++++++++++++++++++++++++++++++----------- src/libcore/char.rs | 54 ++++------------------------- src/libcore/unicode.rs | 48 ++++++++++++++++++++++++-- 3 files changed, 110 insertions(+), 69 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index e98c65ca50eee..f079ef73cd8e2 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -321,17 +321,24 @@ def emit_core_decomp_module(f, canon, compat): format_table_content(f, data, 8) f.write("\n ];\n\n") - f.write(" pub fn canonical(c: char, i: |char|) " - + "{ d(c, i, false); }\n\n") - f.write(" pub fn compatibility(c: char, i: |char|) " - +"{ d(c, i, true); }\n\n") - f.write(" fn d(c: char, i: |char|, k: bool) {\n") - f.write(" use iter::Iterator;\n"); + f.write(""" + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } - f.write(" if c <= '\\x7f' { i(c); return; }\n") + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } - # First check the canonical decompositions - f.write(""" + fn d(c: char, i: |char|, k: bool) { + use iter::Iterator; + + // 7-bit ASCII never decomposes + if c <= '\\x7f' { i(c); return; } + + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -340,13 +347,12 @@ def emit_core_decomp_module(f, canon, compat): return; } None => () - }\n\n""") + } - # Bottom out if we're not doing compat. - f.write(" if !k { i(c); return; }\n") + // Bottom out if we're not doing compat. + if !k { i(c); return; } - # Then check the compatibility decompositions - f.write(""" + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -355,12 +361,45 @@ def emit_core_decomp_module(f, canon, compat): return; } None => () - }\n\n""") + } - # Finally bottom out. - f.write(" i(c);\n") - f.write(" }\n") - f.write("}\n\n") + // Finally bottom out. + i(c); + } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use cast::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } +} + +""") def emit_std_decomp_module(f, combine): f.write("pub mod decompose {\n"); diff --git a/src/libcore/char.rs b/src/libcore/char.rs index ca5e56f0649cc..71a2d75715b5a 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -27,7 +27,12 @@ use mem::transmute; use option::{None, Option, Some}; use iter::{Iterator, range_step}; -use unicode::{derived_property, property, general_category, decompose, conversions}; +use unicode::{derived_property, property, general_category, conversions}; + +/// Returns the canonical decomposition of a character. +pub use unicode::decompose::decompose_canonical; +/// Returns the compatibility decomposition of a character. +pub use unicode::decompose::decompose_compatible; #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering}; #[cfg(not(test))] use default::Default; @@ -285,53 +290,6 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> { } } -// Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior -static S_BASE: u32 = 0xAC00; -static L_BASE: u32 = 0x1100; -static V_BASE: u32 = 0x1161; -static T_BASE: u32 = 0x11A7; -static L_COUNT: u32 = 19; -static V_COUNT: u32 = 21; -static T_COUNT: u32 = 28; -static N_COUNT: u32 = (V_COUNT * T_COUNT); -static S_COUNT: u32 = (L_COUNT * N_COUNT); - -// Decompose a precomposed Hangul syllable -fn decompose_hangul(s: char, f: |char|) { - let si = s as u32 - S_BASE; - - let li = si / N_COUNT; - unsafe { - f(transmute(L_BASE + li)); - - let vi = (si % N_COUNT) / T_COUNT; - f(transmute(V_BASE + vi)); - - let ti = si % T_COUNT; - if ti > 0 { - f(transmute(T_BASE + ti)); - } - } -} - -/// Returns the canonical decomposition of a character -pub fn decompose_canonical(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::canonical(c, f); - } else { - decompose_hangul(c, f); - } -} - -/// Returns the compatibility decomposition of a character -pub fn decompose_compatible(c: char, f: |char|) { - if (c as u32) < S_BASE || (c as u32) >= (S_BASE + S_COUNT) { - decompose::compatibility(c, f); - } else { - decompose_hangul(c, f); - } -} - /// /// Returns the hexadecimal Unicode escape of a `char` /// diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs index b3298bde05547..bffde2323bf94 100644 --- a/src/libcore/unicode.rs +++ b/src/libcore/unicode.rs @@ -2121,14 +2121,24 @@ pub mod decompose { &['\u53ef']) ]; - pub fn canonical(c: char, i: |char|) { d(c, i, false); } - pub fn compatibility(c: char, i: |char|) { d(c, i, true); } + pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); } + + pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); } fn d(c: char, i: |char|, k: bool) { use iter::Iterator; + + // 7-bit ASCII never decomposes if c <= '\x7f' { i(c); return; } + // Perform decomposition for Hangul + if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) { + decompose_hangul(c, i); + return; + } + + // First check the canonical decompositions match bsearch_table(c, canonical_table) { Some(canon) => { for x in canon.iter() { @@ -2139,8 +2149,10 @@ pub mod decompose { None => () } + // Bottom out if we're not doing compat. if !k { i(c); return; } + // Then check the compatibility decompositions match bsearch_table(c, compatibility_table) { Some(compat) => { for x in compat.iter() { @@ -2151,8 +2163,40 @@ pub mod decompose { None => () } + // Finally bottom out. i(c); } + + // Constants from Unicode 6.2.0 Section 3.12 Conjoining Jamo Behavior + static S_BASE: u32 = 0xAC00; + static L_BASE: u32 = 0x1100; + static V_BASE: u32 = 0x1161; + static T_BASE: u32 = 0x11A7; + static L_COUNT: u32 = 19; + static V_COUNT: u32 = 21; + static T_COUNT: u32 = 28; + static N_COUNT: u32 = (V_COUNT * T_COUNT); + static S_COUNT: u32 = (L_COUNT * N_COUNT); + + // Decompose a precomposed Hangul syllable + fn decompose_hangul(s: char, f: |char|) { + use mem::transmute; + + let si = s as u32 - S_BASE; + + let li = si / N_COUNT; + unsafe { + f(transmute(L_BASE + li)); + + let vi = (si % N_COUNT) / T_COUNT; + f(transmute(V_BASE + vi)); + + let ti = si % T_COUNT; + if ti > 0 { + f(transmute(T_BASE + ti)); + } + } + } } pub mod derived_property { From 748061e08908fb4fca7e1373ccfc129e12c734ac Mon Sep 17 00:00:00 2001 From: Florian Zeitz <florob@babelmonkeys.de> Date: Mon, 12 May 2014 22:44:21 +0200 Subject: [PATCH 4/4] std: Rename str::Normalizations to str::Decompositions The Normalizations iterator has been renamed to Decompositions. It does not currently include all forms of Unicode normalization, but only encompasses decompositions. If implemented recomposition would likely be a separate iterator which works on the result of this one. [breaking-change] --- src/etc/unicode.py | 12 ++++++------ src/libcore/char.rs | 4 ++-- src/libcore/unicode.rs | 2 +- src/libstd/str.rs | 32 ++++++++++++++++---------------- src/libstd/unicode.rs | 2 +- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/etc/unicode.py b/src/etc/unicode.py index f079ef73cd8e2..586890ebe4c9a 100755 --- a/src/etc/unicode.py +++ b/src/etc/unicode.py @@ -256,13 +256,13 @@ def format_table_content(f, content, indent): line = " "*indent + chunk f.write(line) -def emit_core_decomp_module(f, canon, compat): +def emit_core_norm_module(f, canon, compat): canon_keys = canon.keys() canon_keys.sort() compat_keys = compat.keys() compat_keys.sort() - f.write("pub mod decompose {\n"); + f.write("pub mod normalization {\n"); f.write(" use option::Option;\n"); f.write(" use option::{Some, None};\n"); f.write(" use slice::ImmutableVector;\n"); @@ -401,8 +401,8 @@ def emit_core_decomp_module(f, canon, compat): """) -def emit_std_decomp_module(f, combine): - f.write("pub mod decompose {\n"); +def emit_std_norm_module(f, combine): + f.write("pub mod normalization {\n"); f.write(" use option::{Some, None};\n"); f.write(" use slice::ImmutableVector;\n"); @@ -467,7 +467,7 @@ def gen_core_unicode(): emit_bsearch_range_table(rf); emit_property_module(rf, "general_category", gencats) - emit_core_decomp_module(rf, canon_decomp, compat_decomp) + emit_core_norm_module(rf, canon_decomp, compat_decomp) derived = load_properties("DerivedCoreProperties.txt", ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]) @@ -485,7 +485,7 @@ def gen_std_unicode(): with open(r, "w") as rf: # Preamble rf.write(preamble) - emit_std_decomp_module(rf, combines) + emit_std_norm_module(rf, combines) gen_core_unicode() gen_std_unicode() diff --git a/src/libcore/char.rs b/src/libcore/char.rs index 71a2d75715b5a..934483dbed423 100644 --- a/src/libcore/char.rs +++ b/src/libcore/char.rs @@ -30,9 +30,9 @@ use iter::{Iterator, range_step}; use unicode::{derived_property, property, general_category, conversions}; /// Returns the canonical decomposition of a character. -pub use unicode::decompose::decompose_canonical; +pub use unicode::normalization::decompose_canonical; /// Returns the compatibility decomposition of a character. -pub use unicode::decompose::decompose_compatible; +pub use unicode::normalization::decompose_compatible; #[cfg(not(test))] use cmp::{Eq, Ord, TotalEq, TotalOrd, Ordering}; #[cfg(not(test))] use default::Default; diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs index bffde2323bf94..242672de2967a 100644 --- a/src/libcore/unicode.rs +++ b/src/libcore/unicode.rs @@ -104,7 +104,7 @@ pub mod general_category { } -pub mod decompose { +pub mod normalization { use option::Option; use option::{Some, None}; use slice::ImmutableVector; diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 24cf9681ca84d..fa4cf8e4427d0 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -228,25 +228,25 @@ fn canonical_sort(comb: &mut [(char, u8)]) { } #[deriving(Clone)] -enum NormalizationForm { - NFD, - NFKD +enum DecompositionType { + Canonical, + Compatible } -/// External iterator for a string's normalization's characters. +/// External iterator for a string's decomposition's characters. /// Use with the `std::iter` module. #[deriving(Clone)] -pub struct Normalizations<'a> { - kind: NormalizationForm, +pub struct Decompositions<'a> { + kind: DecompositionType, iter: Chars<'a>, buffer: Vec<(char, u8)>, sorted: bool } -impl<'a> Iterator<char> for Normalizations<'a> { +impl<'a> Iterator<char> for Decompositions<'a> { #[inline] fn next(&mut self) -> Option<char> { - use unicode::decompose::canonical_combining_class; + use unicode::normalization::canonical_combining_class; match self.buffer.as_slice().head() { Some(&(c, 0)) => { @@ -262,8 +262,8 @@ impl<'a> Iterator<char> for Normalizations<'a> { } let decomposer = match self.kind { - NFD => char::decompose_canonical, - NFKD => char::decompose_compatible + Canonical => char::decompose_canonical, + Compatible => char::decompose_compatible }; if !self.sorted { @@ -887,24 +887,24 @@ pub trait StrAllocating: Str { /// An Iterator over the string in Unicode Normalization Form D /// (canonical decomposition). #[inline] - fn nfd_chars<'a>(&'a self) -> Normalizations<'a> { - Normalizations { + fn nfd_chars<'a>(&'a self) -> Decompositions<'a> { + Decompositions { iter: self.as_slice().chars(), buffer: Vec::new(), sorted: false, - kind: NFD + kind: Canonical } } /// An Iterator over the string in Unicode Normalization Form KD /// (compatibility decomposition). #[inline] - fn nfkd_chars<'a>(&'a self) -> Normalizations<'a> { - Normalizations { + fn nfkd_chars<'a>(&'a self) -> Decompositions<'a> { + Decompositions { iter: self.as_slice().chars(), buffer: Vec::new(), sorted: false, - kind: NFKD + kind: Compatible } } } diff --git a/src/libstd/unicode.rs b/src/libstd/unicode.rs index d534b30221b4a..03c960e96ffe1 100644 --- a/src/libstd/unicode.rs +++ b/src/libstd/unicode.rs @@ -12,7 +12,7 @@ #![allow(missing_doc, non_uppercase_statics)] -pub mod decompose { +pub mod normalization { use option::{Some, None}; use slice::ImmutableVector;