Add support to libcore for encoded-in-rust unicode character properties, at least. Add script to compute them from unicode.org.

graydon · graydon · commit ac13f0da9ee2 · 2011-12-23T18:48:08.000-08:00
diff --git a/src/etc/unicode.py b/src/etc/unicode.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+
+# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
+# code covering the core properties. Since this is a pretty rare event we
+# just store this out-of-line and check the unicode.rs file into git.
+#
+# The emitted code is "the minimum we think is necessary for libcore", that
+# is, to support basic operations of the compiler and "most nontrivial rust
+# programs". It is not meant to be a complete implementation of unicode.
+# For that we recommend you use a proper binding to libicu.
+
+import fileinput, re, os, sys
+
+
+def fetch(f):
+    """Make sure the Unicode data file `f` exists in the current directory.
+
+    If `f` is missing, it is downloaded from unicode.org by shelling out to
+    curl.  If it is still missing afterwards, an error is written to stderr
+    and the script exits with status 1.
+    """
+    if not os.path.exists(f):
+        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
+                  % f)
+
+    if not os.path.exists(f):
+        # Terminate the message with a newline so it does not run into the
+        # shell prompt, and use sys.exit(): the bare exit() builtin is only
+        # installed by the site module and is not guaranteed to exist.
+        sys.stderr.write("cannot load %s\n" % f)
+        sys.exit(1)
+
+
+def load_general_categories(f):
+    """Parse UnicodeData.txt into {general category: [(lo, hi), ...]}.
+
+    `f` is the file name; it is fetched first if missing.  Each (lo, hi)
+    pair is an inclusive range of code points, listed in file order.  Lines
+    that do not have exactly 15 ';'-separated fields are skipped.
+    """
+    fetch(f)
+    gencats = {}
+    curr_cat = None
+    c_lo = 0
+    c_hi = 0
+    for line in fileinput.input(f):
+        fields = line.split(";")
+        if len(fields) != 15:
+            continue
+        # Only the first three fields are used: code point, name, category.
+        code = int(fields[0], 16)
+        name = fields[1]
+        gencat = fields[2]
+
+        # Grow the current run only when the code point is adjacent to it, or
+        # when this line closes a "<..., First>" / "<..., Last>" pair, which
+        # is how the file abbreviates large blocks.  Any other jump skips
+        # over unassigned code points, which must not inherit the category.
+        if curr_cat == gencat and (code == c_hi + 1 or
+                                   name.endswith(", Last>")):
+            c_hi = code
+            continue
+
+        if curr_cat is not None:
+            gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
+        curr_cat = gencat
+        c_lo = code
+        c_hi = code
+
+    # Flush the run that is still open at end of file; without this the last
+    # range in the file would be silently dropped.
+    if curr_cat is not None:
+        gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
+    return gencats
+
+
+def load_derived_core_properties(f):
+    """Parse DerivedCoreProperties.txt into {property: [(lo, hi), ...]}.
+
+    `f` is the file name; it is fetched first if missing.  Only the
+    properties XID_Start, XID_Continue and Alphabetic are kept.  Each
+    (lo, hi) pair is an inclusive code point range, listed in file order;
+    a line naming a single code point yields lo == hi.  Lines matching
+    neither pattern (comments, blanks) are skipped.
+    """
+    fetch(f)
+    derivedprops = {}
+    interestingprops = ["XID_Start", "XID_Continue", "Alphabetic"]
+    # Raw strings: "\w" and "\." are not valid Python string escapes, and
+    # newer interpreters warn about them in ordinary string literals.
+    re1 = re.compile(r"^([0-9A-F]+) +; (\w+)")
+    re2 = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
+
+    for line in fileinput.input(f):
+        m = re1.match(line)
+        if m:
+            # "XXXX ; Prop": a single code point.
+            d_lo = d_hi = m.group(1)
+            prop = m.group(2)
+        else:
+            m = re2.match(line)
+            if not m:
+                continue
+            # "XXXX..YYYY ; Prop": an inclusive range.
+            d_lo, d_hi, prop = m.groups()
+        if prop not in interestingprops:
+            continue
+        derivedprops.setdefault(prop, []).append(
+            (int(d_lo, 16), int(d_hi, 16)))
+    return derivedprops
+
+def escape_char(c):
+    """Return code point `c` as a quoted Rust character literal.
+
+    The shortest escape that fits is used: '\\xNN' up to 0xff, '\\uNNNN' up
+    to 0xffff, and '\\UNNNNNNNN' above that.
+    """
+    for limit, template in ((0xff, "'\\x%2.2x'"), (0xffff, "'\\u%4.4x'")):
+        if c <= limit:
+            return template % c
+    return "'\\U%8.8x'" % c
+
+def emit_rust_module(f, mod, tbl):
+    """Write `tbl` to file object `f` as a Rust module named `mod`.
+
+    `tbl` maps a property name to a list of inclusive (lo, hi) code point
+    pairs.  One `pure fn <name>(c: char) -> bool` is emitted per property,
+    in sorted name order, whose body is an `alt` over the property's
+    characters and ranges.
+    """
+    f.write("mod %s {\n" % mod)
+    # sorted() rather than tbl.keys().sort(): the latter only works where
+    # keys() returns a list, i.e. it breaks on Python 3.
+    for cat in sorted(tbl):
+        f.write("    pure fn %s(c: char) -> bool {\n" % cat)
+        f.write("        ret alt c {\n")
+        # The first alternative is introduced by a space, later ones by '|'.
+        prefix = ' '
+        for (lo, hi) in tbl[cat]:
+            if lo == hi:
+                f.write("            %c %s\n" %
+                        (prefix, escape_char(lo)))
+            else:
+                f.write("            %c %s to %s\n" %
+                        (prefix,
+                         escape_char(lo),
+                         escape_char(hi)))
+            prefix = '|'
+        f.write("              { true }\n")
+        f.write("            _ { false }\n")
+        f.write("        };\n")
+        f.write("    }\n\n")
+    f.write("}\n")
+
+
+def emit_cpp_module(f, mod, tbl):
+    """Write `tbl` to file object `f` as C++ predicate functions.
+
+    `tbl` maps a property name to a list of inclusive (lo, hi) code point
+    pairs.  One `bool <mod>_<name>(unsigned c)` is emitted per property, in
+    sorted name order: real ranges become `if` tests, and single code
+    points are collected into one `switch`.
+    """
+    # sorted() rather than tbl.keys().sort(): the latter only works where
+    # keys() returns a list, i.e. it breaks on Python 3.
+    for cat in sorted(tbl):
+
+        singles = []
+        ranges = []
+
+        for pair in tbl[cat]:
+            if pair[0] == pair[1]:
+                singles.append(pair[0])
+            else:
+                ranges.append(pair)
+
+        f.write("bool %s_%s(unsigned c) {\n" % (mod, cat))
+        for pair in ranges:
+            f.write("    if (0x%x <= c && c <= 0x%x) { return true; }\n"
+                    % pair)
+        if singles:
+            f.write("    switch (c) {\n")
+            for single in singles:
+                f.write("      case 0x%x:\n" % single)
+            f.write("        return true;\n")
+            f.write("      default:\n")
+            f.write("        return false;\n")
+            f.write("    }\n")
+        # Fallthrough result when there was no switch to return from.
+        f.write("return false;\n")
+        f.write("}\n\n")
+
+
+def emit_module(rf, cf, mod, tbl):
+    """Emit the property table `tbl` as module `mod` in both output
+    languages: Rust source to `rf` first, then C++ source to `cf`."""
+    for emit, out in ((emit_rust_module, rf), (emit_cpp_module, cf)):
+        emit(out, mod, tbl)
+
+# Driver: regenerate unicode.rs and unicode.cpp in the current directory
+# from the two Unicode data files (downloaded on demand by the loaders).
+r = "unicode.rs"
+c = "unicode.cpp"
+for i in [r, c]:
+    if os.path.exists(i):
+        os.remove(i)
+
+rf = open(r, "w")
+cf = open(c, "w")
+
+# Close both outputs explicitly, even if a loader or emitter fails, so
+# whatever was written is flushed rather than left to interpreter shutdown.
+try:
+    emit_module(rf, cf, "general_category",
+                load_general_categories("UnicodeData.txt"))
+
+    emit_module(rf, cf, "derived_property",
+                load_derived_core_properties("DerivedCoreProperties.txt"))
+finally:
+    rf.close()
+    cf.close()
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -4,96 +4,75 @@ Module: char
 Utilities for manipulating the char type
 */
 
+/*
+    Lu  Uppercase_Letter    an uppercase letter
+    Ll  Lowercase_Letter    a lowercase letter
+    Lt  Titlecase_Letter    a digraphic character, with first part uppercase
+    Lm  Modifier_Letter     a modifier letter
+    Lo  Other_Letter    other letters, including syllables and ideographs
+    Mn  Nonspacing_Mark     a nonspacing combining mark (zero advance width)
+    Mc  Spacing_Mark    a spacing combining mark (positive advance width)
+    Me  Enclosing_Mark  an enclosing combining mark
+    Nd  Decimal_Number  a decimal digit
+    Nl  Letter_Number   a letterlike numeric character
+    No  Other_Number    a numeric character of other type
+    Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
+    Pd  Dash_Punctuation    a dash or hyphen punctuation mark
+    Ps  Open_Punctuation    an opening punctuation mark (of a pair)
+    Pe  Close_Punctuation   a closing punctuation mark (of a pair)
+    Pi  Initial_Punctuation     an initial quotation mark
+    Pf  Final_Punctuation   a final quotation mark
+    Po  Other_Punctuation   a punctuation mark of other type
+    Sm  Math_Symbol     a symbol of primarily mathematical use
+    Sc  Currency_Symbol     a currency sign
+    Sk  Modifier_Symbol     a non-letterlike modifier symbol
+    So  Other_Symbol    a symbol of other type
+    Zs  Space_Separator     a space character (of various non-zero widths)
+    Zl  Line_Separator  U+2028 LINE SEPARATOR only
+    Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
+    Cc  Control     a C0 or C1 control code
+    Cf  Format  a format control character
+    Cs  Surrogate   a surrogate code point
+    Co  Private_Use     a private-use character
+    Cn  Unassigned  a reserved unassigned code point or a noncharacter
+*/
+
+import is_alphabetic = unicode::derived_property::Alphabetic;
+import is_XID_start = unicode::derived_property::XID_Start;
+import is_XID_continue = unicode::derived_property::XID_Continue;
+
 /*
 Function: is_whitespace
 
-Indicates whether a character is whitespace.
+Indicates whether a character is whitespace, defined in terms of
+the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
+'Cc'-category control codes in the range [0x09, 0x0d].
 
-Whitespace characters include space (U+0020), tab (U+0009), line feed
-(U+000A), carriage return (U+000D), and a number of less common
-ASCII and unicode characters.
 */
 pure fn is_whitespace(c: char) -> bool {
-    const ch_space: char = '\u0020';
-    const ch_ogham_space_mark: char = '\u1680';
-    const ch_mongolian_vowel_sep: char = '\u180e';
-    const ch_en_quad: char = '\u2000';
-    const ch_em_quad: char = '\u2001';
-    const ch_en_space: char = '\u2002';
-    const ch_em_space: char = '\u2003';
-    const ch_three_per_em_space: char = '\u2004';
-    const ch_four_per_em_space: char = '\u2005';
-    const ch_six_per_em_space: char = '\u2006';
-    const ch_figure_space: char = '\u2007';
-    const ch_punctuation_space: char = '\u2008';
-    const ch_thin_space: char = '\u2009';
-    const ch_hair_space: char = '\u200a';
-    const ch_narrow_no_break_space: char = '\u202f';
-    const ch_medium_mathematical_space: char = '\u205f';
-    const ch_ideographic_space: char = '\u3000';
-    const ch_line_separator: char = '\u2028';
-    const ch_paragraph_separator: char = '\u2029';
-    const ch_character_tabulation: char = '\u0009';
-    const ch_line_feed: char = '\u000a';
-    const ch_line_tabulation: char = '\u000b';
-    const ch_form_feed: char = '\u000c';
-    const ch_carriage_return: char = '\u000d';
-    const ch_next_line: char = '\u0085';
-    const ch_no_break_space: char = '\u00a0';
-
-    if c == ch_space {
-        true
-    } else if c == ch_ogham_space_mark {
-        true
-    } else if c == ch_mongolian_vowel_sep {
-        true
-    } else if c == ch_en_quad {
-        true
-    } else if c == ch_em_quad {
-        true
-    } else if c == ch_en_space {
-        true
-    } else if c == ch_em_space {
-        true
-    } else if c == ch_three_per_em_space {
-        true
-    } else if c == ch_four_per_em_space {
-        true
-    } else if c == ch_six_per_em_space {
-        true
-    } else if c == ch_figure_space {
-        true
-    } else if c == ch_punctuation_space {
-        true
-    } else if c == ch_thin_space {
-        true
-    } else if c == ch_hair_space {
-        true
-    } else if c == ch_narrow_no_break_space {
-        true
-    } else if c == ch_medium_mathematical_space {
-        true
-    } else if c == ch_ideographic_space {
-        true
-    } else if c == ch_line_separator {
-        true
-    } else if c == ch_paragraph_separator {
-        true
-    } else if c == ch_character_tabulation {
-        true
-    } else if c == ch_line_feed {
-        true
-    } else if c == ch_line_tabulation {
-        true
-    } else if c == ch_form_feed {
-        true
-    } else if c == ch_carriage_return {
-        true
-    } else if c == ch_next_line {
-        true
-    } else if c == ch_no_break_space { true } else { false }
+    // Tab, line feed, line tabulation, form feed and carriage return are
+    // control codes rather than separators, so they need an explicit range
+    // test in addition to the three separator categories.
+    // NOTE(review): the implementation this replaces also accepted U+0085
+    // (next line), which falls outside this range -- confirm that dropping
+    // it is intended.
+    ret ('\x09' <= c && c <= '\x0d')
+        || unicode::general_category::Zs(c)
+        || unicode::general_category::Zl(c)
+        || unicode::general_category::Zp(c);
+}
+
+/*
+Function: is_alphanumeric
+
+Indicates whether a character is alphanumeric, defined in terms of
+the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
+Core Property 'Alphabetic'.
+
+Returns true if `c` has the 'Alphabetic' property or belongs to any of
+the three number categories: 'Nd' (decimal digit), 'Nl' (letterlike
+numeric character) or 'No' (numeric character of other type).
+
+*/
+
+pure fn is_alphanumeric(c: char) -> bool {
+    // Letters are tested first; the number categories are only consulted
+    // when the character is not alphabetic.
+    ret unicode::derived_property::Alphabetic(c) ||
+        unicode::general_category::Nd(c) ||
+        unicode::general_category::Nl(c) ||
+        unicode::general_category::No(c);
 }
 
+
 /*
  Function: to_digit
 
diff --git a/src/libcore/core.rc b/src/libcore/core.rc
@@ -30,6 +30,9 @@ mod u64;
 mod vec;
 mod bool;
 
+// For internal use by char, not exported
+mod unicode;
+
 
 // Ubiquitous-utility-type modules
 
diff --git a/src/libcore/unicode.rs b/src/libcore/unicode.rs