add UAX#29 word bounds algorithm to libunicode #24340

Closed · wants to merge 1 commit
src/etc/unicode.py (57 changes: 36 additions & 21 deletions)
@@ -15,6 +15,7 @@
 # - DerivedNormalizationProps.txt
 # - EastAsianWidth.txt
 # - auxiliary/GraphemeBreakProperty.txt
+# - auxiliary/WordBreakProperty.txt
 # - PropList.txt
 # - ReadMe.txt
 # - Scripts.txt
@@ -290,11 +291,13 @@ def emit_bsearch_range_table(f):
 """)

 def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
-        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
-    pub_string = ""
+        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
+    pub_string = "const"
+    if not is_const:
+        pub_string = "let"
     if is_pub:
-        pub_string = "pub "
-    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
+        pub_string = "pub " + pub_string
+    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type))
     data = ""
     first = True
     for dat in t_data:
@@ -375,21 +378,25 @@ def emit_conversions_module(f, lowerupper, upperlower):
         sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
     f.write("}\n\n")

-def emit_grapheme_module(f, grapheme_table, grapheme_cats):
-    f.write("""pub mod grapheme {
+def emit_break_module(f, break_table, break_cats, name):
+    Name = name.capitalize()
+    f.write("""pub mod %s {
     use core::slice::SliceExt;
-    pub use self::GraphemeCat::*;
+    pub use self::%sCat::*;
     use core::result::Result::{Ok, Err};

     #[allow(non_camel_case_types)]
-    #[derive(Clone, Copy)]
-    pub enum GraphemeCat {
-""")
-    for cat in grapheme_cats + ["Any"]:
-        f.write("        GC_" + cat + ",\n")
+    #[derive(Clone, Copy, PartialEq, Eq)]
+    pub enum %sCat {
+""" % (name, Name, Name))
+
+    break_cats.append("Any")
+    break_cats.sort()
+    for cat in break_cats:
+        f.write(("        %sC_" % Name[0]) + cat + ",\n")
     f.write("""    }

-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -400,19 +407,19 @@ def emit_grapheme_module(f, grapheme_table, grapheme_cats):
                 let (_, _, cat) = r[idx];
                 cat
             }
-            Err(_) => GC_Any
+            Err(_) => %sC_Any
         }
     }

-    pub fn grapheme_category(c: char) -> GraphemeCat {
-        bsearch_range_value_table(c, grapheme_cat_table)
+    pub fn %s_category(c: char) -> %sCat {
+        bsearch_range_value_table(c, %s_cat_table)
     }

-""")
+""" % (Name, Name, Name[0], name, Name, name))

-    emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
-        pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
-        is_pub=False)
+    emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
+        pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
+        is_pub=False, is_const=True)
     f.write("}\n")

 def emit_charwidth_module(f, width_table):
@@ -690,4 +697,12 @@ def optimize_width_table(wtable):
     for cat in grapheme_cats:
         grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
     grapheme_table.sort(key=lambda w: w[0])
-    emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
+    emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
+    rf.write("\n")
+
+    word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
+    word_table = []
+    for cat in word_cats:
+        word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
+    word_table.sort(key=lambda w: w[0])
+    emit_break_module(rf, word_table, word_cats.keys(), "word")
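Editor's note: for orientation, here is a minimal, self-contained sketch of the kind of module emit_break_module emits once per break property ("grapheme" and now "word"). It follows the templates in the diff above, but it is only an approximation of the generated libunicode tables: it uses std rather than core, modern upper-case const naming (the generator writes a lower-case word_cat_table), and two stand-in ASCII ranges instead of the full WordBreakProperty.txt data.

// Sketch only: the real generated table covers every Word_Break category.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum WordCat {
    WC_ALetter,
    WC_Numeric,
    WC_Any,
}
use WordCat::*;

// (start, end, category) ranges, sorted by start; end is inclusive.
const WORD_CAT_TABLE: &[(char, char, WordCat)] = &[
    ('0', '9', WC_Numeric),
    ('A', 'Z', WC_ALetter),
    ('a', 'z', WC_ALetter),
];

// Same lookup strategy as the generated bsearch_range_value_table: binary-search
// the sorted ranges and fall back to WC_Any when no range contains the char.
fn bsearch_range_value_table(c: char, r: &[(char, char, WordCat)]) -> WordCat {
    use std::cmp::Ordering::{Equal, Greater, Less};
    match r.binary_search_by(|&(lo, hi, _)| {
        if lo <= c && c <= hi { Equal } else if hi < c { Less } else { Greater }
    }) {
        Ok(idx) => r[idx].2,
        Err(_) => WC_Any,
    }
}

pub fn word_category(c: char) -> WordCat {
    bsearch_range_value_table(c, WORD_CAT_TABLE)
}

fn main() {
    assert_eq!(word_category('7'), WC_Numeric);
    assert_eq!(word_category('x'), WC_ALetter);
    assert_eq!(word_category('!'), WC_Any);
}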
src/etc/unicode_gen_breaktests.py (197 changes: 197 additions & 0 deletions)
@@ -0,0 +1,197 @@
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - GraphemeBreakTest.txt
# - WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
    outls = []
    testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    data = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        if len(line) < 2 or line[0:2] != '÷':
            continue

        m = testRe1.match(line)
        if not m:
            print "error: no match on line where test was expected: %s" % line
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        outls.append((chars, info))

    return outls

def process_split_info(s, c, o):
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    isX = False
    if s[0:2] == '×':
        isX = True

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or s[sInd:eInd] in o:
            outis.append(s[sInd:eInd])
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        idx = 1
        while idx < len(s):
            if s[idx:idx+2] == '×':
                isX = True
                break
            if s[idx:idx+2] == '÷':
                isX = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)

def process_split_string(s):
    outls = []
    workls = []

    inls = s.split()

    for i in inls:
        if i == '÷' or i == '×':
            outls.append(workls)
            workls = []
            continue

        ival = int(i,16)

        if unicode.is_surrogate(ival):
            return []

        workls.append(ival)

    if workls:
        outls.append(workls)

    return outls

def showfun(x):
    outstr = '("'
    for c in x[0]:
        outstr += "\\u{%x}" % c
    outstr += '",&['
    xfirst = True
    for xx in x[1:]:
        if not xfirst:
            outstr += '],&['
        xfirst = False
        sfirst = True
        for sp in xx:
            if not sfirst:
                outstr += ','
            sfirst = False
            outstr += '"'
            for c in sp:
                outstr += "\\u{%x}" % c
            outstr += '"'
    outstr += '])'
    return outstr

def create_grapheme_data():
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        extgraphs = []
        extwork = []

        extwork.extend(c[0])
        for n in range(0,len(i)):
            if i[n] in optsplits:
                extwork.extend(c[n+1])
            else:
                extgraphs.append(extwork)
                extwork = []
                extwork.extend(c[n+1])

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    stype = "&[(&str, &[&str])]"
    dtype = "&[(&str, &[&str], &[&str])]"
    with open("graph_tests.rs", "w") as rf:
        rf.write("    // official Unicode test data\n")
        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
        unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
        unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)

def create_words_data():
    d = load_test_data("auxiliary/WordBreakTest.txt")

    test = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        test.append((allchars, c))

    wtype = "&[(&str, &[&str])]"
    with open("word_tests.rs", "w") as rf:
        rf.write("    // official Unicode test data\n")
        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
        unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)

if __name__ == "__main__":
    create_grapheme_data()
    create_words_data()
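Editor's note: to make the output format concrete, consider a WordBreakTest.txt line of the form ÷ 0061 × 0308 ÷ 002C ÷ (illustrative, not copied from the data file: "a" plus a combining diaeresis, then a comma). load_test_data parses it into the character groups [[0x61, 0x308], [0x2c]], and showfun renders the entry as a (full string, expected segments) pair of the kind emit_table writes into word_tests.rs. A hypothetical one-entry table of that shape, with a small consistency check, looks like this:

// Hypothetical single-entry stand-in for the generated test_word table; the real
// table uses the &[(&str, &[&str])] type declared as wtype in the script above.
const TEST_WORD: &[(&str, &[&str])] = &[
    ("\u{61}\u{308}\u{2c}", &["\u{61}\u{308}", "\u{2c}"]),
];

fn main() {
    let (full, segments) = TEST_WORD[0];
    // The concatenated segments always rebuild the full test string.
    assert_eq!(segments.concat(), full);
    println!("{} -> {:?}", full, segments);
}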
src/libcollections/str.rs (64 changes: 64 additions & 0 deletions)
@@ -79,6 +79,7 @@ pub use core::str::{MatchIndices, RMatchIndices};
 pub use core::str::{from_utf8, Chars, CharIndices, Bytes};
 pub use core::str::{from_utf8_unchecked, ParseBoolError};
 pub use unicode::str::{Words, Graphemes, GraphemeIndices};
+pub use unicode::str::{UnicodeWords, UWordBounds, UWordBoundIndices};
 pub use core::str::pattern;

 /*
@@ -1736,6 +1737,30 @@ impl str {
         UnicodeStr::words(&self[..])
     }

+    /// An iterator over the words of `self`, separated on
+    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
+    ///
+    /// In this function, "words" are just those substrings which, after splitting on
+    /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
+    /// substring must contain at least one character with the
+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+    /// property, or with
+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
+    /// let uw1 = uws.words_unicode().collect::<Vec<&str>>();
+    /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
+    ///
+    /// assert_eq!(&uw1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "questions remain regarding the naming of words() and words_unicode()")]
+    pub fn words_unicode(&self) -> UnicodeWords {
+        UnicodeStr::words_unicode(&self[..])
+    }
+
     /// Returns a string's displayed width in columns.
     ///
     /// Control characters have zero width.
@@ -1819,4 +1844,43 @@ impl str {
         s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
         return s;
     }
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
+    ///
+    /// The concatenation of the substrings returned by this function is just the original string.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let swu1 = "The quick (\"brown\")  fox".split_words_uax29().collect::<Vec<&str>>();
+    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+    ///
+    /// assert_eq!(&swu1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "this functionality may only be provided by libunicode")]
+    pub fn split_words_uax29(&self) -> UWordBounds {
+        UnicodeStr::split_words_uax29(&self[..])
+    }
+
+    /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
+    /// and their offsets. See `split_words_uax29()` for more information.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let swi1 = "Brr, it's 29.3°F!".split_words_uax29_indices().collect::<Vec<(usize, &str)>>();
+    /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
+    ///                 (14, "°"), (16, "F"), (17, "!")];
+    ///
+    /// assert_eq!(&swi1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "this functionality may only be provided by libunicode")]
+    pub fn split_words_uax29_indices(&self) -> UWordBoundIndices {
+        UnicodeStr::split_words_uax29_indices(&self[..])
+    }
 }
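Editor's note: the relationship between the two new iterators, as described in the doc comments above, is that `words_unicode()` is `split_words_uax29()` with the non-word pieces filtered out. A rough stable-Rust approximation of that filter (using `char::is_alphanumeric` as a stand-in for the Alphabetic/Number property test, and hard-coding the segments reported by the `split_words_uax29_indices` example) looks like this:

fn main() {
    // Segments as the split_words_uax29_indices() doc example above reports them
    // for "Brr, it's 29.3°F!" (taken from that example, not recomputed here).
    let pieces = ["Brr", ",", " ", "it's", " ", "29.3", "°", "F", "!"];

    // split_words_uax29() guarantees the pieces concatenate back to the original.
    assert_eq!(pieces.concat(), "Brr, it's 29.3°F!");

    // words_unicode() keeps only the pieces containing at least one alphanumeric
    // character; is_alphanumeric() approximates the Alphabetic /
    // General_Category=Number test the docs describe.
    let words: Vec<&str> = pieces
        .iter()
        .copied()
        .filter(|piece| piece.chars().any(|c| c.is_alphanumeric()))
        .collect();

    assert_eq!(words, ["Brr", "it's", "29.3", "F"]);
    println!("{:?}", words);
}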