add UAX#29 word bounds algorithm to libunicode #24340

Closed · wants to merge 1 commit
src/etc/unicode.py (57 changes: 36 additions & 21 deletions)
@@ -15,6 +15,7 @@
 # - DerivedNormalizationProps.txt
 # - EastAsianWidth.txt
 # - auxiliary/GraphemeBreakProperty.txt
+# - auxiliary/WordBreakProperty.txt
 # - PropList.txt
 # - ReadMe.txt
 # - Scripts.txt
@@ -290,11 +291,13 @@ def emit_bsearch_range_table(f):
 """)

 def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
-        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1]))):
-    pub_string = ""
+        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
+    pub_string = "const"
+    if not is_const:
+        pub_string = "let"
     if is_pub:
-        pub_string = "pub "
-    f.write("    %sconst %s: %s = &[\n" % (pub_string, name, t_type))
+        pub_string = "pub " + pub_string
+    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type))
     data = ""
     first = True
     for dat in t_data:
@@ -375,21 +378,25 @@ def emit_conversions_module(f, lowerupper, upperlower):
         sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
     f.write("}\n\n")

-def emit_grapheme_module(f, grapheme_table, grapheme_cats):
-    f.write("""pub mod grapheme {
+def emit_break_module(f, break_table, break_cats, name):
+    Name = name.capitalize()
+    f.write("""pub mod %s {
     use core::slice::SliceExt;
-    pub use self::GraphemeCat::*;
+    pub use self::%sCat::*;
     use core::result::Result::{Ok, Err};

     #[allow(non_camel_case_types)]
-    #[derive(Clone, Copy)]
-    pub enum GraphemeCat {
-""")
-    for cat in grapheme_cats + ["Any"]:
-        f.write("        GC_" + cat + ",\n")
+    #[derive(Clone, Copy, PartialEq, Eq)]
+    pub enum %sCat {
+""" % (name, Name, Name))
+
+    break_cats.append("Any")
+    break_cats.sort()
+    for cat in break_cats:
+        f.write(("        %sC_" % Name[0]) + cat + ",\n")
     f.write("""    }

-    fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
+    fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> %sCat {
         use core::cmp::Ordering::{Equal, Less, Greater};
         match r.binary_search_by(|&(lo, hi, _)| {
             if lo <= c && c <= hi { Equal }
@@ -400,19 +407,19 @@ def emit_grapheme_module(f, grapheme_table, grapheme_cats):
                 let (_, _, cat) = r[idx];
                 cat
             }
-            Err(_) => GC_Any
+            Err(_) => %sC_Any
         }
     }

-    pub fn grapheme_category(c: char) -> GraphemeCat {
-        bsearch_range_value_table(c, grapheme_cat_table)
+    pub fn %s_category(c: char) -> %sCat {
+        bsearch_range_value_table(c, %s_cat_table)
     }

-""")
+""" % (Name, Name, Name[0], name, Name, name))

-    emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
-        pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
-        is_pub=False)
+    emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
+        pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
+        is_pub=False, is_const=True)
     f.write("}\n")

 def emit_charwidth_module(f, width_table):
@@ -690,4 +697,12 @@ def optimize_width_table(wtable):
     for cat in grapheme_cats:
         grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
     grapheme_table.sort(key=lambda w: w[0])
-    emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())
+    emit_break_module(rf, grapheme_table, grapheme_cats.keys(), "grapheme")
+    rf.write("\n")
+
+    word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
+    word_table = []
+    for cat in word_cats:
+        word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
+    word_table.sort(key=lambda w: w[0])
+    emit_break_module(rf, word_table, word_cats.keys(), "word")
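Editor's note: for orientation, here is a minimal, self-contained sketch of the kind of module emit_break_module emits once per break property ("grapheme" and now "word"). It follows the templates in the diff above, but it is only an approximation of the generated libunicode tables: it uses std rather than core, modern upper-case const naming (the generator writes a lower-case word_cat_table), and two stand-in ASCII ranges instead of the full WordBreakProperty.txt data.

// Sketch only: the real generated table covers every Word_Break category.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum WordCat {
    WC_ALetter,
    WC_Numeric,
    WC_Any,
}
use WordCat::*;

// (start, end, category) ranges, sorted by start; end is inclusive.
const WORD_CAT_TABLE: &[(char, char, WordCat)] = &[
    ('0', '9', WC_Numeric),
    ('A', 'Z', WC_ALetter),
    ('a', 'z', WC_ALetter),
];

// Same lookup strategy as the generated bsearch_range_value_table: binary-search
// the sorted ranges and fall back to WC_Any when no range contains the char.
fn bsearch_range_value_table(c: char, r: &[(char, char, WordCat)]) -> WordCat {
    use std::cmp::Ordering::{Equal, Greater, Less};
    match r.binary_search_by(|&(lo, hi, _)| {
        if lo <= c && c <= hi { Equal } else if hi < c { Less } else { Greater }
    }) {
        Ok(idx) => r[idx].2,
        Err(_) => WC_Any,
    }
}

pub fn word_category(c: char) -> WordCat {
    bsearch_range_value_table(c, WORD_CAT_TABLE)
}

fn main() {
    assert_eq!(word_category('7'), WC_Numeric);
    assert_eq!(word_category('x'), WC_ALetter);
    assert_eq!(word_category('!'), WC_Any);
}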
src/etc/unicode_gen_breaktests.py (197 changes: 197 additions & 0 deletions)
@@ -0,0 +1,197 @@
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - GraphemeBreakTest.txt
# - WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.

import unicode, re, os, fileinput

def load_test_data(f, optsplit=[]):
    outls = []
    testRe1 = re.compile("^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")

    unicode.fetch(f)
    data = []
    for line in fileinput.input(os.path.basename(f)):
        # lines that include a test start with the ÷ character
        if len(line) < 2 or line[0:2] != '÷':
            continue

        m = testRe1.match(line)
        if not m:
            print "error: no match on line where test was expected: %s" % line
            continue

        # process the characters in this test case
        chars = process_split_string(m.group(1))
        # skip test case if it contains invalid characters (viz., surrogates)
        if not chars:
            continue

        # now process test cases
        (chars, info) = process_split_info(m.group(2), chars, optsplit)

        # make sure that we have break info for each break!
        assert len(chars) - 1 == len(info)

        outls.append((chars, info))

    return outls

def process_split_info(s, c, o):
    outcs = []
    outis = []
    workcs = c.pop(0)

    # are we on a × or a ÷?
    isX = False
    if s[0:2] == '×':
        isX = True

    # find each instance of '(÷|×) [x.y] '
    while s:
        # find the currently considered rule number
        sInd = s.index('[') + 1
        eInd = s.index(']')

        # if it's '× [a.b]' where 'a.b' is in o, then
        # we consider it a split even though it's not
        # marked as one
        # if it's ÷ then it's always a split
        if not isX or s[sInd:eInd] in o:
            outis.append(s[sInd:eInd])
            outcs.append(workcs)
            workcs = c.pop(0)
        else:
            workcs.extend(c.pop(0))

        idx = 1
        while idx < len(s):
            if s[idx:idx+2] == '×':
                isX = True
                break
            if s[idx:idx+2] == '÷':
                isX = False
                break
            idx += 1
        s = s[idx:]

    outcs.append(workcs)
    return (outcs, outis)

def process_split_string(s):
    outls = []
    workls = []

    inls = s.split()

    for i in inls:
        if i == '÷' or i == '×':
            outls.append(workls)
            workls = []
            continue

        ival = int(i,16)

        if unicode.is_surrogate(ival):
            return []

        workls.append(ival)

    if workls:
        outls.append(workls)

    return outls

def showfun(x):
    outstr = '("'
    for c in x[0]:
        outstr += "\\u{%x}" % c
    outstr += '",&['
    xfirst = True
    for xx in x[1:]:
        if not xfirst:
            outstr += '],&['
        xfirst = False
        sfirst = True
        for sp in xx:
            if not sfirst:
                outstr += ','
            sfirst = False
            outstr += '"'
            for c in sp:
                outstr += "\\u{%x}" % c
            outstr += '"'
    outstr += '])'
    return outstr

def create_grapheme_data():
    # rules 9.1 and 9.2 are for extended graphemes only
    optsplits = ['9.1','9.2']
    d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)

    test_same = []
    test_diff = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        extgraphs = []
        extwork = []

        extwork.extend(c[0])
        for n in range(0,len(i)):
            if i[n] in optsplits:
                extwork.extend(c[n+1])
            else:
                extgraphs.append(extwork)
                extwork = []
                extwork.extend(c[n+1])

        # these are the extended grapheme clusters
        extgraphs.append(extwork)

        if extgraphs == c:
            test_same.append((allchars, c))
        else:
            test_diff.append((allchars, extgraphs, c))

    stype = "&[(&str, &[&str])]"
    dtype = "&[(&str, &[&str], &[&str])]"
    with open("graph_tests.rs", "w") as rf:
        rf.write("    // official Unicode test data\n")
        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakTest.txt\n")
        unicode.emit_table(rf, "test_same", test_same, stype, False, showfun, False)
        unicode.emit_table(rf, "test_diff", test_diff, dtype, False, showfun, False)

def create_words_data():
    d = load_test_data("auxiliary/WordBreakTest.txt")

    test = []

    for (c, i) in d:
        allchars = [cn for s in c for cn in s]
        test.append((allchars, c))

    wtype = "&[(&str, &[&str])]"
    with open("word_tests.rs", "w") as rf:
        rf.write("    // official Unicode test data\n")
        rf.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
        unicode.emit_table(rf, "test_word", test, wtype, False, showfun, False)

if __name__ == "__main__":
    create_grapheme_data()
    create_words_data()
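Editor's note: to make the output format concrete, consider a WordBreakTest.txt line of the form ÷ 0061 × 0308 ÷ 002C ÷ (illustrative, not copied from the data file: "a" plus a combining diaeresis, then a comma). load_test_data parses it into the character groups [[0x61, 0x308], [0x2c]], and showfun renders the entry as a (full string, expected segments) pair of the kind emit_table writes into word_tests.rs. A hypothetical one-entry table of that shape, with a small consistency check, looks like this:

// Hypothetical single-entry stand-in for the generated test_word table; the real
// table uses the &[(&str, &[&str])] type declared as wtype in the script above.
const TEST_WORD: &[(&str, &[&str])] = &[
    ("\u{61}\u{308}\u{2c}", &["\u{61}\u{308}", "\u{2c}"]),
];

fn main() {
    let (full, segments) = TEST_WORD[0];
    // The concatenated segments always rebuild the full test string.
    assert_eq!(segments.concat(), full);
    println!("{} -> {:?}", full, segments);
}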
src/libcollections/str.rs (64 changes: 64 additions & 0 deletions)
@@ -79,6 +79,7 @@ pub use core::str::{MatchIndices, RMatchIndices};
 pub use core::str::{from_utf8, Chars, CharIndices, Bytes};
 pub use core::str::{from_utf8_unchecked, ParseBoolError};
 pub use unicode::str::{Words, Graphemes, GraphemeIndices};
+pub use unicode::str::{UnicodeWords, UWordBounds, UWordBoundIndices};
 pub use core::str::pattern;

 /*
@@ -1736,6 +1737,30 @@ impl str {
         UnicodeStr::words(&self[..])
     }

+    /// An iterator over the words of `self`, separated on
+    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
+    ///
+    /// In this function, "words" are just those substrings which, after splitting on
+    /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
+    /// substring must contain at least one character with the
+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+    /// property, or with
+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
+    /// let uw1 = uws.words_unicode().collect::<Vec<&str>>();
+    /// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
+    ///
+    /// assert_eq!(&uw1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "questions remain regarding the naming of words() and words_unicode()")]
+    pub fn words_unicode(&self) -> UnicodeWords {
+        UnicodeStr::words_unicode(&self[..])
+    }
+
     /// Returns a string's displayed width in columns.
     ///
     /// Control characters have zero width.
@@ -1819,4 +1844,43 @@ impl str {
         s.extend(self[..].chars().flat_map(|c| c.to_uppercase()));
         return s;
     }
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
+    ///
+    /// The concatenation of the substrings returned by this function is just the original string.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let swu1 = "The quick (\"brown\")  fox".split_words_uax29().collect::<Vec<&str>>();
+    /// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
+    ///
+    /// assert_eq!(&swu1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "this functionality may only be provided by libunicode")]
+    pub fn split_words_uax29(&self) -> UWordBounds {
+        UnicodeStr::split_words_uax29(&self[..])
+    }
+
+    /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
+    /// and their offsets. See `split_words_uax29()` for more information.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # #![feature(unicode, core)]
+    /// let swi1 = "Brr, it's 29.3°F!".split_words_uax29_indices().collect::<Vec<(usize, &str)>>();
+    /// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
+    ///                 (14, "°"), (16, "F"), (17, "!")];
+    ///
+    /// assert_eq!(&swi1[..], b);
+    /// ```
+    #[unstable(feature = "unicode",
+               reason = "this functionality may only be provided by libunicode")]
+    pub fn split_words_uax29_indices(&self) -> UWordBoundIndices {
+        UnicodeStr::split_words_uax29_indices(&self[..])
+    }
 }
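Editor's note: the relationship between the two new iterators, as described in the doc comments above, is that `words_unicode()` is `split_words_uax29()` with the non-word pieces filtered out. A rough stable-Rust approximation of that filter (using `char::is_alphanumeric` as a stand-in for the Alphabetic/Number property test, and hard-coding the segments reported by the `split_words_uax29_indices` example) looks like this:

fn main() {
    // Segments as the split_words_uax29_indices() doc example above reports them
    // for "Brr, it's 29.3°F!" (taken from that example, not recomputed here).
    let pieces = ["Brr", ",", " ", "it's", " ", "29.3", "°", "F", "!"];

    // split_words_uax29() guarantees the pieces concatenate back to the original.
    assert_eq!(pieces.concat(), "Brr, it's 29.3°F!");

    // words_unicode() keeps only the pieces containing at least one alphanumeric
    // character; is_alphanumeric() approximates the Alphabetic /
    // General_Category=Number test the docs describe.
    let words: Vec<&str> = pieces
        .iter()
        .copied()
        .filter(|piece| piece.chars().any(|c| c.is_alphanumeric()))
        .collect();

    assert_eq!(words, ["Brr", "it's", "29.3", "F"]);
    println!("{:?}", words);
}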