Skip to content

Commit ac13f0d

Browse files
committed
Add support to libcore for encoded-in-rust unicode character properties, at least. Add script to compute them from unicode.org.
1 parent 88d7499 commit ac13f0d

File tree

4 files changed

+4919
-82
lines changed

4 files changed

+4919
-82
lines changed

src/etc/unicode.py

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python
2+
3+
# This digests UnicodeData.txt and DerivedCoreProperties.txt and emits rust
4+
# code covering the core properties. Since this is a pretty rare event we
5+
# just store this out-of-line and check the unicode.rs file into git.
6+
#
7+
# The emitted code is "the minimum we think is necessary for libcore", that
8+
# is, to support basic operations of the compiler and "most nontrivial rust
9+
# programs". It is not meant to be a complete implementation of unicode.
10+
# For that we recommend you use a proper binding to libicu.
11+
12+
import fileinput, re, os, sys
13+
14+
15+
def fetch(f):
    """Make sure the Unicode data file `f` exists in the current directory.

    If `f` is missing it is downloaded from unicode.org with curl.  If it
    is still missing afterwards, an error is written to stderr and the
    process exits with status 1.
    """
    if not os.path.exists(f):
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(f):
        sys.stderr.write("cannot load %s\n" % f)
        # sys.exit is always available; the bare `exit` builtin is only
        # injected by the `site` module.
        sys.exit(1)
23+
24+
25+
def load_general_categories(f):
    """Parse UnicodeData.txt into per-category code point ranges.

    Returns a dict mapping each general category code (field 3 of the
    file, e.g. "Lu") to a list of inclusive (lo, hi) code point tuples,
    in file order.  Lines that do not have exactly 15 ';'-separated
    fields are skipped.
    """
    fetch(f)
    gencats = {}
    curr_cat = None
    c_lo = 0
    c_hi = 0
    for line in fileinput.input(f):
        fields = line.split(";")
        if len(fields) != 15:
            continue
        code = int(fields[0], 16)
        name = fields[1]
        gencat = fields[2]

        if curr_cat is not None:
            # A "<..., Last>" entry closes a block opened by the matching
            # "<..., First>" entry; everything in between belongs to it.
            closes_block = name.endswith(", Last>")
            # Only extend the run across adjacent code points: code points
            # missing from the file are unassigned and must not be folded
            # into the neighbouring category.
            if gencat == curr_cat and (code == c_hi + 1 or closes_block):
                c_hi = code
                continue
            gencats.setdefault(curr_cat, []).append((c_lo, c_hi))

        curr_cat = gencat
        c_lo = code
        c_hi = code

    # Flush the run that was still open when the input ended.
    if curr_cat is not None:
        gencats.setdefault(curr_cat, []).append((c_lo, c_hi))
    return gencats
57+
58+
59+
def load_derived_core_properties(f):
    """Parse DerivedCoreProperties.txt for the properties we care about.

    Returns a dict mapping each of "XID_Start", "XID_Continue" and
    "Alphabetic" (when present in the file) to a list of inclusive
    (lo, hi) code point tuples, in file order.  Lines that match neither
    the single code point form nor the "lo..hi" range form are ignored.
    """
    fetch(f)
    wanted = ["XID_Start", "XID_Continue", "Alphabetic"]
    single_re = re.compile("^([0-9A-F]+) +; (\w+)")
    range_re = re.compile("^([0-9A-F]+)\.\.([0-9A-F]+) +; (\w+)")
    found = {}

    for line in fileinput.input(f):
        single = single_re.match(line)
        if single:
            # A lone code point is recorded as a one-element range.
            first = last = single.group(1)
            prop = single.group(2)
        else:
            span = range_re.match(line)
            if not span:
                continue
            first, last, prop = span.group(1), span.group(2), span.group(3)

        if prop not in wanted:
            continue
        found.setdefault(prop, []).append((int(first, 16), int(last, 16)))
    return found
91+
92+
def escape_char(c):
    """Format code point `c` as a quoted Rust character literal.

    Picks the shortest escape form that can hold the value: two hex
    digits (\\x), four (\\u) or eight (\\U).
    """
    if c > 0xffff:
        return "'\\U%8.8x'" % c
    if c > 0xff:
        return "'\\u%4.4x'" % c
    return "'\\x%2.2x'" % c
98+
99+
def emit_rust_module(f, mod, tbl):
    """Write a Rust module `mod` with one predicate per key of `tbl`.

    `f` is a writable file object.  `tbl` maps a property/category name
    to a list of inclusive (lo, hi) code point tuples.  Each name becomes
    a `pure fn name(c: char) -> bool` whose body is a single `alt` over
    the listed characters and ranges.  Names are emitted in sorted order.
    """
    f.write("mod %s {\n" % mod)
    # sorted() works on both Python 2 lists and Python 3 key views, unlike
    # calling .sort() on the result of keys().
    for cat in sorted(tbl.keys()):
        f.write(" pure fn %s(c: char) -> bool {\n" % cat)
        f.write(" ret alt c {\n")
        # The first pattern has no separator; every later one is joined
        # to the previous with '|'.
        prefix = ' '
        for pair in tbl[cat]:
            if pair[0] == pair[1]:
                f.write(" %c %s\n" %
                        (prefix, escape_char(pair[0])))
            else:
                f.write(" %c %s to %s\n" %
                        (prefix,
                         escape_char(pair[0]),
                         escape_char(pair[1])))
            prefix = '|'
        f.write(" { true }\n")
        f.write(" _ { false }\n")
        f.write(" };\n")
        f.write(" }\n\n")
    f.write("}\n")
122+
123+
124+
def emit_cpp_module(f, mod, tbl):
    """Write one C++ predicate `bool <mod>_<name>(unsigned c)` per key of `tbl`.

    `f` is a writable file object.  `tbl` maps a property/category name
    to a list of inclusive (lo, hi) code point tuples.  Multi-code-point
    ranges become `if` range checks; single code points are collected
    into one `switch`.  Names are emitted in sorted order.
    """
    # sorted() works on both Python 2 lists and Python 3 key views, unlike
    # calling .sort() on the result of keys().
    for cat in sorted(tbl.keys()):

        singles = []
        ranges = []

        for pair in tbl[cat]:
            if pair[0] == pair[1]:
                singles.append(pair[0])
            else:
                ranges.append(pair)

        f.write("bool %s_%s(unsigned c) {\n" % (mod, cat))
        for pair in ranges:
            f.write(" if (0x%x <= c && c <= 0x%x) { return true; }\n"
                    % pair)
        if len(singles) > 0:
            f.write(" switch (c) {\n")
            # All case labels fall through to a single shared return.
            for single in singles:
                f.write(" case 0x%x:\n" % single)
            f.write(" return true;\n")
            f.write(" default:\n")
            f.write(" return false;\n")
            f.write(" }\n")
        f.write("return false;\n")
        f.write("}\n\n")
153+
154+
155+
def emit_module(rf, cf, mod, tbl):
    """Emit table `tbl` as module `mod` in both output languages.

    The Rust form is written to `rf` first, then the C++ form to `cf`.
    """
    for emit, out in ((emit_rust_module, rf), (emit_cpp_module, cf)):
        emit(out, mod, tbl)
158+
159+
# Regenerate both output files from scratch in the current directory.
r = "unicode.rs"
c = "unicode.cpp"
for i in [r, c]:
    if os.path.exists(i):
        os.remove(i)

# The with-blocks make sure both outputs are flushed and closed, even if a
# loader or emitter raises part-way through.
with open(r, "w") as rf:
    with open(c, "w") as cf:
        emit_module(rf, cf, "general_category",
                    load_general_categories("UnicodeData.txt"))

        emit_module(rf, cf, "derived_property",
                    load_derived_core_properties("DerivedCoreProperties.txt"))

src/libcore/char.rs

+61-82
Original file line numberDiff line numberDiff line change
@@ -4,96 +4,75 @@ Module: char
44
Utilities for manipulating the char type
55
*/
66

7+
/*
8+
Lu Uppercase_Letter an uppercase letter
9+
Ll Lowercase_Letter a lowercase letter
10+
Lt Titlecase_Letter a digraphic character, with first part uppercase
11+
Lm Modifier_Letter a modifier letter
12+
Lo Other_Letter other letters, including syllables and ideographs
13+
Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
14+
Mc Spacing_Mark a spacing combining mark (positive advance width)
15+
Me Enclosing_Mark an enclosing combining mark
16+
Nd Decimal_Number a decimal digit
17+
Nl Letter_Number a letterlike numeric character
18+
No Other_Number a numeric character of other type
19+
Pc Connector_Punctuation a connecting punctuation mark, like a tie
20+
Pd Dash_Punctuation a dash or hyphen punctuation mark
21+
Ps Open_Punctuation an opening punctuation mark (of a pair)
22+
Pe Close_Punctuation a closing punctuation mark (of a pair)
23+
Pi Initial_Punctuation an initial quotation mark
24+
Pf Final_Punctuation a final quotation mark
25+
Po Other_Punctuation a punctuation mark of other type
26+
Sm Math_Symbol a symbol of primarily mathematical use
27+
Sc Currency_Symbol a currency sign
28+
Sk Modifier_Symbol a non-letterlike modifier symbol
29+
So Other_Symbol a symbol of other type
30+
Zs Space_Separator a space character (of various non-zero widths)
31+
Zl Line_Separator U+2028 LINE SEPARATOR only
32+
Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
33+
Cc Control a C0 or C1 control code
34+
Cf Format a format control character
35+
Cs Surrogate a surrogate code point
36+
Co Private_Use a private-use character
37+
Cn Unassigned a reserved unassigned code point or a noncharacter
38+
*/
39+
40+
import is_alphabetic = unicode::derived_property::Alphabetic;
41+
import is_XID_start = unicode::derived_property::XID_Start;
42+
import is_XID_continue = unicode::derived_property::XID_Continue;
43+
744
/*
845
Function: is_whitespace
946
10-
Indicates whether a character is whitespace.
47+
Indicates whether a character is whitespace, defined in terms of
48+
the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
49+
'Cc'-category control codes in the range [0x09, 0x0d].
1150
12-
Whitespace characters include space (U+0020), tab (U+0009), line feed
13-
(U+000A), carriage return (U+000D), and a number of less common
14-
ASCII and unicode characters.
1551
*/
1652
pure fn is_whitespace(c: char) -> bool {
17-
const ch_space: char = '\u0020';
18-
const ch_ogham_space_mark: char = '\u1680';
19-
const ch_mongolian_vowel_sep: char = '\u180e';
20-
const ch_en_quad: char = '\u2000';
21-
const ch_em_quad: char = '\u2001';
22-
const ch_en_space: char = '\u2002';
23-
const ch_em_space: char = '\u2003';
24-
const ch_three_per_em_space: char = '\u2004';
25-
const ch_four_per_em_space: char = '\u2005';
26-
const ch_six_per_em_space: char = '\u2006';
27-
const ch_figure_space: char = '\u2007';
28-
const ch_punctuation_space: char = '\u2008';
29-
const ch_thin_space: char = '\u2009';
30-
const ch_hair_space: char = '\u200a';
31-
const ch_narrow_no_break_space: char = '\u202f';
32-
const ch_medium_mathematical_space: char = '\u205f';
33-
const ch_ideographic_space: char = '\u3000';
34-
const ch_line_separator: char = '\u2028';
35-
const ch_paragraph_separator: char = '\u2029';
36-
const ch_character_tabulation: char = '\u0009';
37-
const ch_line_feed: char = '\u000a';
38-
const ch_line_tabulation: char = '\u000b';
39-
const ch_form_feed: char = '\u000c';
40-
const ch_carriage_return: char = '\u000d';
41-
const ch_next_line: char = '\u0085';
42-
const ch_no_break_space: char = '\u00a0';
43-
44-
if c == ch_space {
45-
true
46-
} else if c == ch_ogham_space_mark {
47-
true
48-
} else if c == ch_mongolian_vowel_sep {
49-
true
50-
} else if c == ch_en_quad {
51-
true
52-
} else if c == ch_em_quad {
53-
true
54-
} else if c == ch_en_space {
55-
true
56-
} else if c == ch_em_space {
57-
true
58-
} else if c == ch_three_per_em_space {
59-
true
60-
} else if c == ch_four_per_em_space {
61-
true
62-
} else if c == ch_six_per_em_space {
63-
true
64-
} else if c == ch_figure_space {
65-
true
66-
} else if c == ch_punctuation_space {
67-
true
68-
} else if c == ch_thin_space {
69-
true
70-
} else if c == ch_hair_space {
71-
true
72-
} else if c == ch_narrow_no_break_space {
73-
true
74-
} else if c == ch_medium_mathematical_space {
75-
true
76-
} else if c == ch_ideographic_space {
77-
true
78-
} else if c == ch_line_tabulation {
79-
true
80-
} else if c == ch_paragraph_separator {
81-
true
82-
} else if c == ch_character_tabulation {
83-
true
84-
} else if c == ch_line_feed {
85-
true
86-
} else if c == ch_line_tabulation {
87-
true
88-
} else if c == ch_form_feed {
89-
true
90-
} else if c == ch_carriage_return {
91-
true
92-
} else if c == ch_next_line {
93-
true
94-
} else if c == ch_no_break_space { true } else { false }
53+
ret ('\x09' <= c && c <= '\x0x0d')
54+
|| unicode::general_category::Zs(c)
55+
|| unicode::general_category::Zl(c)
56+
|| unicode::general_category::Zp(c);
57+
}
58+
59+
/*
60+
Function: is_alphanumeric
61+
62+
Indicates whether a character is alphanumeric, defined in terms of
63+
the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
64+
Core Property 'Alphabetic'.
65+
66+
*/
67+
68+
pure fn is_alphanumeric(c: char) -> bool {
    // True as soon as any one predicate from the unicode module accepts
    // `c`: the 'Alphabetic' derived property first, then the numeric
    // general categories Nd, Nl and No.
    ret unicode::derived_property::Alphabetic(c) ||
        unicode::general_category::Nd(c) ||
        unicode::general_category::Nl(c) ||
        unicode::general_category::No(c);
}
9674

75+
9776
/*
9877
Function: to_digit
9978

src/libcore/core.rc

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ mod u64;
3030
mod vec;
3131
mod bool;
3232

33+
// For internal use by char, not exported
34+
mod unicode;
35+
3336

3437
// Ubiquitous-utility-type modules
3538

0 commit comments

Comments
 (0)