Skip to content

Commit 3b208d2

Browse files
committed
Reduce the size of static data in std_unicode::tables.
`BoolTrie` works well for sets of code points spread out through most of Unicode’s range, but it uses a lot of space for sets with few, mostly low, code points. This switches a few of its instances to a similar but simpler trie data structure.

## Before

`size_of::<BoolTrie>()` is 1552, which is added to `t.r3.len() * 8 + t.r5.len() + t.r6.len() * 8`:

* `Cc_table`: 1632
* `White_Space_table`: 1656
* `Pattern_White_Space_table`: 1640
* Total: 4928 bytes

## After

`size_of::<SmallBoolTrie>()` is 32, which is added to `t.r1.len() + t.r2.len() * 8`:

* `Cc_table`: 51
* `White_Space_table`: 273
* `Pattern_White_Space_table`: 193
* Total: 517 bytes

## Difference

Every Rust program with `std` statically linked should be about 4 KB smaller.
1 parent 90c7c05 commit 3b208d2

File tree

3 files changed

+110
-220
lines changed

3 files changed

+110
-220
lines changed

.gitignore

+7-6
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,13 @@ __pycache__/
7373
/obj/
7474
/rt/
7575
/rustllvm/
76-
/src/libunicode/DerivedCoreProperties.txt
77-
/src/libunicode/EastAsianWidth.txt
78-
/src/libunicode/HangulSyllableType.txt
79-
/src/libunicode/PropList.txt
80-
/src/libunicode/Scripts.txt
81-
/src/libunicode/UnicodeData.txt
76+
/src/libstd_unicode/DerivedCoreProperties.txt
77+
/src/libstd_unicode/DerivedNormalizationProps.txt
78+
/src/libstd_unicode/PropList.txt
79+
/src/libstd_unicode/ReadMe.txt
80+
/src/libstd_unicode/Scripts.txt
81+
/src/libstd_unicode/SpecialCasing.txt
82+
/src/libstd_unicode/UnicodeData.txt
8283
/stage[0-9]+/
8384
/target
8485
/test/

src/etc/unicode.py

+58-6
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
# Since this should not require frequent updates, we just store this
2424
# out-of-line and check the unicode.rs file into git.
2525

26-
import fileinput, re, os, sys, operator
26+
import fileinput, re, os, sys, operator, math
2727

2828
preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
2929
// file at the top-level directory of this distribution and at
@@ -359,7 +359,23 @@ def emit_trie_lookup_range_table(f):
359359
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
360360
trie_range_leaf(c, r.r6[leaf as usize])
361361
}
362-
}\n
362+
}
363+
364+
pub struct SmallBoolTrie {
365+
r1: &'static [u8], // first level
366+
r2: &'static [u64], // leaves
367+
}
368+
369+
impl SmallBoolTrie {
370+
fn lookup(&self, c: char) -> bool {
371+
let c = c as usize;
372+
match self.r1.get(c >> 6) {
373+
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
374+
None => false,
375+
}
376+
}
377+
}
378+
363379
""")
364380

365381
def compute_trie(rawdata, chunksize):
@@ -429,13 +445,49 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
429445

430446
f.write(" };\n\n")
431447

448+
def emit_small_bool_trie(f, name, t_data, is_pub=True):
449+
last_chunk = max(int(hi / 64) for (lo, hi) in t_data)
450+
n_chunks = last_chunk + 1
451+
chunks = [0] * n_chunks
452+
for (lo, hi) in t_data:
453+
for cp in range(lo, hi + 1):
454+
if int(cp / 64) >= len(chunks):
455+
print(cp, int(cp / 64), len(chunks), lo, hi)
456+
chunks[int(cp / 64)] |= 1 << (cp & 63)
457+
458+
pub_string = ""
459+
if is_pub:
460+
pub_string = "pub "
461+
f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
462+
% (pub_string, name))
463+
464+
(r1, r2) = compute_trie(chunks, 1)
465+
466+
f.write(" r1: &[\n")
467+
data = ','.join(str(node) for node in r1)
468+
format_table_content(f, data, 12)
469+
f.write("\n ],\n")
470+
471+
f.write(" r2: &[\n")
472+
data = ','.join('0x%016x' % node for node in r2)
473+
format_table_content(f, data, 12)
474+
f.write("\n ],\n")
475+
476+
f.write(" };\n\n")
477+
432478
def emit_property_module(f, mod, tbl, emit):
433479
f.write("pub mod %s {\n" % mod)
434480
for cat in sorted(emit):
435-
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
436-
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
437-
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
438-
f.write(" }\n\n")
481+
if cat in ["Cc", "White_Space", "Pattern_White_Space"]:
482+
emit_small_bool_trie(f, "%s_table" % cat, tbl[cat])
483+
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
484+
f.write(" %s_table.lookup(c)\n" % cat)
485+
f.write(" }\n\n")
486+
else:
487+
emit_bool_trie(f, "%s_table" % cat, tbl[cat])
488+
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
489+
f.write(" super::trie_lookup_range_table(c, %s_table)\n" % cat)
490+
f.write(" }\n\n")
439491
f.write("}\n\n")
440492

441493
def emit_conversions_module(f, to_upper, to_lower, to_title):

0 commit comments

Comments
 (0)