Skip to content

Commit 0896826

Browse files
committed
Auto merge of #38781 - SimonSapin:unishrink, r=alexcrichton
Reduce the size of static data in std_unicode::tables `BoolTrie` works well for sets of code points spread out through most of Unicode’s range, but it uses a lot of space for sets with few, mostly low, code points. This switches a few of its instances to a similar but simpler trie data structure. CC @raphlinus, who wrote the original `BoolTrie`. ## Before `size_of::<BoolTrie>()` is 1552, which is added to `t.r3.len() * 8 + t.r5.len() + t.r6.len() * 8`: * `Cc_table`: 1632 * `White_Space_table`: 1656 * `Pattern_White_Space_table`: 1640 * Total: 4928 bytes ## After `size_of::<SmallBoolTrie>()` is 32, which is added to `t.r1.len() + t.r2.len() * 8`: * `Cc_table`: 51 * `White_Space_table`: 273 * `Pattern_White_Space_table`: 193 * Total: 517 bytes ## Difference Every Rust program with `std` statically linked should be about 4 KB smaller.
2 parents b97b605 + 3b208d2 commit 0896826

File tree

3 files changed

+110
-227
lines changed

3 files changed

+110
-227
lines changed

.gitignore

+7-6
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,13 @@ __pycache__/
7373
/obj/
7474
/rt/
7575
/rustllvm/
76-
/src/libunicode/DerivedCoreProperties.txt
77-
/src/libunicode/EastAsianWidth.txt
78-
/src/libunicode/HangulSyllableType.txt
79-
/src/libunicode/PropList.txt
80-
/src/libunicode/Scripts.txt
81-
/src/libunicode/UnicodeData.txt
76+
/src/libstd_unicode/DerivedCoreProperties.txt
77+
/src/libstd_unicode/DerivedNormalizationProps.txt
78+
/src/libstd_unicode/PropList.txt
79+
/src/libstd_unicode/ReadMe.txt
80+
/src/libstd_unicode/Scripts.txt
81+
/src/libstd_unicode/SpecialCasing.txt
82+
/src/libstd_unicode/UnicodeData.txt
8283
/stage[0-9]+/
8384
/target
8485
/test/

src/etc/unicode.py

+58-13
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,7 @@
2323
# Since this should not require frequent updates, we just store this
2424
# out-of-line and check the unicode.rs file into git.
2525

26-
import fileinput, re, os, sys, operator
27-
28-
bytes_old = 0
29-
bytes_new = 0
26+
import fileinput, re, os, sys, operator, math
3027

3128
preamble = '''// Copyright 2012-2016 The Rust Project Developers. See the COPYRIGHT
3229
// file at the top-level directory of this distribution and at
@@ -362,7 +359,23 @@ def emit_trie_lookup_range_table(f):
362359
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
363360
trie_range_leaf(c, r.r6[leaf as usize])
364361
}
365-
}\n
362+
}
363+
364+
pub struct SmallBoolTrie {
365+
r1: &'static [u8], // first level
366+
r2: &'static [u64], // leaves
367+
}
368+
369+
impl SmallBoolTrie {
370+
fn lookup(&self, c: char) -> bool {
371+
let c = c as usize;
372+
match self.r1.get(c >> 6) {
373+
Some(&child) => trie_range_leaf(c, self.r2[child as usize]),
374+
None => false,
375+
}
376+
}
377+
}
378+
366379
""")
367380

368381
def compute_trie(rawdata, chunksize):
@@ -379,8 +392,6 @@ def compute_trie(rawdata, chunksize):
379392
return (root, child_data)
380393

381394
def emit_bool_trie(f, name, t_data, is_pub=True):
382-
global bytes_old, bytes_new
383-
bytes_old += 8 * len(t_data)
384395
CHUNK = 64
385396
rawdata = [False] * 0x110000
386397
for (lo, hi) in t_data:
@@ -433,15 +444,50 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
433444
f.write("\n ],\n")
434445

435446
f.write(" };\n\n")
436-
bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)
447+
448+
def emit_small_bool_trie(f, name, t_data, is_pub=True):
    """Write a `SmallBoolTrie` constant named `name` to `f`.

    `t_data` is an iterable of inclusive `(lo, hi)` code point ranges.
    The set is packed into 64-bit chunks (bit `cp & 63` of chunk
    `cp // 64`), covering only chunks up to the highest `hi`; the chunk
    list is then passed through `compute_trie` with a chunk size of 1 and
    the two resulting levels are written out as the `r1` and `r2` fields.

    When `is_pub` is true the constant is emitted with a `pub ` prefix.

    Raises ValueError (from `max`) if `t_data` is empty.
    """
    # Size the chunk list from the largest upper bound, so every code point
    # visited below is guaranteed to have a slot.
    last_chunk = max(hi // 64 for (lo, hi) in t_data)
    chunks = [0] * (last_chunk + 1)
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            # Floor division keeps the index an exact integer.
            chunks[cp // 64] |= 1 << (cp & 63)

    pub_string = "pub " if is_pub else ""
    f.write(" %sconst %s: &'static super::SmallBoolTrie = &super::SmallBoolTrie {\n"
            % (pub_string, name))

    (r1, r2) = compute_trie(chunks, 1)

    # First level: written as plain decimal integers.
    f.write(" r1: &[\n")
    data = ','.join(str(node) for node in r1)
    format_table_content(f, data, 12)
    f.write("\n ],\n")

    # Leaves: written as 16-digit hexadecimal literals.
    f.write(" r2: &[\n")
    data = ','.join('0x%016x' % node for node in r2)
    format_table_content(f, data, 12)
    f.write("\n ],\n")

    f.write(" };\n\n")
437477

438478
def emit_property_module(f, mod, tbl, emit):
    """Write a `pub mod <mod>` block with one table and lookup fn per category.

    For every category in `emit` (visited in sorted order) the table for
    `tbl[cat]` is emitted as `<cat>_table`, followed by a
    `pub fn <cat>(c: char) -> bool` wrapper that queries it. The categories
    "Cc", "White_Space" and "Pattern_White_Space" are emitted through
    `emit_small_bool_trie`; all others go through `emit_bool_trie`.
    """
    small_trie_cats = ("Cc", "White_Space", "Pattern_White_Space")

    f.write("pub mod %s {\n" % mod)
    for cat in sorted(emit):
        # Choose the table emitter and the matching lookup expression.
        if cat in small_trie_cats:
            emit_table = emit_small_bool_trie
            lookup_line = " %s_table.lookup(c)\n" % cat
        else:
            emit_table = emit_bool_trie
            lookup_line = " super::trie_lookup_range_table(c, %s_table)\n" % cat

        emit_table(f, "%s_table" % cat, tbl[cat])
        f.write(" pub fn %s(c: char) -> bool {\n" % cat)
        f.write(lookup_line)
        f.write(" }\n\n")
    f.write("}\n\n")
446492

447493
def emit_conversions_module(f, to_upper, to_lower, to_title):
@@ -543,4 +589,3 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
543589
# normalizations and conversions module
544590
emit_norm_module(rf, canon_decomp, compat_decomp, combines, norm_props)
545591
emit_conversions_module(rf, to_upper, to_lower, to_title)
546-
#print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)

0 commit comments

Comments
 (0)