25
25
26
26
import fileinput , re , os , sys , operator
27
27
28
+ bytes_old = 0
29
+ bytes_new = 0
30
+
28
31
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
29
32
// file at the top-level directory of this distribution and at
30
33
// http://rust-lang.org/COPYRIGHT.
@@ -309,16 +312,36 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
309
312
310
313
def emit_trie_lookup_range_table (f ):
311
314
f .write ("""
315
+
316
+ // BoolTrie is a trie for representing a set of Unicode codepoints. It is
317
+ // implemented with postfix compression (sharing of identical child nodes),
318
+ // which gives both compact size and fast lookup.
319
+ //
320
+ // The space of Unicode codepoints is divided into 3 subareas, each
321
+ // represented by a trie with different depth. In the first (0..0x800), there
322
+ // is no trie structure at all; each u64 entry corresponds to a bitvector
323
+ // effectively holding 64 bool values.
324
+ //
325
+ // In the second (0x800..0x10000), each child of the root node represents a
326
+ // 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
327
+ // the trie stores an 8-bit index into a shared table of leaf values. This
328
+ // exploits the fact that in reasonable sets, many such leaves can be shared.
329
+ //
330
+ // In the third (0x10000..0x110000), each child of the root node represents a
331
+ // 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
332
+ // of a child tree. Each of these 64 bytes represents an index into the table
333
+ // of shared 64-bit leaf values. This exploits the sparse structure in the
334
+ // non-BMP range of most Unicode sets.
312
335
pub struct BoolTrie {
313
336
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
314
337
r1: [u64; 32], // leaves
315
338
316
339
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
317
- r2: [u8; 1024 ], // first level
340
+ r2: [u8; 992 ], // first level
318
341
r3: &'static [u64], // leaves
319
342
320
343
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
321
- r4: [u8; 272 ], // first level
344
+ r4: [u8; 256 ], // first level
322
345
r5: &'static [u8], // second level
323
346
r6: &'static [u64], // leaves
324
347
}
@@ -332,10 +355,10 @@ def emit_trie_lookup_range_table(f):
332
355
if c < 0x800 {
333
356
trie_range_leaf(c, r.r1[c >> 6])
334
357
} else if c < 0x10000 {
335
- let child = r.r2[c >> 6];
358
+ let child = r.r2[( c >> 6) - 0x20 ];
336
359
trie_range_leaf(c, r.r3[child as usize])
337
360
} else {
338
- let child = r.r4[c >> 12];
361
+ let child = r.r4[( c >> 12) - 0x10 ];
339
362
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
340
363
trie_range_leaf(c, r.r6[leaf as usize])
341
364
}
@@ -356,6 +379,8 @@ def compute_trie(rawdata, chunksize):
356
379
return (root , child_data )
357
380
358
381
def emit_bool_trie (f , name , t_data , is_pub = True ):
382
+ global bytes_old , bytes_new
383
+ bytes_old += 8 * len (t_data )
359
384
CHUNK = 64
360
385
rawdata = [False ] * 0x110000 ;
361
386
for (lo , hi ) in t_data :
@@ -383,7 +408,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
383
408
# 0x800..0x10000 trie
384
409
(r2 , r3 ) = compute_trie (chunks [0x800 / CHUNK : 0x10000 / CHUNK ], 64 / CHUNK )
385
410
f .write (" r2: [\n " )
386
- data = ',' .join (str (node ) for node in [ 255 ] * 32 + r2 )
411
+ data = ',' .join (str (node ) for node in r2 )
387
412
format_table_content (f , data , 12 )
388
413
f .write ("\n ],\n " )
389
414
f .write (" r3: &[\n " )
@@ -395,7 +420,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
395
420
(mid , r6 ) = compute_trie (chunks [0x10000 / CHUNK : 0x110000 / CHUNK ], 64 / CHUNK )
396
421
(r4 , r5 ) = compute_trie (mid , 64 )
397
422
f .write (" r4: [\n " )
398
- data = ',' .join (str (node ) for node in [ 255 ] * 16 + r4 )
423
+ data = ',' .join (str (node ) for node in r4 )
399
424
format_table_content (f , data , 12 )
400
425
f .write ("\n ],\n " )
401
426
f .write (" r5: &[\n " )
@@ -408,6 +433,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
408
433
f .write ("\n ],\n " )
409
434
410
435
f .write (" };\n \n " )
436
+ bytes_new += 256 + 992 + 256 + 8 * len (r3 ) + len (r5 ) + 8 * len (r6 )
411
437
412
438
def emit_property_module (f , mod , tbl , emit ):
413
439
f .write ("pub mod %s {\n " % mod )
@@ -517,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
517
543
# normalizations and conversions module
518
544
emit_norm_module (rf , canon_decomp , compat_decomp , combines , norm_props )
519
545
emit_conversions_module (rf , to_upper , to_lower , to_title )
546
+ #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
0 commit comments