25
25
26
26
import fileinput , re , os , sys , operator
27
27
28
# Running size totals (in bytes) accumulated by emit_bool_trie: bytes_old
# counts the range-table encoding, bytes_new the BoolTrie encoding.
bytes_old = bytes_new = 0
30
+
28
31
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
29
32
// file at the top-level directory of this distribution and at
30
33
// http://rust-lang.org/COPYRIGHT.
@@ -307,12 +310,137 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
307
310
format_table_content (f , data , 8 )
308
311
f .write ("\n ];\n \n " )
309
312
313
def emit_trie_lookup_range_table(f):
    """Write the Rust `BoolTrie` struct and its lookup functions to `f`.

    The emitted text is a fixed snippet: the `BoolTrie` struct definition,
    the `trie_range_leaf` bit test, and `trie_lookup_range_table`, which
    picks one of three lookup paths depending on the codepoint value.

    f -- object with a `write` method that receives the Rust source text.
    """
    # NOTE(review): leading whitespace inside this literal is reproduced as
    # shown in the reviewed view; confirm against the checked-in file.
    rust_source = """

// BoolTrie is a trie for representing a set of Unicode codepoints. It is
// implemented with postfix compression (sharing of identical child nodes),
// which gives both compact size and fast lookup.
//
// The space of Unicode codepoints is divided into 3 subareas, each
// represented by a trie with different depth. In the first (0..0x800), there
// is no trie structure at all; each u64 entry corresponds to a bitvector
// effectively holding 64 bool values.
//
// In the second (0x800..0x10000), each child of the root node represents a
// 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
// the trie stores an 8-bit index into a shared table of leaf values. This
// exploits the fact that in reasonable sets, many such leaves can be shared.
//
// In the third (0x10000..0x110000), each child of the root node represents a
// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
// of a child tree. Each of these 64 bytes represents an index into the table
// of shared 64-bit leaf values. This exploits the sparse structure in the
// non-BMP range of most Unicode sets.
pub struct BoolTrie {
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
r1: [u64; 32], // leaves

// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
r2: [u8; 992], // first level
r3: &'static [u64], // leaves

// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
r4: [u8; 256], // first level
r5: &'static [u8], // second level
r6: &'static [u64], // leaves
}

fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool {
((bitmap_chunk >> (c & 63)) & 1) != 0
}

fn trie_lookup_range_table(c: char, r: &'static BoolTrie) -> bool {
let c = c as usize;
if c < 0x800 {
trie_range_leaf(c, r.r1[c >> 6])
} else if c < 0x10000 {
let child = r.r2[(c >> 6) - 0x20];
trie_range_leaf(c, r.r3[child as usize])
} else {
let child = r.r4[(c >> 12) - 0x10];
let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
trie_range_leaf(c, r.r6[leaf as usize])
}
}\n
"""
    f.write(rust_source)
367
+
368
def compute_trie(rawdata, chunksize):
    """Split `rawdata` into fixed-size chunks and share identical chunks.

    rawdata   -- sequence of values (each is converted with str() to build
                 the de-duplication key).
    chunksize -- number of consecutive elements per chunk. Trailing elements
                 that do not fill a whole chunk are ignored.

    Returns a tuple (root, child_data):
      root       -- one entry per chunk: the index of that chunk among the
                    distinct chunks, numbered in order of first appearance.
      child_data -- the distinct chunks concatenated in that same order, so
                    chunk k occupies child_data[k*chunksize:(k+1)*chunksize].
    """
    root = []
    childmap = {}
    child_data = []
    # Floor division: range() requires an int on Python 3, and `//` gives the
    # same result as `/` did for ints on Python 2.
    for i in range(len(rawdata) // chunksize):
        data = rawdata[i * chunksize:(i + 1) * chunksize]
        # The joined string serves as a hashable identity for the chunk.
        child = '|'.join(map(str, data))
        if child not in childmap:
            childmap[child] = len(childmap)
            child_data.extend(data)
        root.append(childmap[child])
    return (root, child_data)
380
+
381
def emit_bool_trie(f, name, t_data, is_pub=True):
    """Write the codepoint set `t_data` to `f` as a Rust `BoolTrie` constant.

    f      -- output object; written to directly and via format_table_content.
    name   -- identifier of the emitted Rust constant.
    t_data -- iterable of inclusive (lo, hi) codepoint ranges in the set.
    is_pub -- when true, the constant is emitted with a `pub ` prefix.

    Side effects: adds 8 bytes per range to the module-level `bytes_old`
    and the size of the emitted trie to the module-level `bytes_new`.
    """
    global bytes_old, bytes_new
    bytes_old += 8 * len(t_data)
    CHUNK = 64
    rawdata = [False] * 0x110000
    for (lo, hi) in t_data:
        for cp in range(lo, hi + 1):
            rawdata[cp] = True

    # convert to bitmap chunks of 64 bits each
    # (`//` throughout: range() and slice bounds must be ints on Python 3,
    # and floor division matches what `/` did for ints on Python 2)
    chunks = []
    for i in range(0x110000 // CHUNK):
        chunk = 0
        for j in range(CHUNK):
            if rawdata[i * CHUNK + j]:
                chunk |= 1 << j
        chunks.append(chunk)

    pub_string = ""
    if is_pub:
        pub_string = "pub "
    f.write(" %sconst %s: &'static super::BoolTrie = &super::BoolTrie {\n " % (pub_string, name))

    # 0..0x800: stored directly as bitmap leaves, no indirection
    f.write(" r1: [\n ")
    data = ','.join('0x%016x' % chunk for chunk in chunks[0:0x800 // CHUNK])
    format_table_content(f, data, 12)
    f.write("\n ],\n ")

    # 0x800..0x10000 trie: r2 holds one leaf index per chunk, r3 the
    # distinct leaves
    (r2, r3) = compute_trie(chunks[0x800 // CHUNK:0x10000 // CHUNK], 64 // CHUNK)
    f.write(" r2: [\n ")
    data = ','.join(str(node) for node in r2)
    format_table_content(f, data, 12)
    f.write("\n ],\n ")
    f.write(" r3: &[\n ")
    data = ','.join('0x%016x' % chunk for chunk in r3)
    format_table_content(f, data, 12)
    f.write("\n ],\n ")

    # 0x10000..0x110000 trie: `mid` holds one leaf index per chunk; grouping
    # those indices 64 at a time gives the first level (r4) and the shared
    # second-level blocks (r5), with r6 holding the distinct leaves
    (mid, r6) = compute_trie(chunks[0x10000 // CHUNK:0x110000 // CHUNK], 64 // CHUNK)
    (r4, r5) = compute_trie(mid, 64)
    f.write(" r4: [\n ")
    data = ','.join(str(node) for node in r4)
    format_table_content(f, data, 12)
    f.write("\n ],\n ")
    f.write(" r5: &[\n ")
    data = ','.join(str(node) for node in r5)
    format_table_content(f, data, 12)
    f.write("\n ],\n ")
    f.write(" r6: &[\n ")
    data = ','.join('0x%016x' % chunk for chunk in r6)
    format_table_content(f, data, 12)
    f.write("\n ],\n ")

    f.write(" };\n \n ")
    # 256 = r1 (32 u64 values), 992 = r2, 256 = r4; r3 and r6 entries are
    # 8 bytes each and r5 entries 1 byte each
    bytes_new += 256 + 992 + 256 + 8 * len(r3) + len(r5) + 8 * len(r6)
437
+
310
438
def emit_property_module(f, mod, tbl, emit):
    """Write a Rust `pub mod` holding one trie table and predicate per category.

    f    -- output object with a `write` method.
    mod  -- name of the Rust module to emit.
    tbl  -- mapping from category name to the data passed to emit_bool_trie.
    emit -- iterable of category names to emit; processed in sorted order.
    """
    out = f.write
    out("pub mod %s {\n " % mod)
    for cat in sorted(emit):
        # Table constant first, then the predicate that looks `c` up in it.
        table_name = "%s_table" % cat
        emit_bool_trie(f, table_name, tbl[cat])
        out(" pub fn %s(c: char) -> bool {\n " % cat)
        out(" super::trie_lookup_range_table (c, %s_table)\n " % cat)
        out(" }\n \n ")
    out("}\n \n ")
318
446
@@ -402,8 +530,9 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
402
530
norm_props = load_properties ("DerivedNormalizationProps.txt" ,
403
531
["Full_Composition_Exclusion" ])
404
532
405
- # bsearch_range_table is used in all the property modules below
406
- emit_bsearch_range_table (rf )
533
+ # trie_lookup_range_table is used in all the property modules below
534
+ emit_trie_lookup_range_table (rf )
535
+ # emit_bsearch_range_table(rf)
407
536
408
537
# category tables
409
538
for (name , cat , pfuns ) in ("general_category" , gencats , ["N" , "Cc" ]), \
@@ -414,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
414
543
# normalizations and conversions module
415
544
emit_norm_module (rf , canon_decomp , compat_decomp , combines , norm_props )
416
545
emit_conversions_module (rf , to_upper , to_lower , to_title )
546
+ #print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
0 commit comments