1
1
use std:: collections:: hash_map:: Entry ;
2
2
use std:: collections:: { BTreeMap , VecDeque } ;
3
3
4
+ use base64:: prelude:: * ;
4
5
use rustc_data_structures:: fx:: { FxHashMap , FxIndexMap } ;
5
6
use rustc_middle:: ty:: TyCtxt ;
6
7
use rustc_span:: def_id:: DefId ;
@@ -21,14 +22,14 @@ use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, Re
21
22
///
22
23
/// The `index` is a JSON-encoded list of names and other information.
23
24
///
24
- /// The desc has newlined descriptions, split up by size into 1MiB shards.
25
+ /// The desc has newlined descriptions, split up by size into 128KiB shards.
25
26
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
26
27
pub ( crate ) struct SerializedSearchIndex {
27
28
/// JSON-encoded list of names and other information.
pub ( crate ) index : String ,
28
29
/// Description shards. Each entry pairs the number of descriptions held in the
/// shard with the shard text itself, e.g. `(4, "foo\nbar\nbaz\nquux")`.
pub ( crate ) desc : Vec < ( usize , String ) > ,
29
30
}
30
31
31
- const DESC_INDEX_SHARD_LEN : usize = 1024 * 1024 ;
32
/// Byte length (128KiB) at which the description shard being built is closed:
/// once the accumulated shard text is at least this long, it is pushed to the
/// result and a new shard is started before the next description is appended.
+ const DESC_INDEX_SHARD_LEN : usize = 128 * 1024 ;
32
33
33
34
/// Builds the search index from the collected metadata
34
35
pub ( crate ) fn build_index < ' tcx > (
@@ -342,6 +343,8 @@ pub(crate) fn build_index<'tcx>(
342
343
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
343
344
// for information on the format.
344
345
descindex : String ,
346
+ // A list of items with no description. This is eventually turned into a bitmap.
347
+ emptydesc : Vec < u32 > ,
345
348
}
346
349
347
350
struct Paths {
@@ -456,7 +459,8 @@ pub(crate) fn build_index<'tcx>(
456
459
}
457
460
458
461
if item. deprecation . is_some ( ) {
459
- deprecated. push ( index) ;
462
+ // bitmasks always use 1-indexing for items, with 0 as the crate itself
463
+ deprecated. push ( u32:: try_from ( index + 1 ) . unwrap ( ) ) ;
460
464
}
461
465
}
462
466
@@ -473,21 +477,37 @@ pub(crate) fn build_index<'tcx>(
473
477
crate_data. serialize_field ( "i" , & parents) ?;
474
478
crate_data. serialize_field ( "f" , & functions) ?;
475
479
crate_data. serialize_field ( "D" , & self . descindex ) ?;
476
- crate_data. serialize_field ( "c" , & deprecated) ?;
477
480
crate_data. serialize_field ( "p" , & paths) ?;
478
481
crate_data. serialize_field ( "b" , & self . associated_item_disambiguators ) ?;
482
+ let mut buf = Vec :: new ( ) ;
483
+ let mut strbuf = String :: new ( ) ;
484
+ write_bitmap_to_bytes ( & deprecated, & mut buf) . unwrap ( ) ;
485
+ BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
486
+ crate_data. serialize_field ( "c" , & strbuf) ?;
487
+ strbuf. clear ( ) ;
488
+ buf. clear ( ) ;
489
+ write_bitmap_to_bytes ( & self . emptydesc , & mut buf) . unwrap ( ) ;
490
+ BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
491
+ crate_data. serialize_field ( "e" , & strbuf) ?;
479
492
if has_aliases {
480
493
crate_data. serialize_field ( "a" , & self . aliases ) ?;
481
494
}
482
495
crate_data. end ( )
483
496
}
484
497
}
485
498
486
- let desc = {
499
+ let ( emptydesc, desc) = {
500
+ let mut emptydesc = Vec :: new ( ) ;
487
501
let mut result = Vec :: new ( ) ;
488
502
let mut set = String :: new ( ) ;
489
503
let mut len: usize = 0 ;
504
+ let mut itemindex: u32 = 0 ;
490
505
for desc in std:: iter:: once ( & crate_doc) . chain ( crate_items. iter ( ) . map ( |item| & item. desc ) ) {
506
+ if desc == "" {
507
+ emptydesc. push ( itemindex) ;
508
+ itemindex += 1 ;
509
+ continue ;
510
+ }
491
511
if set. len ( ) >= DESC_INDEX_SHARD_LEN {
492
512
result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
493
513
len = 0 ;
@@ -496,9 +516,10 @@ pub(crate) fn build_index<'tcx>(
496
516
}
497
517
set. push_str ( & desc) ;
498
518
len += 1 ;
519
+ itemindex += 1 ;
499
520
}
500
521
result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
501
- result
522
+ ( emptydesc , result)
502
523
} ;
503
524
504
525
let descindex = {
@@ -509,7 +530,10 @@ pub(crate) fn build_index<'tcx>(
509
530
descindex
510
531
} ;
511
532
512
- assert_eq ! ( crate_items. len( ) + 1 , desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) ) ;
533
+ assert_eq ! (
534
+ crate_items. len( ) + 1 ,
535
+ desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) + emptydesc. len( )
536
+ ) ;
513
537
514
538
// The index, which is actually used to search, is JSON
515
539
// It uses `JSON.parse(..)` to actually load, since JSON
@@ -523,6 +547,7 @@ pub(crate) fn build_index<'tcx>(
523
547
aliases: & aliases,
524
548
associated_item_disambiguators: & associated_item_disambiguators,
525
549
descindex,
550
+ emptydesc,
526
551
} )
527
552
. expect( "failed serde conversion" )
528
553
// All these `replace` calls are because we have to go through JS string for JSON content.
@@ -571,6 +596,200 @@ pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
571
596
}
572
597
}
573
598
599
+ // checked against roaring-rs in
600
+ // https://gitlab.com/notriddle/roaring-test
601
/// Serializes the set of integers in `domain` to `out` as a Roaring bitmap.
///
/// The byte layout follows <https://github.com/RoaringBitmap/RoaringFormatSpec>:
/// a cookie header, one descriptive header (key, cardinality - 1) per container,
/// an optional offset header, and then the container payloads. All integers are
/// written little-endian.
///
/// Each value is split into a 16-bit key (the high half) and a 16-bit entry (the
/// low half). Consecutive values that share a key are stored in one container.
///
/// `domain` is expected to be sorted in ascending order without duplicates; this
/// is not checked here. (NOTE(review): the callers visible in this file push
/// strictly increasing indices — confirm for any new caller.)
///
/// # Errors
///
/// Returns any error reported by `out` while writing.
///
/// # Panics
///
/// Panics if the number of containers does not fit in a `u32`.
pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> {
    // https://arxiv.org/pdf/1603.06549.pdf
    enum Container {
        /// bitset with one bit per possible 16-bit entry (1024 * 64 = 65536 bits)
        Bits(Box<[u64; 1024]>),
        /// list of entries
        Array(Vec<u16>),
        /// list of (start, len-1)
        Run(Vec<(u16, u16)>),
    }
    impl Container {
        /// Number of entries stored in this container.
        fn popcount(&self) -> u32 {
            match self {
                Container::Bits(bits) => bits.iter().map(|word| word.count_ones()).sum(),
                Container::Array(array) => {
                    u32::try_from(array.len()).expect("array can't be bigger than 2**32")
                }
                Container::Run(runs) => runs.iter().map(|&(_, lenm1)| u32::from(lenm1) + 1).sum(),
            }
        }
        /// Adds `value` to the container. Values must be pushed in ascending order.
        fn push(&mut self, value: u16) {
            match self {
                Container::Bits(bits) => bits[usize::from(value >> 6)] |= 1 << (value & 0x3F),
                Container::Array(array) => {
                    array.push(value);
                    // An array is promoted to a bitset once it reaches 4096 entries.
                    // NOTE(review): the format spec describes arrays as holding up to
                    // 4096 values, so a container with exactly 4096 values is written
                    // as a bitset here. The reader must use the same cut-off; confirm
                    // before changing this threshold.
                    if array.len() >= 4096 {
                        let mut bitset = Container::Bits(Box::new([0; 1024]));
                        for &entry in array.iter() {
                            bitset.push(entry);
                        }
                        *self = bitset;
                    }
                }
                Container::Run(runs) => {
                    // Widen to u32: for a run ending at 65535 the "next value" is
                    // 65536, which does not fit in u16.
                    let extends_last = match runs.last() {
                        Some(&(start, lenm1)) => {
                            u32::from(start) + u32::from(lenm1) + 1 == u32::from(value)
                        }
                        None => false,
                    };
                    if extends_last {
                        if let Some(last) = runs.last_mut() {
                            last.1 += 1;
                        }
                    } else {
                        runs.push((value, 0));
                    }
                }
            }
        }
        /// Converts the container to run-length encoding when that is smaller than
        /// its current representation. Returns `true` if the container is a `Run`
        /// afterwards.
        fn try_make_run(&mut self) -> bool {
            match self {
                Container::Bits(bits) => {
                    // Count runs as in Algorithm 1 of the paper: every run ends
                    // either inside a word (a set bit followed by a clear bit) or at
                    // bit 63 of a word whose successor does not start with a set bit.
                    let mut r: u64 = 0;
                    for (i, chunk) in bits.iter().copied().enumerate() {
                        let next_chunk = bits.get(i + 1).copied().unwrap_or(0);
                        r += u64::from(((chunk << 1) & !chunk).count_ones());
                        r += (chunk >> 63) & !next_chunk;
                    }
                    // A bitset always takes 8192 bytes; runs take 2 + 4 bytes each.
                    if 2 + 4 * r >= 8192 {
                        return false;
                    }
                    let mut run = Container::Run(Vec::new());
                    for (i, word) in bits.iter().copied().enumerate() {
                        if word == 0 {
                            continue;
                        }
                        let base = u16::try_from(i).expect("bitset has 1024 words") << 6;
                        for j in 0..64u16 {
                            if word & (1 << j) != 0 {
                                run.push(base | j);
                            }
                        }
                    }
                    *self = run;
                    true
                }
                Container::Array(array) if array.len() <= 5 => false,
                Container::Array(array) => {
                    // A new run starts at the first entry and wherever an entry is
                    // not the direct successor of the previous one.
                    let mut r: usize = 0;
                    let mut prev: Option<u16> = None;
                    for value in array.iter().copied() {
                        if prev.and_then(|p| p.checked_add(1)) != Some(value) {
                            r += 1;
                        }
                        prev = Some(value);
                    }
                    if 2 + 4 * r >= 2 * array.len() + 2 {
                        return false;
                    }
                    let mut run = Container::Run(Vec::new());
                    for value in array.iter().copied() {
                        run.push(value);
                    }
                    *self = run;
                    true
                }
                Container::Run(_) => true,
            }
        }
    }
    /// Low 16 bits of `entry`: its position inside the container.
    fn low_bits(entry: u32) -> u16 {
        u16::try_from(entry & 0x00_00_FF_FF).expect("AND 16 bits, so it should fit")
    }

    // Group the input into one container per distinct high half.
    let mut keys = Vec::<u16>::new();
    let mut containers = Vec::<Container>::new();
    let mut has_run = false;
    let mut domain_iter = domain.iter().copied().peekable();
    while let Some(entry) = domain_iter.next() {
        let key =
            u16::try_from(entry >> 16).expect("shifted off the top 16 bits, so it should fit");
        let mut container = Container::Array(vec![low_bits(entry)]);
        while let Some(entry) = domain_iter.next_if(|&e| e >> 16 == u32::from(key)) {
            container.push(low_bits(entry));
        }
        keys.push(key);
        has_run = container.try_make_run() || has_run;
        containers.push(container);
    }

    // https://github.com/RoaringBitmap/RoaringFormatSpec
    const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
    const SERIAL_COOKIE: u32 = 12347;
    const NO_OFFSET_THRESHOLD: u32 = 4;
    let size = u32::try_from(containers.len()).expect("container count should fit in u32");
    // Cookie header. `start_offset` is the byte position of the first container
    // payload, i.e. the total size of all headers.
    let start_offset = if has_run {
        out.write_all(&(SERIAL_COOKIE | ((size - 1) << 16)).to_le_bytes())?;
        // One flag bit per container, set when the container is run-encoded.
        for set in containers.chunks(8) {
            let mut flags: u8 = 0;
            for (i, container) in set.iter().enumerate() {
                if matches!(container, Container::Run(..)) {
                    flags |= 1 << i;
                }
            }
            out.write_all(&[flags])?;
        }
        let run_flag_bytes = (size + 7) / 8;
        if size < NO_OFFSET_THRESHOLD {
            4 + 4 * size + run_flag_bytes
        } else {
            4 + 8 * size + run_flag_bytes
        }
    } else {
        out.write_all(&SERIAL_COOKIE_NO_RUNCONTAINER.to_le_bytes())?;
        out.write_all(&size.to_le_bytes())?;
        4 + 4 + 4 * size + 4 * size
    };
    // descriptive header: key in the low half, cardinality - 1 in the high half
    for (&key, container) in keys.iter().zip(&containers) {
        let key = u32::from(key);
        let count: u32 = container.popcount() - 1;
        out.write_all(&((count << 16) | key).to_le_bytes())?;
    }
    if !has_run || size >= NO_OFFSET_THRESHOLD {
        // offset header
        let mut starting_offset = start_offset;
        for container in &containers {
            out.write_all(&starting_offset.to_le_bytes())?;
            starting_offset += match container {
                Container::Bits(_) => 8192u32,
                Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
                Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
            };
        }
    }
    // container payloads
    for container in &containers {
        match container {
            Container::Bits(bits) => {
                for chunk in bits.iter() {
                    out.write_all(&chunk.to_le_bytes())?;
                }
            }
            Container::Array(array) => {
                for value in array.iter() {
                    out.write_all(&value.to_le_bytes())?;
                }
            }
            Container::Run(runs) => {
                out.write_all(&u16::try_from(runs.len()).unwrap().to_le_bytes())?;
                for &(start, lenm1) in runs.iter() {
                    out.write_all(&start.to_le_bytes())?;
                    out.write_all(&lenm1.to_le_bytes())?;
                }
            }
        }
    }
    Ok(())
}
792
+
574
793
pub ( crate ) fn get_function_type_for_search < ' tcx > (
575
794
item : & clean:: Item ,
576
795
tcx : TyCtxt < ' tcx > ,
0 commit comments