1
+ pub ( crate ) mod encode;
2
+
1
3
use std:: collections:: hash_map:: Entry ;
2
4
use std:: collections:: { BTreeMap , VecDeque } ;
3
5
4
- use base64:: prelude:: * ;
5
6
use rustc_data_structures:: fx:: { FxHashMap , FxIndexMap } ;
6
7
use rustc_middle:: ty:: TyCtxt ;
7
8
use rustc_span:: def_id:: DefId ;
@@ -18,12 +19,33 @@ use crate::html::format::join_with_double_colon;
18
19
use crate :: html:: markdown:: short_markdown_summary;
19
20
use crate :: html:: render:: { self , IndexItem , IndexItemFunctionType , RenderType , RenderTypeId } ;
20
21
22
+ use encode:: { bitmap_to_string, write_vlqhex_to_string} ;
23
+
21
24
/// The serialized search description sharded version
22
25
///
23
26
/// The `index` is a JSON-encoded list of names and other information.
24
27
///
25
28
/// The desc has newlined descriptions, split up by size into 128KiB shards.
26
29
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
30
+ ///
31
+ /// There is no single, optimal size for these shards, because it depends on
32
+ /// configuration values that we can't predict or control, such as the version
33
+ /// of HTTP used (HTTP/1.1 would work better with larger files, while HTTP/2
34
+ /// and 3 are more agnostic), transport compression (gzip, zstd, etc), whether
35
+ /// the search query is going to produce a large number of results or a small
36
+ /// number, the bandwidth delay product of the network...
37
+ ///
38
+ /// Gzipping some standard library descriptions to guess what transport
39
+ /// compression will do, the compressed file sizes can be as small as 4.9KiB
40
+ /// or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
41
+ /// A "reasonable" range for files is between 1KiB,
42
+ /// since that's about the amount of data that can be transferred in a
43
+ /// single TCP packet[1], and 64KiB, the maximum amount of data that
44
+ /// TCP can transfer in a single round trip without extensions[2][3].
45
+ ///
46
+ /// [1]: https://en.wikipedia.org/wiki/Maximum_transmission_unit#MTUs_for_common_media
47
+ /// [2]: https://en.wikipedia.org/wiki/Sliding_window_protocol#Basic_concept
48
+ /// [3]: https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/description-tcp-features
27
49
pub ( crate ) struct SerializedSearchIndex {
28
50
pub ( crate ) index : String ,
29
51
pub ( crate ) desc : Vec < ( usize , String ) > ,
@@ -342,9 +364,9 @@ pub(crate) fn build_index<'tcx>(
342
364
associated_item_disambiguators : & ' a Vec < ( usize , String ) > ,
343
365
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
344
366
// for information on the format.
345
- descindex : String ,
367
+ desc_index : String ,
346
368
// A list of items with no description. This is eventually turned into a bitmap.
347
- emptydesc : Vec < u32 > ,
369
+ empty_desc : Vec < u32 > ,
348
370
}
349
371
350
372
struct Paths {
@@ -476,36 +498,28 @@ pub(crate) fn build_index<'tcx>(
476
498
crate_data. serialize_field ( "q" , & full_paths) ?;
477
499
crate_data. serialize_field ( "i" , & parents) ?;
478
500
crate_data. serialize_field ( "f" , & functions) ?;
479
- crate_data. serialize_field ( "D" , & self . descindex ) ?;
501
+ crate_data. serialize_field ( "D" , & self . desc_index ) ?;
480
502
crate_data. serialize_field ( "p" , & paths) ?;
481
503
crate_data. serialize_field ( "b" , & self . associated_item_disambiguators ) ?;
482
- let mut buf = Vec :: new ( ) ;
483
- let mut strbuf = String :: new ( ) ;
484
- write_bitmap_to_bytes ( & deprecated, & mut buf) . unwrap ( ) ;
485
- BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
486
- crate_data. serialize_field ( "c" , & strbuf) ?;
487
- strbuf. clear ( ) ;
488
- buf. clear ( ) ;
489
- write_bitmap_to_bytes ( & self . emptydesc , & mut buf) . unwrap ( ) ;
490
- BASE64_STANDARD . encode_string ( & buf, & mut strbuf) ;
491
- crate_data. serialize_field ( "e" , & strbuf) ?;
504
+ crate_data. serialize_field ( "c" , & bitmap_to_string ( & deprecated) ) ?;
505
+ crate_data. serialize_field ( "e" , & bitmap_to_string ( & self . empty_desc ) ) ?;
492
506
if has_aliases {
493
507
crate_data. serialize_field ( "a" , & self . aliases ) ?;
494
508
}
495
509
crate_data. end ( )
496
510
}
497
511
}
498
512
499
- let ( emptydesc , desc) = {
500
- let mut emptydesc = Vec :: new ( ) ;
513
+ let ( empty_desc , desc) = {
514
+ let mut empty_desc = Vec :: new ( ) ;
501
515
let mut result = Vec :: new ( ) ;
502
516
let mut set = String :: new ( ) ;
503
517
let mut len: usize = 0 ;
504
- let mut itemindex : u32 = 0 ;
518
+ let mut item_index : u32 = 0 ;
505
519
for desc in std:: iter:: once ( & crate_doc) . chain ( crate_items. iter ( ) . map ( |item| & item. desc ) ) {
506
520
if desc == "" {
507
- emptydesc . push ( itemindex ) ;
508
- itemindex += 1 ;
521
+ empty_desc . push ( item_index ) ;
522
+ item_index += 1 ;
509
523
continue ;
510
524
}
511
525
if set. len ( ) >= DESC_INDEX_SHARD_LEN {
@@ -516,23 +530,23 @@ pub(crate) fn build_index<'tcx>(
516
530
}
517
531
set. push_str ( & desc) ;
518
532
len += 1 ;
519
- itemindex += 1 ;
533
+ item_index += 1 ;
520
534
}
521
535
result. push ( ( len, std:: mem:: replace ( & mut set, String :: new ( ) ) ) ) ;
522
- ( emptydesc , result)
536
+ ( empty_desc , result)
523
537
} ;
524
538
525
- let descindex = {
526
- let mut descindex = String :: with_capacity ( desc. len ( ) * 4 ) ;
539
+ let desc_index = {
540
+ let mut desc_index = String :: with_capacity ( desc. len ( ) * 4 ) ;
527
541
for & ( len, _) in desc. iter ( ) {
528
- write_vlqhex_to_string ( len. try_into ( ) . unwrap ( ) , & mut descindex ) ;
542
+ write_vlqhex_to_string ( len. try_into ( ) . unwrap ( ) , & mut desc_index ) ;
529
543
}
530
- descindex
544
+ desc_index
531
545
} ;
532
546
533
547
assert_eq ! (
534
548
crate_items. len( ) + 1 ,
535
- desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) + emptydesc . len( )
549
+ desc. iter( ) . map( |( len, _) | * len) . sum:: <usize >( ) + empty_desc . len( )
536
550
) ;
537
551
538
552
// The index, which is actually used to search, is JSON
@@ -546,8 +560,8 @@ pub(crate) fn build_index<'tcx>(
546
560
paths: crate_paths,
547
561
aliases: & aliases,
548
562
associated_item_disambiguators: & associated_item_disambiguators,
549
- descindex ,
550
- emptydesc ,
563
+ desc_index ,
564
+ empty_desc ,
551
565
} )
552
566
. expect( "failed serde conversion" )
553
567
// All these `replace` calls are because we have to go through JS string for JSON content.
@@ -559,237 +573,6 @@ pub(crate) fn build_index<'tcx>(
559
573
SerializedSearchIndex { index, desc }
560
574
}
561
575
562
/// Appends `n` to `string` in the "vlqhex" format: zig-zag encoded,
/// self-terminating hexadecimal.
///
/// Zig-zag encoding stores the sign in the lowest bit, so the number that
/// actually gets written is `(|n| << 1) | (n < 0)`.
///
/// Self-terminating hex uses the 16 characters starting at `@` (that is,
/// `@`, `A`..=`O`) for every hexit except the least significant one, which
/// uses the 16 characters starting at `` ` `` (that is, `` ` ``, `a`..=`o`).
/// A reader therefore knows a number has ended as soon as it sees a
/// character from the second range. Leading zero hexits are not written,
/// but at least one hexit always is.
///
/// Examples: `0` is `` ` ``, `1` is `b`, `-1` is `c` (`a` would be the
/// imaginary -0), `8` is `` A` `` and `-8` is `Aa`.
///
/// <https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html>
/// describes the encoding in more detail.
///
/// Every `i32` is accepted. `i32::MIN` needs 33 bits once the sign bit is
/// appended, so it is the only input that produces nine hexits.
pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
    // `unsigned_abs` cannot overflow (unlike `-n` for `i32::MIN`), and the
    // widening to `u64` keeps the top magnitude bit when shifting in the sign.
    let value = (u64::from(n.unsigned_abs()) << 1) | u64::from(n < 0);

    // Number of hexits needed to hold the significant bits; zero still needs one.
    let significant_bits = u64::BITS - value.leading_zeros();
    let hexit_count = significant_bits.div_ceil(4).max(1);

    // Most significant hexit first; only the last one uses the lowercase range.
    for position in (0..hexit_count).rev() {
        let hexit = u8::try_from((value >> (position * 4)) & 0xF).expect("masked to four bits");
        let base = if position == 0 { b'`' } else { b'@' };
        string.push(char::from(base + hexit));
    }
}
598
-
599
// The serialized layout was checked against roaring-rs in
// https://gitlab.com/notriddle/roaring-test

/// Serializes the set of integers in `domain` as a Roaring bitmap and writes
/// the bytes to `out`.
///
/// The values are split into chunks by their upper 16 bits (the "key"); the
/// lower 16 bits of each value are stored in a per-key container, which is
/// either a sorted array, a 65536-bit bitset, or a list of runs. See
/// <https://arxiv.org/pdf/1603.06549.pdf> for the data structure and
/// <https://github.com/RoaringBitmap/RoaringFormatSpec> for the byte layout.
///
/// NOTE(review): this assumes `domain` is sorted in ascending order without
/// duplicates. Values with the same key are only grouped when adjacent, and
/// run detection only extends the most recent run. Confirm against callers.
///
/// # Errors
///
/// Returns any error reported by `out` while writing.
pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> {
    /// Storage for the low 16 bits of all values sharing one key.
    enum Container {
        /// One bit per possible value, 1024 * 64 = 65536 bits.
        Bits(Box<[u64; 1024]>),
        /// List of entries.
        Array(Vec<u16>),
        /// List of (start, len-1).
        Run(Vec<(u16, u16)>),
    }

    impl Container {
        /// Number of values stored in this container.
        fn popcount(&self) -> u32 {
            match self {
                Container::Bits(bits) => bits.iter().map(|word| word.count_ones()).sum(),
                Container::Array(array) => {
                    u32::try_from(array.len()).expect("array can't be bigger than 2**32")
                }
                Container::Run(runs) => runs.iter().map(|&(_, lenm1)| u32::from(lenm1) + 1).sum(),
            }
        }

        /// Adds `value`, switching from array to bitset once the array has
        /// grown to 4096 entries.
        fn push(&mut self, value: u16) {
            match self {
                Container::Bits(bits) => bits[usize::from(value >> 6)] |= 1 << (value & 0x3F),
                Container::Array(array) => {
                    array.push(value);
                    if array.len() >= 4096 {
                        let entries = std::mem::take(array);
                        *self = Container::Bits(Box::new([0; 1024]));
                        for entry in entries {
                            self.push(entry);
                        }
                    }
                }
                Container::Run(runs) => match runs.last_mut() {
                    // Widen before adding so a run ending at 65535 cannot overflow.
                    Some(run) if u32::from(run.0) + u32::from(run.1) + 1 == u32::from(value) => {
                        run.1 += 1;
                    }
                    _ => runs.push((value, 0)),
                },
            }
        }

        /// Converts this container into a run container when that is smaller
        /// than its current form. Returns `true` if the container is a run
        /// container afterwards.
        fn try_make_run(&mut self) -> bool {
            match self {
                Container::Bits(bits) => {
                    // Count the runs by counting the positions where a run ends:
                    // a set bit followed by a clear bit, within a word or across
                    // the boundary to the next word.
                    let mut run_count: u64 = 0;
                    for (i, chunk) in bits.iter().copied().enumerate() {
                        let next_chunk = bits.get(i + 1).copied().unwrap_or(0);
                        run_count += u64::from((!chunk & (chunk << 1)).count_ones());
                        run_count += (chunk >> 63) & !next_chunk;
                    }
                    // A run container takes 2 + 4 * runs bytes, a bitset always 8192.
                    if 2 + 4 * run_count < 8192 {
                        let bits = std::mem::replace(bits, Box::new([0; 1024]));
                        *self = Container::Run(Vec::new());
                        for (i, word) in bits.iter().copied().enumerate() {
                            if word == 0 {
                                continue;
                            }
                            let base = u16::try_from(i).expect("a bitset has 1024 words") << 6;
                            for j in 0..64u16 {
                                if word & (1 << j) != 0 {
                                    self.push(base | j);
                                }
                            }
                        }
                        true
                    } else {
                        false
                    }
                }
                Container::Array(array) if array.len() <= 5 => false,
                Container::Array(array) => {
                    // A new run starts at every value that does not directly
                    // follow its predecessor, including the very first value.
                    let mut run_count = 0;
                    let mut prev: Option<u16> = None;
                    for value in array.iter().copied() {
                        if prev.and_then(|p| p.checked_add(1)) != Some(value) {
                            run_count += 1;
                        }
                        prev = Some(value);
                    }
                    if 2 + 4 * run_count < 2 * array.len() + 2 {
                        let entries = std::mem::take(array);
                        *self = Container::Run(Vec::new());
                        for entry in entries {
                            self.push(entry);
                        }
                        true
                    } else {
                        false
                    }
                }
                Container::Run(_) => true,
            }
        }
    }

    /// Splits a value into its container key (high half) and entry (low half).
    fn split_entry(entry: u32) -> (u16, u16) {
        let key = u16::try_from(entry >> 16).expect("shifted off the top 16 bits, so it should fit");
        let low = u16::try_from(entry & 0x00_00_FF_FF).expect("AND 16 bits, so it should fit");
        (key, low)
    }

    // Group adjacent values with the same key into one container each.
    let mut containers = Vec::<(u16, Container)>::new();
    let mut has_run = false;
    let mut entries = domain.iter().copied().peekable();
    while let Some(entry) = entries.next() {
        let (key, low) = split_entry(entry);
        let mut container = Container::Array(vec![low]);
        while let Some(next) = entries.next_if(|&next| split_entry(next).0 == key) {
            container.push(split_entry(next).1);
        }
        has_run |= container.try_make_run();
        containers.push((key, container));
    }

    // https://github.com/RoaringBitmap/RoaringFormatSpec
    const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
    const SERIAL_COOKIE: u32 = 12347;
    const NO_OFFSET_THRESHOLD: u32 = 4;
    let size: u32 = containers.len().try_into().expect("at most 2**16 containers");

    // Cookie header. `start_offset` is the byte position of the first container.
    let start_offset = if has_run {
        // `has_run` implies at least one container, so `size - 1` cannot underflow.
        out.write_all(&(SERIAL_COOKIE | ((size - 1) << 16)).to_le_bytes())?;
        // One bit per container, set when it is a run container.
        for group in containers.chunks(8) {
            let mut run_flags = 0u8;
            for (i, (_, container)) in group.iter().enumerate() {
                if matches!(container, Container::Run(..)) {
                    run_flags |= 1 << i;
                }
            }
            out.write_all(&[run_flags])?;
        }
        if size < NO_OFFSET_THRESHOLD {
            4 + 4 * size + ((size + 7) / 8)
        } else {
            4 + 8 * size + ((size + 7) / 8)
        }
    } else {
        out.write_all(&SERIAL_COOKIE_NO_RUNCONTAINER.to_le_bytes())?;
        out.write_all(&size.to_le_bytes())?;
        4 + 4 + 4 * size + 4 * size
    };

    // Descriptive header: key and cardinality minus one, 16 bits each.
    for (key, container) in &containers {
        let count: u32 = container.popcount() - 1;
        out.write_all(&((count << 16) | u32::from(*key)).to_le_bytes())?;
    }

    // Offset header: byte position of every container.
    if !has_run || size >= NO_OFFSET_THRESHOLD {
        let mut offset = start_offset;
        for (_, container) in &containers {
            out.write_all(&offset.to_le_bytes())?;
            offset += match container {
                Container::Bits(_) => 8192u32,
                Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
                Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
            };
        }
    }

    // Container payloads, all little-endian.
    for (_, container) in &containers {
        match container {
            Container::Bits(bits) => {
                for word in bits.iter() {
                    out.write_all(&word.to_le_bytes())?;
                }
            }
            Container::Array(array) => {
                for value in array {
                    out.write_all(&value.to_le_bytes())?;
                }
            }
            Container::Run(runs) => {
                let run_count = u16::try_from(runs.len()).expect("at most 2**15 runs");
                out.write_all(&run_count.to_le_bytes())?;
                for &(start, lenm1) in runs {
                    out.write_all(&start.to_le_bytes())?;
                    out.write_all(&lenm1.to_le_bytes())?;
                }
            }
        }
    }
    Ok(())
}
792
-
793
576
pub ( crate ) fn get_function_type_for_search < ' tcx > (
794
577
item : & clean:: Item ,
795
578
tcx : TyCtxt < ' tcx > ,
0 commit comments