Skip to content

Commit 28db4cc

Browse files
committed
rustdoc-search: compressed bitmap to sort, then load desc
This adds a bit more data than "pure sharding" by including information about which items have no description at all. This way, the search can sort the results, then truncate them, and only then download the descriptions. Search index size with the "e" bitmap: 2380KiB; without the "e" bitmap: 2364KiB.
1 parent e860b9c commit 28db4cc

File tree

5 files changed

+395
-37
lines changed

5 files changed

+395
-37
lines changed

Diff for: Cargo.lock

+2
Original file line numberDiff line numberDiff line change
@@ -4741,6 +4741,8 @@ version = "0.0.0"
47414741
dependencies = [
47424742
"arrayvec",
47434743
"askama",
4744+
"base64",
4745+
"byteorder",
47444746
"expect-test",
47454747
"indexmap",
47464748
"itertools 0.12.1",

Diff for: src/librustdoc/Cargo.toml

+2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ path = "lib.rs"
99
[dependencies]
1010
arrayvec = { version = "0.7", default-features = false }
1111
askama = { version = "0.12", default-features = false, features = ["config"] }
12+
base64 = "0.21.7"
13+
byteorder = "1.5"
1214
itertools = "0.12"
1315
indexmap = "2"
1416
minifier = "0.3.0"

Diff for: src/librustdoc/html/render/search_index.rs

+226-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use std::collections::hash_map::Entry;
22
use std::collections::{BTreeMap, VecDeque};
33

4+
use base64::prelude::*;
45
use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
56
use rustc_middle::ty::TyCtxt;
67
use rustc_span::def_id::DefId;
@@ -21,14 +22,14 @@ use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, Re
2122
///
2223
/// The `index` is a JSON-encoded list of names and other information.
2324
///
24-
/// The desc has newlined descriptions, split up by size into 1MiB shards.
25+
/// The desc has newlined descriptions, split up by size into 128KiB shards.
2526
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
2627
// Output of the search-index builder: the JSON index text plus the sharded descriptions.
pub(crate) struct SerializedSearchIndex {
2728
// JSON-encoded list of names and other information.
pub(crate) index: String,
2829
// Description shards. Each entry pairs the number of descriptions in the shard
// with those descriptions joined by newlines.
pub(crate) desc: Vec<(usize, String)>,
2930
}
3031

31-
const DESC_INDEX_SHARD_LEN: usize = 1024 * 1024;
32+
const DESC_INDEX_SHARD_LEN: usize = 128 * 1024;
3233

3334
/// Builds the search index from the collected metadata
3435
pub(crate) fn build_index<'tcx>(
@@ -342,6 +343,8 @@ pub(crate) fn build_index<'tcx>(
342343
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
343344
// for information on the format.
344345
descindex: String,
346+
// A list of items with no description. This is eventually turned into a bitmap.
347+
emptydesc: Vec<u32>,
345348
}
346349

347350
struct Paths {
@@ -456,7 +459,8 @@ pub(crate) fn build_index<'tcx>(
456459
}
457460

458461
if item.deprecation.is_some() {
459-
deprecated.push(index);
462+
// bitmasks always use 1-indexing for items, with 0 as the crate itself
463+
deprecated.push(u32::try_from(index + 1).unwrap());
460464
}
461465
}
462466

@@ -473,21 +477,37 @@ pub(crate) fn build_index<'tcx>(
473477
crate_data.serialize_field("i", &parents)?;
474478
crate_data.serialize_field("f", &functions)?;
475479
crate_data.serialize_field("D", &self.descindex)?;
476-
crate_data.serialize_field("c", &deprecated)?;
477480
crate_data.serialize_field("p", &paths)?;
478481
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
482+
let mut buf = Vec::new();
483+
let mut strbuf = String::new();
484+
write_bitmap_to_bytes(&deprecated, &mut buf).unwrap();
485+
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
486+
crate_data.serialize_field("c", &strbuf)?;
487+
strbuf.clear();
488+
buf.clear();
489+
write_bitmap_to_bytes(&self.emptydesc, &mut buf).unwrap();
490+
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
491+
crate_data.serialize_field("e", &strbuf)?;
479492
if has_aliases {
480493
crate_data.serialize_field("a", &self.aliases)?;
481494
}
482495
crate_data.end()
483496
}
484497
}
485498

486-
let desc = {
499+
let (emptydesc, desc) = {
500+
let mut emptydesc = Vec::new();
487501
let mut result = Vec::new();
488502
let mut set = String::new();
489503
let mut len: usize = 0;
504+
let mut itemindex: u32 = 0;
490505
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
506+
if desc == "" {
507+
emptydesc.push(itemindex);
508+
itemindex += 1;
509+
continue;
510+
}
491511
if set.len() >= DESC_INDEX_SHARD_LEN {
492512
result.push((len, std::mem::replace(&mut set, String::new())));
493513
len = 0;
@@ -496,9 +516,10 @@ pub(crate) fn build_index<'tcx>(
496516
}
497517
set.push_str(&desc);
498518
len += 1;
519+
itemindex += 1;
499520
}
500521
result.push((len, std::mem::replace(&mut set, String::new())));
501-
result
522+
(emptydesc, result)
502523
};
503524

504525
let descindex = {
@@ -509,7 +530,10 @@ pub(crate) fn build_index<'tcx>(
509530
descindex
510531
};
511532

512-
assert_eq!(crate_items.len() + 1, desc.iter().map(|(len, _)| *len).sum::<usize>());
533+
assert_eq!(
534+
crate_items.len() + 1,
535+
desc.iter().map(|(len, _)| *len).sum::<usize>() + emptydesc.len()
536+
);
513537

514538
// The index, which is actually used to search, is JSON
515539
// It uses `JSON.parse(..)` to actually load, since JSON
@@ -523,6 +547,7 @@ pub(crate) fn build_index<'tcx>(
523547
aliases: &aliases,
524548
associated_item_disambiguators: &associated_item_disambiguators,
525549
descindex,
550+
emptydesc,
526551
})
527552
.expect("failed serde conversion")
528553
// All these `replace` calls are because we have to go through JS string for JSON content.
@@ -571,6 +596,200 @@ pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
571596
}
572597
}
573598

599+
/// Serializes a list of `u32` values as a Roaring bitmap and writes it to `out`.
///
/// The byte layout follows <https://github.com/RoaringBitmap/RoaringFormatSpec>, and the
/// container-selection heuristics follow <https://arxiv.org/pdf/1603.06549.pdf>.
/// Checked against roaring-rs in <https://gitlab.com/notriddle/roaring-test>.
///
/// `domain` is expected to be sorted in ascending order without duplicates: entries are grouped
/// into containers by their upper 16 bits as they are encountered, and appended to the current
/// container in the order given. The input is not validated.
///
/// # Errors
///
/// Returns any I/O error reported by `out`.
pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> {
    enum Container {
        /// bitset with one bit per possible low 16-bit value (1024 words of 64 bits)
        Bits(Box<[u64; 1024]>),
        /// list of entries
        Array(Vec<u16>),
        /// list of (start, len-1)
        Run(Vec<(u16, u16)>),
    }
    impl Container {
        /// Number of values stored in this container.
        fn popcount(&self) -> u32 {
            match self {
                Container::Bits(bits) => bits.iter().map(|word| word.count_ones()).sum(),
                Container::Array(array) => {
                    u32::try_from(array.len()).expect("array can't be bigger than 2**32")
                }
                Container::Run(runs) => runs.iter().map(|&(_, lenm1)| u32::from(lenm1) + 1).sum(),
            }
        }
        /// Appends `value`, upgrading an array to a bitset once it holds 4096 entries.
        fn push(&mut self, value: u16) {
            match self {
                Container::Bits(bits) => bits[usize::from(value >> 6)] |= 1u64 << (value & 0x3F),
                Container::Array(array) => {
                    array.push(value);
                    // NOTE(review): the format spec appears to treat a cardinality of exactly
                    // 4096 as an array container; confirm that every reader of this output
                    // uses the same `>= 4096` cutoff before changing it.
                    if array.len() >= 4096 {
                        let array = std::mem::take(array);
                        *self = Container::Bits(Box::new([0; 1024]));
                        for value in array {
                            self.push(value);
                        }
                    }
                }
                Container::Run(runs) => match runs.last_mut() {
                    // Widen to u32 so that a run ending at 65535 cannot overflow the sum.
                    Some(last)
                        if u32::from(last.0) + u32::from(last.1) + 1 == u32::from(value) =>
                    {
                        last.1 += 1;
                    }
                    _ => runs.push((value, 0)),
                },
            }
        }
        /// Converts this container into a run container when that is strictly smaller.
        /// Returns `true` if the container is a run container afterwards.
        fn try_make_run(&mut self) -> bool {
            match self {
                Container::Bits(bits) => {
                    // Count runs by counting the positions right after a run's last bit:
                    // a clear bit whose predecessor is set.
                    let mut r: u64 = 0;
                    for (i, chunk) in bits.iter().copied().enumerate() {
                        let next_chunk = bits.get(i + 1).copied().unwrap_or(0);
                        // run ends inside this word
                        r += u64::from(((chunk << 1) & !chunk).count_ones());
                        // run ends exactly at the word boundary
                        r += (chunk >> 63) & !next_chunk;
                    }
                    if (2 + 4 * r) < 8192 {
                        let bits = std::mem::replace(bits, Box::new([0; 1024]));
                        *self = Container::Run(Vec::new());
                        for (i, word) in bits.iter().copied().enumerate() {
                            if word == 0 {
                                continue;
                            }
                            let base = u16::try_from(i).expect("at most 1024 words") << 6;
                            for j in 0..64u16 {
                                if word & (1u64 << j) != 0 {
                                    self.push(base | j);
                                }
                            }
                        }
                        true
                    } else {
                        false
                    }
                }
                Container::Array(array) if array.len() <= 5 => false,
                Container::Array(array) => {
                    // A new run starts whenever a value does not directly follow the previous
                    // one; the very first value always starts a run, even when it is 0.
                    let mut r: usize = 0;
                    let mut prev: Option<u16> = None;
                    for value in array.iter().copied() {
                        if prev.and_then(|p| p.checked_add(1)) != Some(value) {
                            r += 1;
                        }
                        prev = Some(value);
                    }
                    if 2 + 4 * r < 2 * array.len() + 2 {
                        let array = std::mem::take(array);
                        *self = Container::Run(Vec::new());
                        for value in array {
                            self.push(value);
                        }
                        true
                    } else {
                        false
                    }
                }
                Container::Run(_) => true,
            }
        }
    }
    /// Upper 16 bits of an entry: selects the container.
    fn high(entry: u32) -> u16 {
        u16::try_from(entry >> 16).expect("shifted off the top 16 bits, so it should fit")
    }
    /// Lower 16 bits of an entry: the value stored inside the container.
    fn low(entry: u32) -> u16 {
        u16::try_from(entry & 0x00_00_FF_FF).expect("AND 16 bits, so it should fit")
    }

    // Group the entries into one container per distinct upper half.
    let mut keys = Vec::<u16>::new();
    let mut containers = Vec::<Container>::new();
    let mut has_run = false;
    let mut domain_iter = domain.iter().copied().peekable();
    while let Some(entry) = domain_iter.next() {
        let key = high(entry);
        let mut container = Container::Array(vec![low(entry)]);
        while let Some(entry) = domain_iter.next_if(|&e| high(e) == key) {
            container.push(low(entry));
        }
        keys.push(key);
        has_run = container.try_make_run() || has_run;
        containers.push(container);
    }

    // https://github.com/RoaringBitmap/RoaringFormatSpec
    const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
    const SERIAL_COOKIE: u32 = 12347;
    const NO_OFFSET_THRESHOLD: u32 = 4;
    let size = u32::try_from(containers.len()).expect("container count should fit in u32");

    // Cookie header; `start_offset` is the byte position of the first container's data.
    let start_offset = if has_run {
        // `has_run` implies at least one container, so `size - 1` cannot underflow.
        out.write_all(&(SERIAL_COOKIE | ((size - 1) << 16)).to_le_bytes())?;
        // One flag bit per container, set when it is a run container.
        for set in containers.chunks(8) {
            let mut flags = 0u8;
            for (i, container) in set.iter().enumerate() {
                if matches!(container, Container::Run(..)) {
                    flags |= 1 << i;
                }
            }
            out.write_all(&[flags])?;
        }
        if size < NO_OFFSET_THRESHOLD {
            4 + 4 * size + ((size + 7) / 8)
        } else {
            4 + 8 * size + ((size + 7) / 8)
        }
    } else {
        out.write_all(&SERIAL_COOKIE_NO_RUNCONTAINER.to_le_bytes())?;
        out.write_all(&size.to_le_bytes())?;
        4 + 4 + 4 * size + 4 * size
    };

    // descriptive header: key in the low half, cardinality minus one in the high half
    for (&key, container) in keys.iter().zip(&containers) {
        let count = container.popcount() - 1;
        out.write_all(&((count << 16) | u32::from(key)).to_le_bytes())?;
    }

    // offset header, omitted only for small bitmaps that contain run containers
    if !has_run || size >= NO_OFFSET_THRESHOLD {
        let mut offset = start_offset;
        for container in &containers {
            out.write_all(&offset.to_le_bytes())?;
            offset += match container {
                Container::Bits(_) => 8192u32,
                Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
                Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
            };
        }
    }

    // container payloads, all little-endian
    for container in &containers {
        match container {
            Container::Bits(bits) => {
                for word in bits.iter() {
                    out.write_all(&word.to_le_bytes())?;
                }
            }
            Container::Array(array) => {
                for value in array {
                    out.write_all(&value.to_le_bytes())?;
                }
            }
            Container::Run(runs) => {
                let run_count = u16::try_from(runs.len()).unwrap();
                out.write_all(&run_count.to_le_bytes())?;
                for &(start, lenm1) in runs {
                    out.write_all(&start.to_le_bytes())?;
                    out.write_all(&lenm1.to_le_bytes())?;
                }
            }
        }
    }
    Ok(())
}
792+
574793
pub(crate) fn get_function_type_for_search<'tcx>(
575794
item: &clean::Item,
576795
tcx: TyCtxt<'tcx>,

0 commit comments

Comments
 (0)