Skip to content

Commit c65f7d8

Browse files
committed
rustdoc-search: address nits
1 parent 28db4cc commit c65f7d8

File tree

5 files changed

+297
-265
lines changed

5 files changed

+297
-265
lines changed

Diff for: src/librustdoc/html/render/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ impl RenderTypeId {
192192
RenderTypeId::Index(idx) => (*idx).try_into().unwrap(),
193193
_ => panic!("must convert render types to indexes before serializing"),
194194
};
195-
search_index::write_vlqhex_to_string(id, string);
195+
search_index::encode::write_vlqhex_to_string(id, string);
196196
}
197197
}
198198

Diff for: src/librustdoc/html/render/search_index.rs

+42-259
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
pub(crate) mod encode;
2+
13
use std::collections::hash_map::Entry;
24
use std::collections::{BTreeMap, VecDeque};
35

4-
use base64::prelude::*;
56
use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
67
use rustc_middle::ty::TyCtxt;
78
use rustc_span::def_id::DefId;
@@ -18,12 +19,33 @@ use crate::html::format::join_with_double_colon;
1819
use crate::html::markdown::short_markdown_summary;
1920
use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, RenderTypeId};
2021

22+
use encode::{bitmap_to_string, write_vlqhex_to_string};
23+
2124
/// The serialized search description sharded version
2225
///
2326
/// The `index` is a JSON-encoded list of names and other information.
2427
///
2528
/// The desc has newlined descriptions, split up by size into 128KiB shards.
2629
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
30+
///
31+
/// There is no single, optimal size for these shards, because it depends on
32+
/// configuration values that we can't predict or control, such as the version
33+
/// of HTTP used (HTTP/1.1 would work better with larger files, while HTTP/2
34+
/// and 3 are more agnostic), transport compression (gzip, zstd, etc), whether
35+
/// the search query is going to produce a large number of results or a small
36+
/// number, the bandwidth delay product of the network...
37+
///
38+
/// Gzipping some standard library descriptions to guess what transport
39+
/// compression will do, the compressed file sizes can be as small as 4.9KiB
40+
/// or as large as 18KiB (ignoring the final 1.9KiB shard of leftovers).
41+
/// A "reasonable" range for files is between 1KiB, which is about the
42+
/// amount of data that can be transferred in a single TCP packet ([1]),
43+
/// and 64KiB, the maximum amount of data that TCP can transfer in a
44+
/// single round trip without extensions ([2], [3]).
45+
///
46+
/// [1]: https://en.wikipedia.org/wiki/Maximum_transmission_unit#MTUs_for_common_media
47+
/// [2]: https://en.wikipedia.org/wiki/Sliding_window_protocol#Basic_concept
48+
/// [3]: https://learn.microsoft.com/en-us/troubleshoot/windows-server/networking/description-tcp-features
2749
pub(crate) struct SerializedSearchIndex {
2850
pub(crate) index: String,
2951
pub(crate) desc: Vec<(usize, String)>,
@@ -342,9 +364,9 @@ pub(crate) fn build_index<'tcx>(
342364
associated_item_disambiguators: &'a Vec<(usize, String)>,
343365
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
344366
// for information on the format.
345-
descindex: String,
367+
desc_index: String,
346368
// A list of items with no description. This is eventually turned into a bitmap.
347-
emptydesc: Vec<u32>,
369+
empty_desc: Vec<u32>,
348370
}
349371

350372
struct Paths {
@@ -476,36 +498,28 @@ pub(crate) fn build_index<'tcx>(
476498
crate_data.serialize_field("q", &full_paths)?;
477499
crate_data.serialize_field("i", &parents)?;
478500
crate_data.serialize_field("f", &functions)?;
479-
crate_data.serialize_field("D", &self.descindex)?;
501+
crate_data.serialize_field("D", &self.desc_index)?;
480502
crate_data.serialize_field("p", &paths)?;
481503
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
482-
let mut buf = Vec::new();
483-
let mut strbuf = String::new();
484-
write_bitmap_to_bytes(&deprecated, &mut buf).unwrap();
485-
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
486-
crate_data.serialize_field("c", &strbuf)?;
487-
strbuf.clear();
488-
buf.clear();
489-
write_bitmap_to_bytes(&self.emptydesc, &mut buf).unwrap();
490-
BASE64_STANDARD.encode_string(&buf, &mut strbuf);
491-
crate_data.serialize_field("e", &strbuf)?;
504+
crate_data.serialize_field("c", &bitmap_to_string(&deprecated))?;
505+
crate_data.serialize_field("e", &bitmap_to_string(&self.empty_desc))?;
492506
if has_aliases {
493507
crate_data.serialize_field("a", &self.aliases)?;
494508
}
495509
crate_data.end()
496510
}
497511
}
498512

499-
let (emptydesc, desc) = {
500-
let mut emptydesc = Vec::new();
513+
let (empty_desc, desc) = {
514+
let mut empty_desc = Vec::new();
501515
let mut result = Vec::new();
502516
let mut set = String::new();
503517
let mut len: usize = 0;
504-
let mut itemindex: u32 = 0;
518+
let mut item_index: u32 = 0;
505519
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
506520
if desc == "" {
507-
emptydesc.push(itemindex);
508-
itemindex += 1;
521+
empty_desc.push(item_index);
522+
item_index += 1;
509523
continue;
510524
}
511525
if set.len() >= DESC_INDEX_SHARD_LEN {
@@ -516,23 +530,23 @@ pub(crate) fn build_index<'tcx>(
516530
}
517531
set.push_str(&desc);
518532
len += 1;
519-
itemindex += 1;
533+
item_index += 1;
520534
}
521535
result.push((len, std::mem::replace(&mut set, String::new())));
522-
(emptydesc, result)
536+
(empty_desc, result)
523537
};
524538

525-
let descindex = {
526-
let mut descindex = String::with_capacity(desc.len() * 4);
539+
let desc_index = {
540+
let mut desc_index = String::with_capacity(desc.len() * 4);
527541
for &(len, _) in desc.iter() {
528-
write_vlqhex_to_string(len.try_into().unwrap(), &mut descindex);
542+
write_vlqhex_to_string(len.try_into().unwrap(), &mut desc_index);
529543
}
530-
descindex
544+
desc_index
531545
};
532546

533547
assert_eq!(
534548
crate_items.len() + 1,
535-
desc.iter().map(|(len, _)| *len).sum::<usize>() + emptydesc.len()
549+
desc.iter().map(|(len, _)| *len).sum::<usize>() + empty_desc.len()
536550
);
537551

538552
// The index, which is actually used to search, is JSON
@@ -546,8 +560,8 @@ pub(crate) fn build_index<'tcx>(
546560
paths: crate_paths,
547561
aliases: &aliases,
548562
associated_item_disambiguators: &associated_item_disambiguators,
549-
descindex,
550-
emptydesc,
563+
desc_index,
564+
empty_desc,
551565
})
552566
.expect("failed serde conversion")
553567
// All these `replace` calls are because we have to go through JS string for JSON content.
@@ -559,237 +573,6 @@ pub(crate) fn build_index<'tcx>(
559573
SerializedSearchIndex { index, desc }
560574
}
561575

562-
pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
563-
let (sign, magnitude): (bool, u32) =
564-
if n >= 0 { (false, n.try_into().unwrap()) } else { (true, (-n).try_into().unwrap()) };
565-
// zig-zag encoding
566-
let value: u32 = (magnitude << 1) | (if sign { 1 } else { 0 });
567-
// Self-terminating hex uses capital letters for everything but the
568-
// least significant digit, which is lowercase. For example, decimal 17
569-
// would be `` Aa `` if zig-zag encoding weren't used.
570-
//
571-
// Zig-zag encoding, however, stores the sign bit as the last bit.
572-
// This means, in the last hexit, 1 is actually `b`, -1 is `c`
573-
// (`a` is the imaginary -0), and, because all the bits are shifted
574-
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
575-
//
576-
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
577-
// describes the encoding in more detail.
578-
let mut shift: u32 = 28;
579-
let mut mask: u32 = 0xF0_00_00_00;
580-
// first skip leading zeroes
581-
while shift < 32 {
582-
let hexit = (value & mask) >> shift;
583-
if hexit != 0 || shift == 0 {
584-
break;
585-
}
586-
shift = shift.wrapping_sub(4);
587-
mask = mask >> 4;
588-
}
589-
// now write the rest
590-
while shift < 32 {
591-
let hexit = (value & mask) >> shift;
592-
let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
593-
string.push(hex);
594-
shift = shift.wrapping_sub(4);
595-
mask = mask >> 4;
596-
}
597-
}
598-
599-
// checked against roaring-rs in
600-
// https://gitlab.com/notriddle/roaring-test
601-
pub fn write_bitmap_to_bytes(domain: &[u32], mut out: impl std::io::Write) -> std::io::Result<()> {
602-
// https://arxiv.org/pdf/1603.06549.pdf
603-
let mut keys = Vec::<u16>::new();
604-
let mut containers = Vec::<Container>::new();
605-
enum Container {
606-
/// bitset of 65536 bits, stored as 1024 64-bit words
607-
Bits(Box<[u64; 1024]>),
608-
/// list of entries
609-
Array(Vec<u16>),
610-
/// list of (start, len-1)
611-
Run(Vec<(u16, u16)>),
612-
}
613-
impl Container {
614-
fn popcount(&self) -> u32 {
615-
match self {
616-
Container::Bits(bits) => bits.iter().copied().map(|x| x.count_ones()).sum(),
617-
Container::Array(array) => {
618-
array.len().try_into().expect("array can't be bigger than 2**32")
619-
}
620-
Container::Run(runs) => {
621-
runs.iter().copied().map(|(_, lenm1)| u32::from(lenm1) + 1).sum()
622-
}
623-
}
624-
}
625-
fn push(&mut self, value: u16) {
626-
match self {
627-
Container::Bits(bits) => bits[value as usize >> 6] |= 1 << (value & 0x3F),
628-
Container::Array(array) => {
629-
array.push(value);
630-
if array.len() >= 4096 {
631-
let array = std::mem::replace(array, Vec::new());
632-
*self = Container::Bits(Box::new([0; 1024]));
633-
for value in array {
634-
self.push(value);
635-
}
636-
}
637-
}
638-
Container::Run(runs) => {
639-
if let Some(r) = runs.last_mut()
640-
&& r.0 + r.1 + 1 == value
641-
{
642-
r.1 += 1;
643-
} else {
644-
runs.push((value, 0));
645-
}
646-
}
647-
}
648-
}
649-
fn try_make_run(&mut self) -> bool {
650-
match self {
651-
Container::Bits(bits) => {
652-
let mut r: u64 = 0;
653-
for (i, chunk) in bits.iter().copied().enumerate() {
654-
let next_chunk =
655-
i.checked_add(1).and_then(|i| bits.get(i)).copied().unwrap_or(0);
656-
r += !chunk & u64::from((chunk << 1).count_ones());
657-
r += !next_chunk & u64::from((chunk >> 63).count_ones());
658-
}
659-
if (2 + 4 * r) < 8192 {
660-
let bits = std::mem::replace(bits, Box::new([0; 1024]));
661-
*self = Container::Run(Vec::new());
662-
for (i, bits) in bits.iter().copied().enumerate() {
663-
if bits == 0 {
664-
continue;
665-
}
666-
for j in 0..64 {
667-
let value = (u16::try_from(i).unwrap() << 6) | j;
668-
if bits & (1 << j) != 0 {
669-
self.push(value);
670-
}
671-
}
672-
}
673-
true
674-
} else {
675-
false
676-
}
677-
}
678-
Container::Array(array) if array.len() <= 5 => false,
679-
Container::Array(array) => {
680-
let mut r = 0;
681-
let mut prev = None;
682-
for value in array.iter().copied() {
683-
if value.checked_sub(1) != prev {
684-
r += 1;
685-
}
686-
prev = Some(value);
687-
}
688-
if 2 + 4 * r < 2 * array.len() + 2 {
689-
let array = std::mem::replace(array, Vec::new());
690-
*self = Container::Run(Vec::new());
691-
for value in array {
692-
self.push(value);
693-
}
694-
true
695-
} else {
696-
false
697-
}
698-
}
699-
Container::Run(_) => true,
700-
}
701-
}
702-
}
703-
let mut key: u16;
704-
let mut domain_iter = domain.into_iter().copied().peekable();
705-
let mut has_run = false;
706-
while let Some(entry) = domain_iter.next() {
707-
key = (entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
708-
let value: u16 = (entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit");
709-
let mut container = Container::Array(vec![value]);
710-
while let Some(entry) = domain_iter.peek().copied() {
711-
let entry_key: u16 =
712-
(entry >> 16).try_into().expect("shifted off the top 16 bits, so it should fit");
713-
if entry_key != key {
714-
break;
715-
}
716-
domain_iter.next().expect("peeking just succeeded");
717-
container
718-
.push((entry & 0x00_00_FF_FF).try_into().expect("AND 16 bits, so it should fit"));
719-
}
720-
keys.push(key);
721-
has_run = container.try_make_run() || has_run;
722-
containers.push(container);
723-
}
724-
// https://github.com/RoaringBitmap/RoaringFormatSpec
725-
use byteorder::{WriteBytesExt, LE};
726-
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
727-
const SERIAL_COOKIE: u32 = 12347;
728-
const NO_OFFSET_THRESHOLD: u32 = 4;
729-
let size: u32 = containers.len().try_into().unwrap();
730-
let start_offset = if has_run {
731-
out.write_u32::<LE>(SERIAL_COOKIE | ((size - 1) << 16))?;
732-
for set in containers.chunks(8) {
733-
let mut b = 0;
734-
for (i, container) in set.iter().enumerate() {
735-
if matches!(container, &Container::Run(..)) {
736-
b |= 1 << i;
737-
}
738-
}
739-
out.write_u8(b)?;
740-
}
741-
if size < NO_OFFSET_THRESHOLD {
742-
4 + 4 * size + ((size + 7) / 8)
743-
} else {
744-
4 + 8 * size + ((size + 7) / 8)
745-
}
746-
} else {
747-
out.write_u32::<LE>(SERIAL_COOKIE_NO_RUNCONTAINER)?;
748-
out.write_u32::<LE>(containers.len().try_into().unwrap())?;
749-
4 + 4 + 4 * size + 4 * size
750-
};
751-
for (&key, container) in keys.iter().zip(&containers) {
752-
// descriptive header
753-
let key: u32 = key.into();
754-
let count: u32 = container.popcount() - 1;
755-
out.write_u32::<LE>((count << 16) | key)?;
756-
}
757-
if !has_run || size >= NO_OFFSET_THRESHOLD {
758-
// offset header
759-
let mut starting_offset = start_offset;
760-
for container in &containers {
761-
out.write_u32::<LE>(starting_offset)?;
762-
starting_offset += match container {
763-
Container::Bits(_) => 8192u32,
764-
Container::Array(array) => u32::try_from(array.len()).unwrap() * 2,
765-
Container::Run(runs) => 2 + u32::try_from(runs.len()).unwrap() * 4,
766-
};
767-
}
768-
}
769-
for container in &containers {
770-
match container {
771-
Container::Bits(bits) => {
772-
for chunk in bits.iter() {
773-
out.write_u64::<LE>(*chunk)?;
774-
}
775-
}
776-
Container::Array(array) => {
777-
for value in array.iter() {
778-
out.write_u16::<LE>(*value)?;
779-
}
780-
}
781-
Container::Run(runs) => {
782-
out.write_u16::<LE>((runs.len()).try_into().unwrap())?;
783-
for (start, lenm1) in runs.iter().copied() {
784-
out.write_u16::<LE>(start)?;
785-
out.write_u16::<LE>(lenm1)?;
786-
}
787-
}
788-
}
789-
}
790-
Ok(())
791-
}
792-
793576
pub(crate) fn get_function_type_for_search<'tcx>(
794577
item: &clean::Item,
795578
tcx: TyCtxt<'tcx>,

0 commit comments

Comments
 (0)