Skip to content

Commit 5b44bfd

Browse files
committed
rustdoc-search: shard the search result descriptions
The descriptions are, on almost all crates[^1], the majority of the size of the search index, even though they aren't really used for searching. This makes it relatively easy to separate them into their own files. This commit also bumps us to ES8. Out of the browsers we support, all of them support async functions according to caniuse. https://caniuse.com/async-functions [^1]: <https://microsoft.github.io/windows-docs-rs/>, a crate with 44MiB of pure names and no descriptions for them, is an outlier and should not be counted.
1 parent 351890d commit 5b44bfd

File tree

11 files changed

+428
-229
lines changed

11 files changed

+428
-229
lines changed

Diff for: src/ci/docker/host-x86_64/mingw-check/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ ENV SCRIPT python3 ../x.py --stage 2 test src/tools/expand-yaml-anchors && \
5656
/scripts/validate-error-codes.sh && \
5757
reuse --include-submodules lint && \
5858
# Runs checks to ensure that our JS code only uses syntax supported by the targeted ECMAScript version.
59-
es-check es6 ../src/librustdoc/html/static/js/*.js && \
59+
es-check es8 ../src/librustdoc/html/static/js/*.js && \
6060
eslint -c ../src/librustdoc/html/static/.eslintrc.js ../src/librustdoc/html/static/js/*.js && \
6161
eslint -c ../src/tools/rustdoc-js/.eslintrc.js ../src/tools/rustdoc-js/tester.js && \
6262
eslint -c ../src/tools/rustdoc-gui/.eslintrc.js ../src/tools/rustdoc-gui/tester.js

Diff for: src/librustdoc/html/render/mod.rs

+4-29
Original file line numberDiff line numberDiff line change
@@ -184,40 +184,15 @@ pub(crate) enum RenderTypeId {
184184

185185
impl RenderTypeId {
186186
pub fn write_to_string(&self, string: &mut String) {
187-
// (sign, value)
188-
let (sign, id): (bool, u32) = match &self {
187+
let id: i32 = match &self {
189188
// 0 is a sentinel, everything else is one-indexed
190189
// concrete type
191-
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
190+
RenderTypeId::Index(idx) if *idx >= 0 => (idx + 1isize).try_into().unwrap(),
192191
// generic type parameter
193-
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
192+
RenderTypeId::Index(idx) => (*idx).try_into().unwrap(),
194193
_ => panic!("must convert render types to indexes before serializing"),
195194
};
196-
// zig-zag encoding
197-
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
198-
// Self-terminating hex use capital letters for everything but the
199-
// least significant digit, which is lowercase. For example, decimal 17
200-
// would be `` Aa `` if zig-zag encoding weren't used.
201-
//
202-
// Zig-zag encoding, however, stores the sign bit as the last bit.
203-
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
204-
// (`a` is the imaginary -0), and, because all the bits are shifted
205-
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
206-
//
207-
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
208-
// describes the encoding in more detail.
209-
let mut shift: u32 = 28;
210-
let mut mask: u32 = 0xF0_00_00_00;
211-
while shift < 32 {
212-
let hexit = (value & mask) >> shift;
213-
if hexit != 0 || shift == 0 {
214-
let hex =
215-
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
216-
string.push(hex);
217-
}
218-
shift = shift.wrapping_sub(4);
219-
mask = mask >> 4;
220-
}
195+
search_index::write_vlqhex_to_string(id, string);
221196
}
222197
}
223198

Diff for: src/librustdoc/html/render/search_index.rs

+90-11
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,25 @@ use crate::html::format::join_with_double_colon;
1717
use crate::html::markdown::short_markdown_summary;
1818
use crate::html::render::{self, IndexItem, IndexItemFunctionType, RenderType, RenderTypeId};
1919

20+
/// The serialized search index, with the item descriptions split out into shards.
21+
///
22+
/// The `index` is a JSON-encoded list of names and other information.
23+
///
24+
/// The `desc` field holds newline-separated descriptions, split up by size into shards of roughly 1MiB, each paired with its description count.
25+
/// For example, `(4, "foo\nbar\nbaz\nquux")`.
26+
pub(crate) struct SerializedSearchIndex {
27+
pub(crate) index: String,
28+
pub(crate) desc: Vec<(usize, String)>,
29+
}
30+
31+
const DESC_INDEX_SHARD_LEN: usize = 1024 * 1024;
32+
2033
/// Builds the search index from the collected metadata
2134
pub(crate) fn build_index<'tcx>(
2235
krate: &clean::Crate,
2336
cache: &mut Cache,
2437
tcx: TyCtxt<'tcx>,
25-
) -> String {
38+
) -> SerializedSearchIndex {
2639
let mut itemid_to_pathid = FxHashMap::default();
2740
let mut primitives = FxHashMap::default();
2841
let mut associated_types = FxHashMap::default();
@@ -318,7 +331,6 @@ pub(crate) fn build_index<'tcx>(
318331
.collect::<Vec<_>>();
319332

320333
struct CrateData<'a> {
321-
doc: String,
322334
items: Vec<&'a IndexItem>,
323335
paths: Vec<(ItemType, Vec<Symbol>)>,
324336
// The String is alias name and the vec is the list of the elements with this alias.
@@ -327,6 +339,9 @@ pub(crate) fn build_index<'tcx>(
327339
aliases: &'a BTreeMap<String, Vec<usize>>,
328340
// Used when a type has more than one impl with an associated item with the same name.
329341
associated_item_disambiguators: &'a Vec<(usize, String)>,
342+
// A list of shard lengths encoded as vlqhex. See the comment in write_vlqhex_to_string
343+
// for information on the format.
344+
descindex: String,
330345
}
331346

332347
struct Paths {
@@ -408,7 +423,6 @@ pub(crate) fn build_index<'tcx>(
408423
let mut names = Vec::with_capacity(self.items.len());
409424
let mut types = String::with_capacity(self.items.len());
410425
let mut full_paths = Vec::with_capacity(self.items.len());
411-
let mut descriptions = Vec::with_capacity(self.items.len());
412426
let mut parents = Vec::with_capacity(self.items.len());
413427
let mut functions = String::with_capacity(self.items.len());
414428
let mut deprecated = Vec::with_capacity(self.items.len());
@@ -431,7 +445,6 @@ pub(crate) fn build_index<'tcx>(
431445
parents.push(item.parent_idx.map(|x| x + 1).unwrap_or(0));
432446

433447
names.push(item.name.as_str());
434-
descriptions.push(&item.desc);
435448

436449
if !item.path.is_empty() {
437450
full_paths.push((index, &item.path));
@@ -454,14 +467,12 @@ pub(crate) fn build_index<'tcx>(
454467
let has_aliases = !self.aliases.is_empty();
455468
let mut crate_data =
456469
serializer.serialize_struct("CrateData", if has_aliases { 9 } else { 8 })?;
457-
crate_data.serialize_field("doc", &self.doc)?;
458470
crate_data.serialize_field("t", &types)?;
459471
crate_data.serialize_field("n", &names)?;
460-
// Serialize as an array of item indices and full paths
461472
crate_data.serialize_field("q", &full_paths)?;
462-
crate_data.serialize_field("d", &descriptions)?;
463473
crate_data.serialize_field("i", &parents)?;
464474
crate_data.serialize_field("f", &functions)?;
475+
crate_data.serialize_field("D", &self.descindex)?;
465476
crate_data.serialize_field("c", &deprecated)?;
466477
crate_data.serialize_field("p", &paths)?;
467478
crate_data.serialize_field("b", &self.associated_item_disambiguators)?;
@@ -472,24 +483,92 @@ pub(crate) fn build_index<'tcx>(
472483
}
473484
}
474485

475-
// Collect the index into a string
476-
format!(
486+
let desc = {
487+
let mut result = Vec::new();
488+
let mut set = String::new();
489+
let mut len: usize = 0;
490+
for desc in std::iter::once(&crate_doc).chain(crate_items.iter().map(|item| &item.desc)) {
491+
if set.len() >= DESC_INDEX_SHARD_LEN {
492+
result.push((len, std::mem::replace(&mut set, String::new())));
493+
len = 0;
494+
} else if len != 0 {
495+
set.push('\n');
496+
}
497+
set.push_str(&desc);
498+
len += 1;
499+
}
500+
result.push((len, std::mem::replace(&mut set, String::new())));
501+
result
502+
};
503+
504+
let descindex = {
505+
let mut descindex = String::with_capacity(desc.len() * 4);
506+
for &(len, _) in desc.iter() {
507+
write_vlqhex_to_string(len.try_into().unwrap(), &mut descindex);
508+
}
509+
descindex
510+
};
511+
512+
assert_eq!(crate_items.len() + 1, desc.iter().map(|(len, _)| *len).sum::<usize>());
513+
514+
// The index, which is actually used to search, is JSON
515+
// It uses `JSON.parse(..)` to actually load, since JSON
516+
// parses faster than the full JavaScript syntax.
517+
let index = format!(
477518
r#"["{}",{}]"#,
478519
krate.name(tcx),
479520
serde_json::to_string(&CrateData {
480-
doc: crate_doc,
481521
items: crate_items,
482522
paths: crate_paths,
483523
aliases: &aliases,
484524
associated_item_disambiguators: &associated_item_disambiguators,
525+
descindex,
485526
})
486527
.expect("failed serde conversion")
487528
// All these `replace` calls are because we have to go through JS string for JSON content.
488529
.replace('\\', r"\\")
489530
.replace('\'', r"\'")
490531
// We need to escape double quotes for the JSON.
491532
.replace("\\\"", "\\\\\"")
492-
)
533+
);
534+
SerializedSearchIndex { index, desc }
535+
}
536+
537+
pub(crate) fn write_vlqhex_to_string(n: i32, string: &mut String) {
538+
let (sign, magnitude): (bool, u32) =
539+
if n >= 0 { (false, n.try_into().unwrap()) } else { (true, (-n).try_into().unwrap()) };
540+
// zig-zag encoding
541+
let value: u32 = (magnitude << 1) | (if sign { 1 } else { 0 });
542+
// Self-terminating hex uses capital letters for everything but the
543+
// least significant digit, which is lowercase. For example, decimal 17
544+
// would be `` Aa `` if zig-zag encoding weren't used.
545+
//
546+
// Zig-zag encoding, however, stores the sign bit as the last bit.
547+
// This means, in the last hexit, 1 is actually `c`, -1 is `b`
548+
// (`a` is the imaginary -0), and, because all the bits are shifted
549+
// by one, `` A` `` is actually 8 and `` Aa `` is -8.
550+
//
551+
// https://rust-lang.github.io/rustc-dev-guide/rustdoc-internals/search.html
552+
// describes the encoding in more detail.
553+
let mut shift: u32 = 28;
554+
let mut mask: u32 = 0xF0_00_00_00;
555+
// first skip leading zeroes
556+
while shift < 32 {
557+
let hexit = (value & mask) >> shift;
558+
if hexit != 0 || shift == 0 {
559+
break;
560+
}
561+
shift = shift.wrapping_sub(4);
562+
mask = mask >> 4;
563+
}
564+
// now write the rest
565+
while shift < 32 {
566+
let hexit = (value & mask) >> shift;
567+
let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
568+
string.push(hex);
569+
shift = shift.wrapping_sub(4);
570+
mask = mask >> 4;
571+
}
493572
}
494573

495574
pub(crate) fn get_function_type_for_search<'tcx>(

Diff for: src/librustdoc/html/render/write_shared.rs

+29-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use crate::formats::cache::Cache;
2424
use crate::formats::item_type::ItemType;
2525
use crate::formats::Impl;
2626
use crate::html::format::Buffer;
27+
use crate::html::render::search_index::SerializedSearchIndex;
2728
use crate::html::render::{AssocItemLink, ImplRenderingParameters};
2829
use crate::html::{layout, static_files};
2930
use crate::visit::DocVisitor;
@@ -46,7 +47,7 @@ use crate::{try_err, try_none};
4647
pub(super) fn write_shared(
4748
cx: &mut Context<'_>,
4849
krate: &Crate,
49-
search_index: String,
50+
search_index: SerializedSearchIndex,
5051
options: &RenderOptions,
5152
) -> Result<(), Error> {
5253
// Write out the shared files. Note that these are shared among all rustdoc
@@ -312,7 +313,7 @@ pub(super) fn write_shared(
312313
let dst = cx.dst.join(&format!("search-index{}.js", cx.shared.resource_suffix));
313314
let (mut all_indexes, mut krates) =
314315
try_err!(collect_json(&dst, krate.name(cx.tcx()).as_str()), &dst);
315-
all_indexes.push(search_index);
316+
all_indexes.push(search_index.index);
316317
krates.push(krate.name(cx.tcx()).to_string());
317318
krates.sort();
318319

@@ -335,6 +336,32 @@ else if (window.initSearch) window.initSearch(searchIndex);
335336
Ok(v.into_bytes())
336337
})?;
337338

339+
let search_desc_dir = cx.dst.join(format!("search.desc/{krate}", krate = krate.name(cx.tcx())));
340+
if Path::new(&search_desc_dir).exists() {
341+
try_err!(std::fs::remove_dir_all(&search_desc_dir), &search_desc_dir);
342+
}
343+
try_err!(std::fs::create_dir_all(&search_desc_dir), &search_desc_dir);
344+
let kratename = krate.name(cx.tcx()).to_string();
345+
for (i, (_, data)) in search_index.desc.into_iter().enumerate() {
346+
let output_filename = static_files::suffix_path(
347+
&format!("{kratename}-desc-{i}-.js"),
348+
&cx.shared.resource_suffix,
349+
);
350+
let path = search_desc_dir.join(output_filename);
351+
try_err!(
352+
std::fs::write(
353+
&path,
354+
&format!(
355+
r##"searchState.loadedDescShard({kratename}, {i}, {data})"##,
356+
kratename = serde_json::to_string(&kratename).unwrap(),
357+
data = serde_json::to_string(&data).unwrap(),
358+
)
359+
.into_bytes()
360+
),
361+
&path
362+
);
363+
}
364+
338365
write_invocation_specific("crates.js", &|| {
339366
let krates = krates.iter().map(|k| format!("\"{k}\"")).join(",");
340367
Ok(format!("window.ALL_CRATES = [{krates}];").into_bytes())

Diff for: src/librustdoc/html/static/.eslintrc.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ module.exports = {
55
},
66
"extends": "eslint:recommended",
77
"parserOptions": {
8-
"ecmaVersion": 2015,
8+
"ecmaVersion": 8,
99
"sourceType": "module"
1010
},
1111
"rules": {

Diff for: src/librustdoc/html/static/js/main.js

+24-4
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,26 @@ function preLoadCss(cssUrl) {
329329
search.innerHTML = "<h3 class=\"search-loading\">" + searchState.loadingText + "</h3>";
330330
searchState.showResults(search);
331331
},
332+
descShards: new Map(),
333+
loadDesc: async function({descShard, descIndex}) {
334+
if (descShard.promise === null) {
335+
descShard.promise = new Promise((resolve, reject) => {
336+
descShard.resolve = resolve;
337+
const ds = descShard;
338+
const fname = `${ds.crate}-desc-${ds.shard}-`;
339+
const url = resourcePath(
340+
`search.desc/${descShard.crate}/${fname}`,
341+
".js",
342+
);
343+
loadScript(url, reject);
344+
});
345+
}
346+
const list = await descShard.promise;
347+
return list[descIndex];
348+
},
349+
loadedDescShard: function (crate, shard, data) {
350+
this.descShards.get(crate)[shard].resolve(data.split("\n"));
351+
},
332352
};
333353

334354
const toggleAllDocsId = "toggle-all-docs";
@@ -381,7 +401,7 @@ function preLoadCss(cssUrl) {
381401
window.location.replace("#" + item.id);
382402
}, 0);
383403
}
384-
}
404+
},
385405
);
386406
}
387407
}
@@ -585,7 +605,7 @@ function preLoadCss(cssUrl) {
585605
const script = document
586606
.querySelector("script[data-ignore-extern-crates]");
587607
const ignoreExternCrates = new Set(
588-
(script ? script.getAttribute("data-ignore-extern-crates") : "").split(",")
608+
(script ? script.getAttribute("data-ignore-extern-crates") : "").split(","),
589609
);
590610
for (const lib of libs) {
591611
if (lib === window.currentCrate || ignoreExternCrates.has(lib)) {
@@ -1098,7 +1118,7 @@ function preLoadCss(cssUrl) {
10981118
} else {
10991119
wrapper.style.setProperty(
11001120
"--popover-arrow-offset",
1101-
(wrapperPos.right - pos.right + 4) + "px"
1121+
(wrapperPos.right - pos.right + 4) + "px",
11021122
);
11031123
}
11041124
wrapper.style.visibility = "";
@@ -1680,7 +1700,7 @@ href="https://doc.rust-lang.org/${channel}/rustdoc/read-documentation/search.htm
16801700
pendingSidebarResizingFrame = false;
16811701
document.documentElement.style.setProperty(
16821702
"--resizing-sidebar-width",
1683-
desiredSidebarSize + "px"
1703+
desiredSidebarSize + "px",
16841704
);
16851705
}, 100);
16861706
}

0 commit comments

Comments
 (0)