Skip to content

Commit e2df785

Browse files
committed
rustdoc-search: tighter encoding for f index
Two optimizations for the function signature search:

* Instead of using JSON arrays, like `[1,20]`, it uses VLQ hex with no commas, like `[aAd]`.
* This also adds backrefs: if you have more than one function with exactly the same signature, it'll not only store it once, it'll *decode* it once, and store in the typeIdMap only once.

Size change
-----------

standard library

```console
$ du -bs search-index-old.js search-index-new.js
4976370 search-index-old.js
4404391 search-index-new.js
```

((4976370-4404391)/4404391)*100% = 12.9%

Benchmarks are similarly shrunk:

```console
$ du -hs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js
10555067 tmp/arti/toolchain_old/doc/search-index.js
8921236 tmp/arti/toolchain_new/doc/search-index.js
77018 tmp/cortex-m/toolchain_old/doc/search-index.js
66676 tmp/cortex-m/toolchain_new/doc/search-index.js
2876330 tmp/sqlx/toolchain_old/doc/search-index.js
2436812 tmp/sqlx/toolchain_new/doc/search-index.js
63632890 tmp/stm32f4/toolchain_old/doc/search-index.js
52337438 tmp/stm32f4/toolchain_new/doc/search-index.js
631150 tmp/ripgrep/toolchain_old/doc/search-index.js
541646 tmp/ripgrep/toolchain_new/doc/search-index.js
```
1 parent 1ab60f2 commit e2df785

File tree

3 files changed

+160
-86
lines changed

3 files changed

+160
-86
lines changed

src/librustdoc/html/render/mod.rs

+87-49
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ use rustc_span::{
5858
symbol::{sym, Symbol},
5959
BytePos, FileName, RealFileName,
6060
};
61-
use serde::ser::{SerializeMap, SerializeSeq};
61+
use serde::ser::SerializeMap;
6262
use serde::{Serialize, Serializer};
6363

6464
use crate::clean::{self, ItemId, RenderedLink, SelfTy};
@@ -123,115 +123,153 @@ pub(crate) struct IndexItem {
123123
}
124124

125125
/// A type used for the search index.
126-
#[derive(Debug)]
126+
#[derive(Debug, Eq, PartialEq)]
127127
pub(crate) struct RenderType {
128128
id: Option<RenderTypeId>,
129129
generics: Option<Vec<RenderType>>,
130130
bindings: Option<Vec<(RenderTypeId, Vec<RenderType>)>>,
131131
}
132132

133-
impl Serialize for RenderType {
134-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
135-
where
136-
S: Serializer,
137-
{
138-
let id = match &self.id {
139-
// 0 is a sentinel, everything else is one-indexed
140-
None => 0,
141-
// concrete type
142-
Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1,
143-
// generic type parameter
144-
Some(RenderTypeId::Index(idx)) => *idx,
145-
_ => panic!("must convert render types to indexes before serializing"),
146-
};
133+
impl RenderType {
134+
pub fn write_to_string(&self, string: &mut String) {
135+
// 0 is a sentinel, everything else is one-indexed
136+
let id = self.id.unwrap_or(RenderTypeId::Index(0));
147137
if self.generics.is_some() || self.bindings.is_some() {
148-
let mut seq = serializer.serialize_seq(None)?;
149-
seq.serialize_element(&id)?;
150-
seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?;
138+
string.push('{');
139+
id.write_to_string(string);
140+
string.push('{');
141+
for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
142+
generic.write_to_string(string);
143+
}
144+
string.push('}');
151145
if self.bindings.is_some() {
152-
seq.serialize_element(
153-
self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(),
154-
)?;
146+
string.push('{');
147+
for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
148+
string.push('{');
149+
binding.0.write_to_string(string);
150+
string.push('{');
151+
for constraint in &binding.1[..] {
152+
constraint.write_to_string(string);
153+
}
154+
string.push('}');
155+
string.push('}');
156+
}
157+
string.push('}');
155158
}
156-
seq.end()
159+
string.push('}');
157160
} else {
158-
id.serialize(serializer)
161+
id.write_to_string(string)
159162
}
160163
}
161164
}
162165

163-
#[derive(Clone, Copy, Debug)]
166+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
164167
pub(crate) enum RenderTypeId {
165168
DefId(DefId),
166169
Primitive(clean::PrimitiveType),
167170
AssociatedType(Symbol),
168171
Index(isize),
169172
}
170173

171-
impl Serialize for RenderTypeId {
172-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
173-
where
174-
S: Serializer,
175-
{
176-
let id = match &self {
174+
impl RenderTypeId {
175+
pub fn write_to_string(&self, string: &mut String) {
176+
// (sign, value)
177+
let (sign, id): (bool, u32) = match &self {
177178
// 0 is a sentinel, everything else is one-indexed
178179
// concrete type
179-
RenderTypeId::Index(idx) if *idx >= 0 => idx + 1,
180+
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
180181
// generic type parameter
181-
RenderTypeId::Index(idx) => *idx,
182+
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
182183
_ => panic!("must convert render types to indexes before serializing"),
183184
};
184-
id.serialize(serializer)
185+
// zig-zag notation
186+
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
187+
// encode
188+
let mut shift: u32 = 28;
189+
let mut mask: u32 = 0xF0_00_00_00;
190+
while shift < 32 {
191+
let hexit = (value & mask) >> shift;
192+
if hexit != 0 || shift == 0 {
193+
let hex = char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
194+
string.push(hex);
195+
}
196+
shift = shift.wrapping_sub(4);
197+
mask = mask >> 4;
198+
}
185199
}
186200
}
187201

188202
/// Full type of functions/methods in the search index.
189-
#[derive(Debug)]
203+
#[derive(Debug, Eq, PartialEq)]
190204
pub(crate) struct IndexItemFunctionType {
191205
inputs: Vec<RenderType>,
192206
output: Vec<RenderType>,
193207
where_clause: Vec<Vec<RenderType>>,
194208
}
195209

196-
impl Serialize for IndexItemFunctionType {
197-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
198-
where
199-
S: Serializer,
200-
{
210+
impl IndexItemFunctionType {
211+
pub fn write_to_string<'a>(
212+
&'a self,
213+
string: &mut String,
214+
backref_queue: &mut VecDeque<&'a IndexItemFunctionType>,
215+
) {
216+
assert!(backref_queue.len() < 16);
201217
// If we couldn't figure out a type, just write `0`.
202218
let has_missing = self
203219
.inputs
204220
.iter()
205221
.chain(self.output.iter())
206222
.any(|i| i.id.is_none() && i.generics.is_none());
207223
if has_missing {
208-
0.serialize(serializer)
224+
string.push('@');
225+
} else if let Some(idx) = backref_queue.iter().position(|other| *other == self) {
226+
string.push(char::try_from('0' as u32 + u32::try_from(idx).unwrap()).expect("last possible value is '?'"));
209227
} else {
210-
let mut seq = serializer.serialize_seq(None)?;
228+
backref_queue.push_front(self);
229+
if backref_queue.len() >= 16 {
230+
backref_queue.pop_back();
231+
}
232+
string.push('{');
211233
match &self.inputs[..] {
212234
[one] if one.generics.is_none() && one.bindings.is_none() => {
213-
seq.serialize_element(one)?
235+
one.write_to_string(string);
236+
}
237+
_ => {
238+
string.push('{');
239+
for item in &self.inputs[..] {
240+
item.write_to_string(string);
241+
}
242+
string.push('}');
214243
}
215-
_ => seq.serialize_element(&self.inputs)?,
216244
}
217245
match &self.output[..] {
218246
[] if self.where_clause.is_empty() => {}
219247
[one] if one.generics.is_none() && one.bindings.is_none() => {
220-
seq.serialize_element(one)?
248+
one.write_to_string(string);
249+
}
250+
_ => {
251+
string.push('{');
252+
for item in &self.output[..] {
253+
item.write_to_string(string);
254+
}
255+
string.push('}');
221256
}
222-
_ => seq.serialize_element(&self.output)?,
223257
}
224258
for constraint in &self.where_clause {
225259
if let [one] = &constraint[..]
226260
&& one.generics.is_none()
227261
&& one.bindings.is_none()
228262
{
229-
seq.serialize_element(one)?;
263+
one.write_to_string(string);
230264
} else {
231-
seq.serialize_element(constraint)?;
265+
string.push('{');
266+
for item in &constraint[..] {
267+
item.write_to_string(string);
268+
}
269+
string.push('}');
232270
}
233271
}
234-
seq.end()
272+
string.push('}');
235273
}
236274
}
237275
}

src/librustdoc/html/render/search_index.rs

+7-22
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::collections::hash_map::Entry;
2-
use std::collections::BTreeMap;
2+
use std::collections::{BTreeMap, VecDeque};
33

44
use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
55
use rustc_middle::ty::TyCtxt;
@@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>(
409409
let mut full_paths = Vec::with_capacity(self.items.len());
410410
let mut descriptions = Vec::with_capacity(self.items.len());
411411
let mut parents = Vec::with_capacity(self.items.len());
412-
let mut functions = Vec::with_capacity(self.items.len());
412+
let mut functions = String::with_capacity(self.items.len());
413413
let mut deprecated = Vec::with_capacity(self.items.len());
414414

415+
let mut backref_queue = VecDeque::new();
416+
415417
for (index, item) in self.items.iter().enumerate() {
416418
let n = item.ty as u8;
417419
let c = char::try_from(n + b'A').expect("item types must fit in ASCII");
@@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>(
434436
full_paths.push((index, &item.path));
435437
}
436438

437-
// Fake option to get `0` out as a sentinel instead of `null`.
438-
// We want to use `0` because it's three less bytes.
439-
enum FunctionOption<'a> {
440-
Function(&'a IndexItemFunctionType),
441-
None,
442-
}
443-
impl<'a> Serialize for FunctionOption<'a> {
444-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
445-
where
446-
S: Serializer,
447-
{
448-
match self {
449-
FunctionOption::None => 0.serialize(serializer),
450-
FunctionOption::Function(ty) => ty.serialize(serializer),
451-
}
452-
}
439+
match &item.search_type {
440+
Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue),
441+
None => functions.push('@'),
453442
}
454-
functions.push(match &item.search_type {
455-
Some(ty) => FunctionOption::Function(ty),
456-
None => FunctionOption::None,
457-
});
458443

459444
if item.deprecation.is_some() {
460445
deprecated.push(index);

src/librustdoc/html/static/js/search.js

+66-15
Original file line numberDiff line numberDiff line change
@@ -2767,19 +2767,65 @@ ${item.displayPath}<span class="${type}">${name}</span>\
27672767
* The raw function search type format is generated by
27682768
* librustdoc/html/render/mod.rs: IndexItemFunctionType::write_to_string
27692769
*
2770-
* @param {RawFunctionSearchType} functionSearchType
2770+
* @param {{
2771+
* string: string,
2772+
* offset: number,
2773+
* backrefQueue: FunctionSearchType[]
2774+
* }} itemFunctionDecoder
27712775
* @param {Array<{name: string, ty: number}>} lowercasePaths
27722776
* @param {Map<string, integer>}
27732777
*
27742778
* @return {null|FunctionSearchType}
27752779
*/
2776-
function buildFunctionSearchType(functionSearchType, lowercasePaths) {
2777-
const INPUTS_DATA = 0;
2778-
const OUTPUT_DATA = 1;
2779-
// `0` is used as a sentinel because it's fewer bytes than `null`
2780-
if (functionSearchType === 0) {
2780+
function buildFunctionSearchType(itemFunctionDecoder, lowercasePaths) {
2781+
const c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2782+
itemFunctionDecoder.offset += 1;
2783+
const [zero, ua, la, ob, cb] = ["0", "@", "`", "{", "}"].map(c => c.charCodeAt(0));
2784+
// `@` is used as a sentinel because it's fewer bytes than `null`, and decodes to zero
2785+
// `0` is a backref
2786+
if (c === ua) {
27812787
return null;
27822788
}
2789+
// sixteen characters after "0" are backref
2790+
if (c >= zero && c < ua) {
2791+
return itemFunctionDecoder.backrefQueue[c - zero];
2792+
}
2793+
if (c !== ob) {
2794+
throw ["Unexpected ", c, " in function: expected ", "{", "; this is a bug"];
2795+
}
2796+
// call after consuming `{`
2797+
function decodeList() {
2798+
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2799+
const ret = [];
2800+
while (c !== cb) {
2801+
ret.push(decode());
2802+
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2803+
}
2804+
itemFunctionDecoder.offset += 1; // eat cb
2805+
return ret;
2806+
}
2807+
// consumes and returns a list or integer
2808+
function decode() {
2809+
let n = 0;
2810+
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2811+
if (c === ob) {
2812+
itemFunctionDecoder.offset += 1;
2813+
return decodeList();
2814+
}
2815+
while (c < la) {
2816+
n = (n << 4) | (c & 0xF);
2817+
itemFunctionDecoder.offset += 1;
2818+
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2819+
}
2820+
// last character >= la
2821+
n = (n << 4) | (c & 0xF);
2822+
const [sign, value] = [n & 1, n >> 1];
2823+
itemFunctionDecoder.offset += 1;
2824+
return sign ? -value : value;
2825+
}
2826+
const functionSearchType = decodeList();
2827+
const INPUTS_DATA = 0;
2828+
const OUTPUT_DATA = 1;
27832829
let inputs, output;
27842830
if (typeof functionSearchType[INPUTS_DATA] === "number") {
27852831
inputs = [buildItemSearchType(functionSearchType[INPUTS_DATA], lowercasePaths)];
@@ -2808,9 +2854,14 @@ ${item.displayPath}<span class="${type}">${name}</span>\
28082854
? [buildItemSearchType(functionSearchType[i], lowercasePaths)]
28092855
: buildItemSearchTypeAll(functionSearchType[i], lowercasePaths));
28102856
}
2811-
return {
2857+
const ret = {
28122858
inputs, output, where_clause,
28132859
};
2860+
itemFunctionDecoder.backrefQueue.unshift(ret);
2861+
if (itemFunctionDecoder.backrefQueue.length >= 16) {
2862+
itemFunctionDecoder.backrefQueue.pop();
2863+
}
2864+
return ret;
28142865
}
28152866

28162867
/**
@@ -2992,8 +3043,12 @@ ${item.displayPath}<span class="${type}">${name}</span>\
29923043
const itemDescs = crateCorpus.d;
29933044
// an array of (Number) the parent path index + 1 to `paths`, or 0 if none
29943045
const itemParentIdxs = crateCorpus.i;
2995-
// an array of (Array | 0) the type of the function, if any
2996-
const itemFunctionSearchTypes = crateCorpus.f;
3046+
// a string representing the list of function types
3047+
const itemFunctionDecoder = {
3048+
string: crateCorpus.f,
3049+
offset: 0,
3050+
backrefQueue: [],
3051+
};
29973052
// an array of (Number) indices for the deprecated items
29983053
const deprecatedItems = new Set(crateCorpus.c);
29993054
// an array of (Number) indices for the deprecated items
@@ -3041,12 +3096,8 @@ ${item.displayPath}<span class="${type}">${name}</span>\
30413096
word = itemNames[i].toLowerCase();
30423097
}
30433098
const path = itemPaths.has(i) ? itemPaths.get(i) : lastPath;
3044-
let type = null;
3045-
if (itemFunctionSearchTypes[i] !== 0) {
3046-
type = buildFunctionSearchType(
3047-
itemFunctionSearchTypes[i],
3048-
lowercasePaths
3049-
);
3099+
const type = buildFunctionSearchType(itemFunctionDecoder, lowercasePaths);
3100+
if (type !== null) {
30503101
if (type) {
30513102
const fp = functionTypeFingerprint.subarray(id * 4, (id + 1) * 4);
30523103
const fps = new Set();

0 commit comments

Comments
 (0)