diff --git a/compiler/rustc_macros/src/symbols.rs b/compiler/rustc_macros/src/symbols.rs index 791a19659117e..052960493afd0 100644 --- a/compiler/rustc_macros/src/symbols.rs +++ b/compiler/rustc_macros/src/symbols.rs @@ -102,15 +102,25 @@ impl Parse for Input { } } +/// WARNING: this function must behave equivalently to +/// `Symbol::try_new_inlined()`. It does, modulo the fact that it accepts fewer +/// inputs, panicking on any string containing non-ASCII or NUL bytes. This is +/// fine because static symbols never contain such bytes. Once those bytes are +/// excluded, it reduces to a mere length check. +fn is_inlinable(s: &str) -> bool { + assert!(s.as_bytes().iter().all(|&b| 0 < b && b < 0x80)); + s.len() <= 4 +} + pub fn symbols(input: TokenStream) -> TokenStream { let input = parse_macro_input!(input as Input); let mut keyword_stream = quote! {}; let mut symbols_stream = quote! {}; let mut digits_stream = quote! {}; - let mut prefill_stream = quote! {}; + let mut prefill_tabled_stream = quote! {}; + let mut tabled_counter = 0u32; let mut keyword_class_stream = quote! {}; - let mut counter = 0u32; let mut keys = HashSet::::new(); let mut prev_key: Option = None; let mut errors = Vec::::new(); @@ -136,18 +146,26 @@ pub fn symbols(input: TokenStream) -> TokenStream { for keyword in keywords { let name = &keyword.name; let value = &keyword.value; - check_dup(&value.value(), &mut errors); - prefill_stream.extend(quote! { - #value, - }); - keyword_stream.extend(quote! { - #[allow(non_upper_case_globals)] - pub const #name: Symbol = Symbol::new(#counter); - }); + let v = value.value(); + check_dup(&v, &mut errors); + if is_inlinable(&v) { + keyword_stream.extend(quote! { + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_inlined(#value); + }); + } else { + prefill_tabled_stream.extend(quote! { + #value, + }); + keyword_stream.extend(quote! 
{ + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_tabled(#tabled_counter); + }); + tabled_counter += 1; + } class_stream.extend(quote! { | kw::#name }); - counter += 1; } if let Some(class) = class { keyword_class_stream.extend(quote! { @@ -170,28 +188,32 @@ pub fn symbols(input: TokenStream) -> TokenStream { }; check_dup(&value, &mut errors); check_order(&name.to_string(), &mut errors); - prefill_stream.extend(quote! { - #value, - }); - symbols_stream.extend(quote! { - #[allow(rustc::default_hash_types)] - #[allow(non_upper_case_globals)] - pub const #name: Symbol = Symbol::new(#counter); - }); - counter += 1; + if is_inlinable(&value) { + symbols_stream.extend(quote! { + #[allow(rustc::default_hash_types)] + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_inlined(#value); + }); + } else { + prefill_tabled_stream.extend(quote! { + #value, + }); + symbols_stream.extend(quote! { + #[allow(rustc::default_hash_types)] + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_tabled(#tabled_counter); + }); + tabled_counter += 1; + } } // Generate symbols for the strings "0", "1", ..., "9". for n in 0..10 { let n = n.to_string(); check_dup(&n, &mut errors); - prefill_stream.extend(quote! { - #n, - }); digits_stream.extend(quote! 
{ - Symbol::new(#counter), + Symbol::new_inlined(#n), }); - counter += 1; } if !errors.is_empty() { @@ -221,8 +243,8 @@ pub fn symbols(input: TokenStream) -> TokenStream { impl Interner { pub fn fresh() -> Self { - Interner::prefill(&[ - #prefill_stream + Interner::prefill_tabled(&[ + #prefill_tabled_stream ]) } } diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index d522e730c1539..47f8de8c4622b 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -11,6 +11,7 @@ use rustc_serialize::{Decodable, Decoder, Encodable, Encoder}; use std::cmp::{Ord, PartialEq, PartialOrd}; use std::fmt; use std::hash::{Hash, Hasher}; +use std::ops::Deref; use std::str; use crate::{Span, DUMMY_SP, SESSION_GLOBALS}; @@ -1377,13 +1378,69 @@ impl fmt::Display for MacroRulesNormalizedIdent { /// An interned string. /// -/// Internally, a `Symbol` is implemented as an index, and all operations -/// (including hashing, equality, and ordering) operate on that index. The use -/// of `rustc_index::newtype_index!` means that `Option` only takes up 4 bytes, -/// because `rustc_index::newtype_index!` reserves the last 256 values for tagging purposes. +/// Internally, a `Symbol` is implemented as a u32, and all operations +/// (including hashing, equality, and ordering) operate on that u32. The use of +/// `rustc_index::newtype_index!` means that `Option` only takes up 4 +/// bytes, because `rustc_index::newtype_index!` reserves the last 256 values +/// for tagging purposes. /// -/// Note that `Symbol` cannot directly be a `rustc_index::newtype_index!` because it -/// implements `fmt::Debug`, `Encodable`, and `Decodable` in special ways. +/// Note that `Symbol` cannot directly be a `rustc_index::newtype_index!` +/// because it implements `fmt::Debug`, `Encodable`, and `Decodable` in special +/// ways. +/// +/// For the interner in `SESSION_GLOBALS`, we use a two-part encoding. 
Strings +/// of length 4 or less (with some exceptions due to two encoding constraints +/// described below) are "inlined", i.e. stored within the u32 itself. Other +/// strings are "tabled", i.e. stored in the interner's table and the u32 is an +/// index. This helps performance because strings of length 4 or less are common +/// (roughly 50% of cases in practice) and those ones avoid the need to lock +/// and search the hash table. +/// +/// The details of the encoding are as follows. +/// +/// - The highest bit of the u32 is a tag bit. +/// +/// - If the tag bit is 0, the symbol is inlined. The string bytes +/// (`s`) are encoded into the u32 (`u`) in a little-endian fashion: +/// - Byte 0 of `u` holds `s[0]`, or zero if `s.len()` is < 1. +/// - Byte 1 of `u` holds `s[1]`, or zero if `s.len()` is < 2. +/// - Byte 2 of `u` holds `s[2]`, or zero if `s.len()` is < 3. +/// - Byte 3 of `u` holds `s[3]`, or zero if `s.len()` is < 4. +/// +/// Some examples: +/// - "" -> 0x00_00_00_00 +/// - "a" -> 0x00_00_00_61 +/// - "ab" -> 0x00_00_62_61 +/// - "abc" -> 0x00_63_62_61 +/// - "abcd" -> 0x64_63_62_61 +/// +/// Because byte 3 contains the tag bit, for a string of length four to be +/// inlined its final byte must be < 0x80. For example, "cdé" cannot be +/// inlined because in UTF-8 it is a four byte sequence `[0x63, 0x64, 0xc3, +/// 0xa9]` and byte 3 is > 0x7f. (This is the first of the abovementioned +/// encoding constraints.) +/// +/// The length of the string is not explicitly encoded. Rather, it is one +/// more than the index of the most-significant non-zero byte, or zero if all +/// bytes are zero. (For example, in 0x00_63_62_61 the most-significant non-zero +/// byte is byte 2, and so the length is 3.) Because of this, a string whose final +/// char is a NUL char cannot be represented inline. For example, none of +/// "\0", "a\0", "ab\0", and "abc\0" can be inlined. (This is the second of +/// the abovementioned encoding constraints.) 
+/// - If the tag bit is 1, the symbol is tabled. The high bit must be removed +/// before being used as an index. For example, the tabled symbol with index 5 will +/// have the value 0x80_00_00_05. +/// +/// The maximum value of a `newtype_index` is 0xff_ff_ff_00, due to the highest +/// 256 values being reserved. Therefore, the maximum index representable is +/// 0x7f_ff_ff_00 (2,147,483,392). Given that the maximum number of bytes in +/// a crate is 0xff_ff_ff_ff, this should be more than enough for tabled +/// symbols. +/// +/// For interners other than the one in `SESSION_GLOBALS` (which are rare and +/// not that performance-critical) all symbols are tabled, i.e. the u32 is an +/// index into the interner's table with its high bit set. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Symbol(SymbolIndex); @@ -1391,30 +1448,51 @@ rustc_index::newtype_index! { pub struct SymbolIndex { .. } } +/// These `Symbol` methods work with just the `Symbol` itself, and do not +/// involve `SESSION_GLOBALS`. impl Symbol { - const fn new(n: u32) -> Self { - Symbol(SymbolIndex::from_u32(n)) + /// Try to create an inlined symbol from a string. + /// WARNING: this function must behave equivalently to `is_inlinable()` in + /// `compiler/rustc_macros/src/symbols.rs`. + const fn try_new_inlined(s: &str) -> Option { + let len = s.len(); + let s = s.as_bytes(); + let n = if len == 4 && s[3] != 0 && s[3] < 0x80 { + s[0] as u32 | ((s[1] as u32) << 8) | ((s[2] as u32) << 16) | ((s[3] as u32) << 24) + } else if len == 3 && s[2] != 0 { + s[0] as u32 | ((s[1] as u32) << 8) | ((s[2] as u32) << 16) + } else if len == 2 && s[1] != 0 { + s[0] as u32 | ((s[1] as u32) << 8) + } else if len == 1 && s[0] != 0 { + s[0] as u32 + } else if len == 0 { + 0u32 + } else { + return None; + }; + Some(Symbol(SymbolIndex::from_u32(n))) } - /// Maps a string to its interned representation. 
- pub fn intern(string: &str) -> Self { - with_interner(|interner| interner.intern(string)) + /// Create an inlined symbol from a string. Panic if that's not possible. + const fn new_inlined(s: &str) -> Self { + match Symbol::try_new_inlined(s) { + Some(s) => s, + None => panic!("non-inlinable string"), + } } - /// Access the symbol's chars. This is a slowish operation because it - /// requires locking the symbol interner. - pub fn with R, R>(self, f: F) -> R { - with_interner(|interner| f(interner.get(self))) + /// Create a tabled symbol from an interner index. + const fn new_tabled(index: u32) -> Self { + Symbol(SymbolIndex::from_u32(0x80000000 | index)) } - /// Convert to a `SymbolStr`. This is a slowish operation because it - /// requires locking the symbol interner. - pub fn as_str(self) -> SymbolStr { - with_interner(|interner| unsafe { - SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get(self)) } - }) + /// Convert a tabled symbol to an index for the interner. + fn as_tabled_index(self) -> usize { + debug_assert_ne!(self.0.as_u32() & 0x80000000, 0); + (self.0.as_u32() & !0x80000000) as usize } + /// Extract the inner u32 value. pub fn as_u32(self) -> u32 { self.0.as_u32() } @@ -1428,6 +1506,51 @@ impl Symbol { } } +// These `Symbol` methods are for accessing symbols via `SESSION_GLOBALS`. +impl Symbol { + /// Map a string to its symbol representation, using the interner's table + /// from `SESSION_GLOBALS` if necessary (a relatively slow operation). + pub fn intern(string: &str) -> Self { + if let Some(sym) = Symbol::try_new_inlined(string) { + sym + } else { + with_interner(|interner| interner.intern(string)) + } + } + + /// Access the symbol's chars, using the interner's table from + /// `SESSION_GLOBALS` if necessary (a relatively slow operation). 
+ pub fn with R, R>(self, f: F) -> R { + f(self.as_str().deref()) + } + + /// Convert to a `SymbolStr`, using the interner's table from + /// `SESSION_GLOBALS` if necessary (a relatively slow operation). + pub fn as_str(self) -> SymbolStr { + if self.0.as_u32() & 0x80000000 == 0 { + // The high bit is clear, it's an inlined symbol. + let bytes = self.0.as_u32().to_le_bytes(); + let len = if bytes[3] != 0 { + 4 + } else if bytes[2] != 0 { + 3 + } else if bytes[1] != 0 { + 2 + } else if bytes[0] != 0 { + 1 + } else { + 0 + }; + SymbolStr::Inlined { bytes, len } + } else { + // The high bit is set, it's a tabled symbol. + with_interner(|interner| unsafe { + SymbolStr::Tabled { string: std::mem::transmute::<&str, &str>(interner.get(self)) } + }) + } + } +} + impl fmt::Debug for Symbol { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.with(|str| fmt::Debug::fmt(&str, f)) @@ -1469,6 +1592,9 @@ impl ToStableHashKey for Symbol { } } +/// A string interner. The interner in `SESSION_GLOBALS` can (and should) be +/// accessed via `Symbol` methods. Non-global interners can use `Interner` +/// methods directly. // The `&'static str`s in this type actually point into the arena. // // The `FxHashMap`+`Vec` pair could be replaced by `FxIndexSet`, but #75278 @@ -1482,24 +1608,33 @@ pub struct Interner { } impl Interner { - fn prefill(init: &[&'static str]) -> Self { + /// Prefill the interner table with static symbols. This should only be + /// done for the `SESSION_GLOBALS` interner. + fn prefill_tabled(init: &[&'static str]) -> Self { Interner { strings: init.into(), - names: init.iter().copied().zip((0..).map(Symbol::new)).collect(), + names: init + .iter() + .inspect(|s| assert_eq!(Symbol::try_new_inlined(s), None)) + .copied() + .zip((0..).map(Symbol::new_tabled)) + .collect(), ..Default::default() } } - #[inline] + /// Map a string to its symbol representation, using this interner's table + /// if necessary. 
For the interner in `SESSION_GLOBALS`, `Symbol::intern()` + /// should be used in preference to this function. pub fn intern(&mut self, string: &str) -> Symbol { if let Some(&name) = self.names.get(string) { return name; } - let name = Symbol::new(self.strings.len() as u32); + let name = Symbol::new_tabled(self.strings.len() as u32); - // `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be - // UTF-8. + // SAFETY: `from_utf8_unchecked` is safe because the input is a copy of + // `string` which is a `&str` and therefore must be UTF-8. let string: &str = unsafe { str::from_utf8_unchecked(self.arena.alloc_slice(string.as_bytes())) }; // It is safe to extend the arena allocation to `'static` because we only access @@ -1510,10 +1645,11 @@ impl Interner { name } - // Get the symbol as a string. `Symbol::as_str()` should be used in - // preference to this function. + // Get the symbol as a string. For the interner in `SESSION_GLOBALS`, + // `Symbol::as_str()` or `Symbol::with()` should be used in preference to + // this function. pub fn get(&self, symbol: Symbol) -> &str { - self.strings[symbol.0.as_usize()] + self.strings[symbol.as_tabled_index()] } } @@ -1630,15 +1766,20 @@ fn with_interner T>(f: F) -> T { // FIXME: ensure that the interner outlives any thread which uses `SymbolStr`, // by creating a new thread right after constructing the interner. #[derive(Clone, Eq, PartialOrd, Ord)] -pub struct SymbolStr { - string: &'static str, +pub enum SymbolStr { + /// For an inlined symbol this type needs its own bytes, because the + /// original bytes are embedded in the u32 within the `Symbol`. + Inlined { bytes: [u8; 4], len: usize }, + + /// For a tabled symbol a `&str` suffices. + Tabled { string: &'static str }, } // This impl allows a `SymbolStr` to be directly equated with a `String` or // `&str`. 
impl> std::cmp::PartialEq for SymbolStr { fn eq(&self, other: &T) -> bool { - self.string == other.deref() + self.deref() == other.deref() } } @@ -1654,26 +1795,32 @@ impl std::ops::Deref for SymbolStr { type Target = str; #[inline] fn deref(&self) -> &str { - self.string + match self { + // SAFETY: the bytes originally came from a `&str`. + SymbolStr::Inlined { bytes, len } => unsafe { + std::str::from_utf8_unchecked(&bytes[0..*len]) + }, + SymbolStr::Tabled { string } => string, + } } } impl fmt::Debug for SymbolStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(self.string, f) + fmt::Debug::fmt(self.deref(), f) } } impl fmt::Display for SymbolStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self.string, f) + fmt::Display::fmt(self.deref(), f) } } impl HashStable for SymbolStr { #[inline] fn hash_stable(&self, hcx: &mut CTX, hasher: &mut StableHasher) { - self.string.hash_stable(hcx, hasher) + self.deref().hash_stable(hcx, hasher) } } diff --git a/compiler/rustc_span/src/symbol/tests.rs b/compiler/rustc_span/src/symbol/tests.rs index 47da03424b770..a0cd9da6d3493 100644 --- a/compiler/rustc_span/src/symbol/tests.rs +++ b/compiler/rustc_span/src/symbol/tests.rs @@ -3,23 +3,99 @@ use super::*; use crate::{edition, SessionGlobals}; #[test] -fn interner_tests() { - let mut i: Interner = Interner::default(); - // first one is zero: - assert_eq!(i.intern("dog"), Symbol::new(0)); - // re-use gets the same entry: - assert_eq!(i.intern("dog"), Symbol::new(0)); - // different string gets a different #: - assert_eq!(i.intern("cat"), Symbol::new(1)); - assert_eq!(i.intern("cat"), Symbol::new(1)); - // dog is still at zero - assert_eq!(i.intern("dog"), Symbol::new(0)); +fn symbol_tests() { + let sym = |n| Some(Symbol(SymbolIndex::from_u32(n))); + + // Simple ASCII symbols. 
+ assert_eq!(Symbol::try_new_inlined(""), sym(0x00_00_00_00)); + assert_eq!(Symbol::try_new_inlined("a"), sym(0x00_00_00_61)); + assert_eq!(Symbol::try_new_inlined("ab"), sym(0x00_00_62_61)); + assert_eq!(Symbol::try_new_inlined("abc"), sym(0x00_63_62_61)); + assert_eq!(Symbol::try_new_inlined("abcd"), sym(0x64_63_62_61)); + assert_eq!(Symbol::try_new_inlined("abcde"), None); // too long + assert_eq!(Symbol::try_new_inlined("abcdefghijklmnopqrstuvwxyz"), None); // too long + + // Symbols involving non-ASCII chars. + // Note that the UTF-8 sequence for 'é' is `[0xc3, 0xa9]`. + assert_eq!(Symbol::try_new_inlined("é"), sym(0x00_00_a9_c3)); + assert_eq!(Symbol::try_new_inlined("dé"), sym(0x00_a9_c3_64)); + assert_eq!(Symbol::try_new_inlined("édc"), sym(0x63_64_a9_c3)); + assert_eq!(Symbol::try_new_inlined("cdé"), None); // byte 3 (0xa9) is > 0x7f + + // Symbols involving NUL chars. + assert_eq!(Symbol::try_new_inlined("\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("a\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0a"), sym(0x00_00_61_00)); + assert_eq!(Symbol::try_new_inlined("aa\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0\0a"), sym(0x00_61_00_00)); + assert_eq!(Symbol::try_new_inlined("aaa\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0\0\0a"), sym(0x61_00_00_00)); + + // Tabled symbols. + assert_eq!(Symbol::new_tabled(0).as_u32(), 0x80000000); + assert_eq!(Symbol::new_tabled(5).as_u32(), 0x80000005); + assert_eq!(Symbol::new_tabled(0x123456).as_u32(), 0x80123456); + + // Tabled symbol indices. 
+ assert_eq!(Symbol::new_tabled(0).as_tabled_index(), 0); + assert_eq!(Symbol::new_tabled(5).as_tabled_index(), 5); + assert_eq!(Symbol::new_tabled(0x123456).as_tabled_index(), 0x123456); } #[test] -fn without_first_quote_test() { +fn symbol_interner_tests() { SESSION_GLOBALS.set(&SessionGlobals::new(edition::DEFAULT_EDITION), || { + let inlined = |s, n, len| { + // Check the symbol and the deinterned string look right. + let sym = Symbol::intern(s); + assert_eq!(sym.as_u32(), n); + assert!(sym.with(|w| w == s)); + assert_eq!(sym.as_str(), s); + assert_eq!(sym.as_str().len(), len); + }; + + let tabled = |s, len| { + // Check the symbol and the deinterned string look right. + let sym = Symbol::intern(s); + assert!(sym.as_u32() & 0x80000000 != 0); + assert!(sym.with(|w| w == s)); + assert_eq!(sym.as_str(), s); + assert_eq!(sym.as_str().len(), len); + }; + + // Inlined symbols, lengths 0..=4. + // Note that the UTF-8 sequence for 'é' is `[0xc3, 0xa9]`. + inlined("", 0x00_00_00_00, 0); + inlined("a", 0x00_00_00_61, 1); + inlined("é", 0x00_00_a9_c3, 2); + inlined("dé", 0x00_a9_c3_64, 3); + inlined("édc", 0x63_64_a9_c3, 4); + + // Tabled symbols. + tabled("abcde", 5); // tabled due to length + tabled("cdé", 4); // tabled due to the fourth byte being > 0x7f + tabled("a\0", 2); // tabled due to the last byte being NUL + + // Test `without_first_quote()`. let i = Ident::from_str("'break"); assert_eq!(i.without_first_quote().name, kw::Break); }); } + +#[test] +fn interner_tests() { + let mut i: Interner = Interner::default(); + + // Note that going directly through `Interner` means that no inlined + // symbols are made. + + // First one is zero. + assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); + // Re-use gets the same entry. + assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); + // Different string gets a different index. + assert_eq!(i.intern("salamander"), Symbol::new_tabled(1)); + assert_eq!(i.intern("salamander"), Symbol::new_tabled(1)); + // Dog is still at zero. 
+ assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); +}