Auto merge of rust-lang#123441 - saethlin:fixed-len-file-names, r=oli-obk

bors · bors · commit 0d7b2fb797f2 · 2024-05-03T17:41:48.000Z
Stabilize the size of incr comp object file names The current implementation does not produce stable-length paths, and we create the paths in a way that makes our allocation behavior is nondeterministic. I think `@eddyb` fixed a number of other cases like this in the past, and this PR fixes another one. Whether that actually matters I have no idea, but we still have bimodal behavior in rustc-perf and the non-uniformity in `find` and `ls` was bothering me. I've also removed the truncation of the mangled CGU names. Before this PR incr comp paths look like this: ``` target/debug/incremental/scratch-38izrrq90cex7/s-gux6gz0ow8-1ph76gg-ewe1xj434l26w9up5bedsojpd/261xgo1oqnd90ry5.o ``` And after, they look like this: ``` target/debug/incremental/scratch-035omutqbfkbw/s-gux6borni0-16r3v1j-6n64tmwqzchtgqzwwim5amuga/55v2re42sztc8je9bva6g8ft3.o ``` On the one hand, I'm sure this will break some people's builds because they're on Windows and only a few bytes from the path length limit. But if we're that seriously worried about the length of our file names, I have some other ideas on how to make them smaller. And last time I deleted some hash truncations from the compiler, there was a huge drop in the number if incremental compilation ICEs that were reported: rust-lang#110367 --- Upon further reading, this PR actually fixes a bug. This comment says the CGU names are supposed to be a fixed-length hash, and before this PR they aren't: https://github.com/rust-lang/rust/blob/ca7d34efa94afe271accf2bd3d44152a5bd6fff1/compiler/rustc_monomorphize/src/partitioning.rs#L445-L448
diff --git a/compiler/rustc_codegen_gcc/src/context.rs b/compiler/rustc_codegen_gcc/src/context.rs
@@ -6,7 +6,8 @@ use gccjit::{
 use rustc_codegen_ssa::base::wants_msvc_seh;
 use rustc_codegen_ssa::errors as ssa_errors;
 use rustc_codegen_ssa::traits::{BackendTypes, BaseTypeMethods, MiscMethods};
-use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::ToBaseN;
+use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
 use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_middle::mir::mono::CodegenUnit;
 use rustc_middle::span_bug;
@@ -621,7 +622,7 @@ impl<'b, 'tcx> CodegenCx<'b, 'tcx> {
         let mut name = String::with_capacity(prefix.len() + 6);
         name.push_str(prefix);
         name.push_str(".");
-        base_n::push_str(idx as u128, base_n::ALPHANUMERIC_ONLY, &mut name);
+        name.push_str(&(idx as u64).to_base(ALPHANUMERIC_ONLY));
         name
     }
 }
diff --git a/compiler/rustc_codegen_llvm/src/context.rs b/compiler/rustc_codegen_llvm/src/context.rs
@@ -11,7 +11,8 @@ use crate::value::Value;
 use rustc_codegen_ssa::base::{wants_msvc_seh, wants_wasm_eh};
 use rustc_codegen_ssa::errors as ssa_errors;
 use rustc_codegen_ssa::traits::*;
-use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::ToBaseN;
+use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::small_c_str::SmallCStr;
 use rustc_hir::def_id::DefId;
@@ -1015,7 +1016,7 @@ impl CodegenCx<'_, '_> {
         let mut name = String::with_capacity(prefix.len() + 6);
         name.push_str(prefix);
         name.push('.');
-        base_n::push_str(idx as u128, base_n::ALPHANUMERIC_ONLY, &mut name);
+        name.push_str(&(idx as u64).to_base(ALPHANUMERIC_ONLY));
         name
     }
 }
diff --git a/compiler/rustc_data_structures/src/base_n.rs b/compiler/rustc_data_structures/src/base_n.rs
@@ -1,6 +1,7 @@
 /// Converts unsigned integers into a string representation with some base.
 /// Bases up to and including 36 can be used for case-insensitive things.
-use std::str;
+use std::ascii;
+use std::fmt;
 
 #[cfg(test)]
 mod tests;
@@ -9,36 +10,101 @@ pub const MAX_BASE: usize = 64;
 pub const ALPHANUMERIC_ONLY: usize = 62;
 pub const CASE_INSENSITIVE: usize = 36;
 
-const BASE_64: &[u8; MAX_BASE] =
-    b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@$";
+const BASE_64: [ascii::Char; MAX_BASE] = {
+    let bytes = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@$";
+    let Some(ascii) = bytes.as_ascii() else { panic!() };
+    *ascii
+};
 
-#[inline]
-pub fn push_str(mut n: u128, base: usize, output: &mut String) {
-    debug_assert!(base >= 2 && base <= MAX_BASE);
-    let mut s = [0u8; 128];
-    let mut index = s.len();
+pub struct BaseNString {
+    start: usize,
+    buf: [ascii::Char; 128],
+}
+
+impl std::ops::Deref for BaseNString {
+    type Target = str;
 
-    let base = base as u128;
+    fn deref(&self) -> &str {
+        self.buf[self.start..].as_str()
+    }
+}
+
+impl AsRef<str> for BaseNString {
+    fn as_ref(&self) -> &str {
+        self.buf[self.start..].as_str()
+    }
+}
+
+impl fmt::Display for BaseNString {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self)
+    }
+}
+
+// This trait just lets us reserve the exact right amount of space when doing fixed-length
+// case-insensitve encoding. Add any impls you need.
+pub trait ToBaseN: Into<u128> {
+    fn encoded_len(base: usize) -> usize;
+
+    fn to_base_fixed_len(self, base: usize) -> BaseNString {
+        let mut encoded = self.to_base(base);
+        encoded.start = encoded.buf.len() - Self::encoded_len(base);
+        encoded
+    }
 
-    loop {
-        index -= 1;
-        s[index] = BASE_64[(n % base) as usize];
-        n /= base;
+    fn to_base(self, base: usize) -> BaseNString {
+        let mut output = [ascii::Char::Digit0; 128];
 
-        if n == 0 {
-            break;
+        let mut n: u128 = self.into();
+
+        let mut index = output.len();
+        loop {
+            index -= 1;
+            output[index] = BASE_64[(n % base as u128) as usize];
+            n /= base as u128;
+
+            if n == 0 {
+                break;
+            }
+        }
+        assert_eq!(n, 0);
+
+        BaseNString { start: index, buf: output }
+    }
+}
+
+impl ToBaseN for u128 {
+    fn encoded_len(base: usize) -> usize {
+        let mut max = u128::MAX;
+        let mut len = 0;
+        while max > 0 {
+            len += 1;
+            max /= base as u128;
         }
+        len
     }
+}
 
-    output.push_str(unsafe {
-        // SAFETY: `s` is populated using only valid utf8 characters from `BASE_64`
-        str::from_utf8_unchecked(&s[index..])
-    });
+impl ToBaseN for u64 {
+    fn encoded_len(base: usize) -> usize {
+        let mut max = u64::MAX;
+        let mut len = 0;
+        while max > 0 {
+            len += 1;
+            max /= base as u64;
+        }
+        len
+    }
 }
 
-#[inline]
-pub fn encode(n: u128, base: usize) -> String {
-    let mut s = String::new();
-    push_str(n, base, &mut s);
-    s
+impl ToBaseN for u32 {
+    fn encoded_len(base: usize) -> usize {
+        let mut max = u32::MAX;
+        let mut len = 0;
+        while max > 0 {
+            len += 1;
+            max /= base as u32;
+        }
+        len
+    }
 }
diff --git a/compiler/rustc_data_structures/src/base_n/tests.rs b/compiler/rustc_data_structures/src/base_n/tests.rs
@@ -1,9 +1,17 @@
 use super::*;
 
 #[test]
-fn test_encode() {
+fn limits() {
+    assert_eq!(Ok(u128::MAX), u128::from_str_radix(&u128::MAX.to_base(36), 36));
+    assert_eq!(Ok(u64::MAX), u64::from_str_radix(&u64::MAX.to_base(36), 36));
+    assert_eq!(Ok(u32::MAX), u32::from_str_radix(&u32::MAX.to_base(36), 36));
+}
+
+#[test]
+fn test_to_base() {
     fn test(n: u128, base: usize) {
-        assert_eq!(Ok(n), u128::from_str_radix(&encode(n, base), base as u32));
+        assert_eq!(Ok(n), u128::from_str_radix(&n.to_base(base), base as u32));
+        assert_eq!(Ok(n), u128::from_str_radix(&n.to_base_fixed_len(base), base as u32));
     }
 
     for base in 2..37 {
diff --git a/compiler/rustc_data_structures/src/lib.rs b/compiler/rustc_data_structures/src/lib.rs
@@ -16,6 +16,8 @@
 #![doc(rust_logo)]
 #![feature(allocator_api)]
 #![feature(array_windows)]
+#![feature(ascii_char)]
+#![feature(ascii_char_variants)]
 #![feature(auto_traits)]
 #![feature(cfg_match)]
 #![feature(core_intrinsics)]
diff --git a/compiler/rustc_incremental/src/persist/fs.rs b/compiler/rustc_incremental/src/persist/fs.rs
@@ -104,10 +104,14 @@
 //! implemented.
 
 use crate::errors;
+use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::BaseNString;
+use rustc_data_structures::base_n::ToBaseN;
+use rustc_data_structures::base_n::CASE_INSENSITIVE;
+use rustc_data_structures::flock;
 use rustc_data_structures::fx::{FxHashSet, FxIndexSet};
 use rustc_data_structures::svh::Svh;
 use rustc_data_structures::unord::{UnordMap, UnordSet};
-use rustc_data_structures::{base_n, flock};
 use rustc_errors::ErrorGuaranteed;
 use rustc_fs_util::{link_or_copy, try_canonicalize, LinkOrCopy};
 use rustc_middle::bug;
@@ -333,31 +337,24 @@ pub fn finalize_session_directory(sess: &Session, svh: Option<Svh>) {
 
     debug!("finalize_session_directory() - session directory: {}", incr_comp_session_dir.display());
 
-    let old_sub_dir_name = incr_comp_session_dir
+    let mut sub_dir_name = incr_comp_session_dir
         .file_name()
         .unwrap()
         .to_str()
-        .expect("malformed session dir name: contains non-Unicode characters");
+        .expect("malformed session dir name: contains non-Unicode characters")
+        .to_string();
 
-    // Keep the 's-{timestamp}-{random-number}' prefix, but replace the
-    // '-working' part with the SVH of the crate
-    let dash_indices: Vec<_> = old_sub_dir_name.match_indices('-').map(|(idx, _)| idx).collect();
-    if dash_indices.len() != 3 {
-        bug!(
-            "Encountered incremental compilation session directory with \
-              malformed name: {}",
-            incr_comp_session_dir.display()
-        )
-    }
-
-    // State: "s-{timestamp}-{random-number}-"
-    let mut new_sub_dir_name = String::from(&old_sub_dir_name[..=dash_indices[2]]);
+    // Keep the 's-{timestamp}-{random-number}' prefix, but replace "working" with the SVH of the crate
+    sub_dir_name.truncate(sub_dir_name.len() - "working".len());
+    // Double-check that we kept this: "s-{timestamp}-{random-number}-"
+    assert!(sub_dir_name.ends_with('-'), "{:?}", sub_dir_name);
+    assert!(sub_dir_name.as_bytes().iter().filter(|b| **b == b'-').count() == 3);
 
-    // Append the svh
-    base_n::push_str(svh.as_u128(), INT_ENCODE_BASE, &mut new_sub_dir_name);
+    // Append the SVH
+    sub_dir_name.push_str(&svh.as_u128().to_base_fixed_len(CASE_INSENSITIVE));
 
     // Create the full path
-    let new_path = incr_comp_session_dir.parent().unwrap().join(new_sub_dir_name);
+    let new_path = incr_comp_session_dir.parent().unwrap().join(&*sub_dir_name);
     debug!("finalize_session_directory() - new path: {}", new_path.display());
 
     match rename_path_with_retry(&*incr_comp_session_dir, &new_path, 3) {
@@ -453,11 +450,11 @@ fn generate_session_dir_path(crate_dir: &Path) -> PathBuf {
     let random_number = thread_rng().next_u32();
     debug!("generate_session_dir_path: random_number = {}", random_number);
 
-    let directory_name = format!(
-        "s-{}-{}-working",
-        timestamp,
-        base_n::encode(random_number as u128, INT_ENCODE_BASE)
-    );
+    // Chop the first 3 characters off the timestamp. Those 3 bytes will be zero for a while.
+    let (zeroes, timestamp) = timestamp.split_at(3);
+    assert_eq!(zeroes, "000");
+    let directory_name =
+        format!("s-{}-{}-working", timestamp, random_number.to_base_fixed_len(CASE_INSENSITIVE));
     debug!("generate_session_dir_path: directory_name = {}", directory_name);
     let directory_path = crate_dir.join(directory_name);
     debug!("generate_session_dir_path: directory_path = {}", directory_path.display());
@@ -588,10 +585,10 @@ fn extract_timestamp_from_session_dir(directory_name: &str) -> Result<SystemTime
     string_to_timestamp(&directory_name[dash_indices[0] + 1..dash_indices[1]])
 }
 
-fn timestamp_to_string(timestamp: SystemTime) -> String {
+fn timestamp_to_string(timestamp: SystemTime) -> BaseNString {
     let duration = timestamp.duration_since(UNIX_EPOCH).unwrap();
     let micros = duration.as_secs() * 1_000_000 + (duration.subsec_nanos() as u64) / 1000;
-    base_n::encode(micros as u128, INT_ENCODE_BASE)
+    micros.to_base_fixed_len(CASE_INSENSITIVE)
 }
 
 fn string_to_timestamp(s: &str) -> Result<SystemTime, &'static str> {
@@ -622,9 +619,8 @@ fn crate_path(sess: &Session) -> PathBuf {
         sess.cfg_version,
     );
 
-    let stable_crate_id = base_n::encode(stable_crate_id.as_u64() as u128, INT_ENCODE_BASE);
-
-    let crate_name = format!("{crate_name}-{stable_crate_id}");
+    let crate_name =
+        format!("{crate_name}-{}", stable_crate_id.as_u64().to_base_fixed_len(CASE_INSENSITIVE));
     incr_dir.join(crate_name)
 }
 
diff --git a/compiler/rustc_middle/src/mir/mono.rs b/compiler/rustc_middle/src/mir/mono.rs
@@ -1,7 +1,9 @@
 use crate::dep_graph::{DepNode, WorkProduct, WorkProductId};
 use crate::ty::{GenericArgs, Instance, InstanceDef, SymbolName, TyCtxt};
 use rustc_attr::InlineAttr;
-use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::BaseNString;
+use rustc_data_structures::base_n::ToBaseN;
+use rustc_data_structures::base_n::CASE_INSENSITIVE;
 use rustc_data_structures::fingerprint::Fingerprint;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::fx::FxIndexMap;
@@ -337,14 +339,11 @@ impl<'tcx> CodegenUnit<'tcx> {
         self.is_code_coverage_dead_code_cgu = true;
     }
 
-    pub fn mangle_name(human_readable_name: &str) -> String {
-        // We generate a 80 bit hash from the name. This should be enough to
-        // avoid collisions and is still reasonably short for filenames.
+    pub fn mangle_name(human_readable_name: &str) -> BaseNString {
         let mut hasher = StableHasher::new();
         human_readable_name.hash(&mut hasher);
         let hash: Hash128 = hasher.finish();
-        let hash = hash.as_u128() & ((1u128 << 80) - 1);
-        base_n::encode(hash, base_n::CASE_INSENSITIVE)
+        hash.as_u128().to_base_fixed_len(CASE_INSENSITIVE)
     }
 
     pub fn compute_size_estimate(&mut self) {
diff --git a/compiler/rustc_sanitizers/src/cfi/typeid/itanium_cxx_abi/encode.rs b/compiler/rustc_sanitizers/src/cfi/typeid/itanium_cxx_abi/encode.rs
@@ -4,7 +4,9 @@
 //!
 //! For more information about LLVM CFI and cross-language LLVM CFI support for the Rust compiler,
 //! see design document in the tracking issue #89653.
-use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::ToBaseN;
+use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
+use rustc_data_structures::base_n::CASE_INSENSITIVE;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_hir as hir;
 use rustc_middle::bug;
@@ -736,7 +738,7 @@ fn encode_ty_name(tcx: TyCtxt<'_>, def_id: DefId) -> String {
 /// <https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html>).
 fn to_disambiguator(num: u64) -> String {
     if let Some(num) = num.checked_sub(1) {
-        format!("s{}_", base_n::encode(num as u128, base_n::ALPHANUMERIC_ONLY))
+        format!("s{}_", num.to_base(ALPHANUMERIC_ONLY))
     } else {
         "s_".to_string()
     }
@@ -746,7 +748,7 @@ fn to_disambiguator(num: u64) -> String {
 /// <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangle.seq-id>).
 fn to_seq_id(num: usize) -> String {
     if let Some(num) = num.checked_sub(1) {
-        base_n::encode(num as u128, base_n::CASE_INSENSITIVE).to_uppercase()
+        (num as u64).to_base(CASE_INSENSITIVE).to_uppercase()
     } else {
         "".to_string()
     }
diff --git a/compiler/rustc_symbol_mangling/src/v0.rs b/compiler/rustc_symbol_mangling/src/v0.rs
@@ -1,4 +1,4 @@
-use rustc_data_structures::base_n;
+use rustc_data_structures::base_n::ToBaseN;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::intern::Interned;
 use rustc_hir as hir;
@@ -832,7 +832,7 @@ impl<'tcx> Printer<'tcx> for SymbolMangler<'tcx> {
 ///   e.g. `1` becomes `"0_"`, `62` becomes `"Z_"`, etc.
 pub(crate) fn push_integer_62(x: u64, output: &mut String) {
     if let Some(x) = x.checked_sub(1) {
-        base_n::push_str(x as u128, base_n::ALPHANUMERIC_ONLY, output);
+        output.push_str(&x.to_base(62));
     }
     output.push('_');
 }

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-use rustc_data_structures::base_n;`
	`1`	`+use rustc_data_structures::base_n::ToBaseN;`
`2`	`2`	`use rustc_data_structures::fx::FxHashMap;`
`3`	`3`	`use rustc_data_structures::intern::Interned;`
`4`	`4`	`use rustc_hir as hir;`
`@@ -832,7 +832,7 @@ impl<'tcx> Printer<'tcx> for SymbolMangler<'tcx> {`
`832`	`832`	/// e.g. `1` becomes `"0_"`, `62` becomes `"Z_"`, etc.
`833`	`833`	`pub(crate) fn push_integer_62(x: u64, output: &mut String) {`
`834`	`834`	`if let Some(x) = x.checked_sub(1) {`
`835`		`- base_n::push_str(x as u128, base_n::ALPHANUMERIC_ONLY, output);`
	`835`	`+ output.push_str(&x.to_base(62));`
`836`	`836`	`}`
`837`	`837`	`output.push('_');`
`838`	`838`	`}`