Skip to content

Commit

Permalink
Auto merge of #123441 - saethlin:fixed-len-file-names, r=oli-obk
Browse files Browse the repository at this point in the history
Stabilize the size of incr comp object file names

The current implementation does not produce stable-length paths, and we create the paths in a way that makes our allocation behavior is nondeterministic. I think `@eddyb` fixed a number of other cases like this in the past, and this PR fixes another one. Whether that actually matters I have no idea, but we still have bimodal behavior in rustc-perf and the non-uniformity in `find` and `ls` was bothering me.

I've also removed the truncation of the mangled CGU names. Before this PR incr comp paths look like this:
```
target/debug/incremental/scratch-38izrrq90cex7/s-gux6gz0ow8-1ph76gg-ewe1xj434l26w9up5bedsojpd/261xgo1oqnd90ry5.o
```
And after, they look like this:
```
target/debug/incremental/scratch-035omutqbfkbw/s-gux6borni0-16r3v1j-6n64tmwqzchtgqzwwim5amuga/55v2re42sztc8je9bva6g8ft3.o
```

On the one hand, I'm sure this will break some people's builds because they're on Windows and only a few bytes from the path length limit. But if we're that seriously worried about the length of our file names, I have some other ideas on how to make them smaller. And last time I deleted some hash truncations from the compiler, there was a huge drop in the number if incremental compilation ICEs that were reported: #110367

---

Upon further reading, this PR actually fixes a bug. This comment says the CGU names are supposed to be a fixed-length hash, and before this PR they aren't: https://github.com/rust-lang/rust/blob/ca7d34efa94afe271accf2bd3d44152a5bd6fff1/compiler/rustc_monomorphize/src/partitioning.rs#L445-L448
  • Loading branch information
bors committed May 3, 2024
2 parents d6d3b34 + 6ee3713 commit 0d7b2fb
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 70 deletions.
5 changes: 3 additions & 2 deletions compiler/rustc_codegen_gcc/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ use gccjit::{
use rustc_codegen_ssa::base::wants_msvc_seh;
use rustc_codegen_ssa::errors as ssa_errors;
use rustc_codegen_ssa::traits::{BackendTypes, BaseTypeMethods, MiscMethods};
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
use rustc_data_structures::fx::{FxHashMap, FxHashSet};
use rustc_middle::mir::mono::CodegenUnit;
use rustc_middle::span_bug;
Expand Down Expand Up @@ -621,7 +622,7 @@ impl<'b, 'tcx> CodegenCx<'b, 'tcx> {
let mut name = String::with_capacity(prefix.len() + 6);
name.push_str(prefix);
name.push_str(".");
base_n::push_str(idx as u128, base_n::ALPHANUMERIC_ONLY, &mut name);
name.push_str(&(idx as u64).to_base(ALPHANUMERIC_ONLY));
name
}
}
Expand Down
5 changes: 3 additions & 2 deletions compiler/rustc_codegen_llvm/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ use crate::value::Value;
use rustc_codegen_ssa::base::{wants_msvc_seh, wants_wasm_eh};
use rustc_codegen_ssa::errors as ssa_errors;
use rustc_codegen_ssa::traits::*;
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::small_c_str::SmallCStr;
use rustc_hir::def_id::DefId;
Expand Down Expand Up @@ -1015,7 +1016,7 @@ impl CodegenCx<'_, '_> {
let mut name = String::with_capacity(prefix.len() + 6);
name.push_str(prefix);
name.push('.');
base_n::push_str(idx as u128, base_n::ALPHANUMERIC_ONLY, &mut name);
name.push_str(&(idx as u64).to_base(ALPHANUMERIC_ONLY));
name
}
}
Expand Down
114 changes: 90 additions & 24 deletions compiler/rustc_data_structures/src/base_n.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/// Converts unsigned integers into a string representation with some base.
/// Bases up to and including 36 can be used for case-insensitive things.
use std::str;
use std::ascii;
use std::fmt;

#[cfg(test)]
mod tests;
Expand All @@ -9,36 +10,101 @@ pub const MAX_BASE: usize = 64;
pub const ALPHANUMERIC_ONLY: usize = 62;
pub const CASE_INSENSITIVE: usize = 36;

const BASE_64: &[u8; MAX_BASE] =
b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@$";
const BASE_64: [ascii::Char; MAX_BASE] = {
let bytes = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@$";
let Some(ascii) = bytes.as_ascii() else { panic!() };
*ascii
};

#[inline]
pub fn push_str(mut n: u128, base: usize, output: &mut String) {
debug_assert!(base >= 2 && base <= MAX_BASE);
let mut s = [0u8; 128];
let mut index = s.len();
pub struct BaseNString {
start: usize,
buf: [ascii::Char; 128],
}

impl std::ops::Deref for BaseNString {
type Target = str;

let base = base as u128;
fn deref(&self) -> &str {
self.buf[self.start..].as_str()
}
}

impl AsRef<str> for BaseNString {
fn as_ref(&self) -> &str {
self.buf[self.start..].as_str()
}
}

impl fmt::Display for BaseNString {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(self)
}
}

// This trait just lets us reserve the exact right amount of space when doing fixed-length
// case-insensitve encoding. Add any impls you need.
pub trait ToBaseN: Into<u128> {
fn encoded_len(base: usize) -> usize;

fn to_base_fixed_len(self, base: usize) -> BaseNString {
let mut encoded = self.to_base(base);
encoded.start = encoded.buf.len() - Self::encoded_len(base);
encoded
}

loop {
index -= 1;
s[index] = BASE_64[(n % base) as usize];
n /= base;
fn to_base(self, base: usize) -> BaseNString {
let mut output = [ascii::Char::Digit0; 128];

if n == 0 {
break;
let mut n: u128 = self.into();

let mut index = output.len();
loop {
index -= 1;
output[index] = BASE_64[(n % base as u128) as usize];
n /= base as u128;

if n == 0 {
break;
}
}
assert_eq!(n, 0);

BaseNString { start: index, buf: output }
}
}

impl ToBaseN for u128 {
fn encoded_len(base: usize) -> usize {
let mut max = u128::MAX;
let mut len = 0;
while max > 0 {
len += 1;
max /= base as u128;
}
len
}
}

output.push_str(unsafe {
// SAFETY: `s` is populated using only valid utf8 characters from `BASE_64`
str::from_utf8_unchecked(&s[index..])
});
impl ToBaseN for u64 {
fn encoded_len(base: usize) -> usize {
let mut max = u64::MAX;
let mut len = 0;
while max > 0 {
len += 1;
max /= base as u64;
}
len
}
}

#[inline]
pub fn encode(n: u128, base: usize) -> String {
let mut s = String::new();
push_str(n, base, &mut s);
s
impl ToBaseN for u32 {
fn encoded_len(base: usize) -> usize {
let mut max = u32::MAX;
let mut len = 0;
while max > 0 {
len += 1;
max /= base as u32;
}
len
}
}
12 changes: 10 additions & 2 deletions compiler/rustc_data_structures/src/base_n/tests.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
use super::*;

#[test]
fn test_encode() {
fn limits() {
assert_eq!(Ok(u128::MAX), u128::from_str_radix(&u128::MAX.to_base(36), 36));
assert_eq!(Ok(u64::MAX), u64::from_str_radix(&u64::MAX.to_base(36), 36));
assert_eq!(Ok(u32::MAX), u32::from_str_radix(&u32::MAX.to_base(36), 36));
}

#[test]
fn test_to_base() {
fn test(n: u128, base: usize) {
assert_eq!(Ok(n), u128::from_str_radix(&encode(n, base), base as u32));
assert_eq!(Ok(n), u128::from_str_radix(&n.to_base(base), base as u32));
assert_eq!(Ok(n), u128::from_str_radix(&n.to_base_fixed_len(base), base as u32));
}

for base in 2..37 {
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_data_structures/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#![doc(rust_logo)]
#![feature(allocator_api)]
#![feature(array_windows)]
#![feature(ascii_char)]
#![feature(ascii_char_variants)]
#![feature(auto_traits)]
#![feature(cfg_match)]
#![feature(core_intrinsics)]
Expand Down
54 changes: 25 additions & 29 deletions compiler/rustc_incremental/src/persist/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,14 @@
//! implemented.
use crate::errors;
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::BaseNString;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::base_n::CASE_INSENSITIVE;
use rustc_data_structures::flock;
use rustc_data_structures::fx::{FxHashSet, FxIndexSet};
use rustc_data_structures::svh::Svh;
use rustc_data_structures::unord::{UnordMap, UnordSet};
use rustc_data_structures::{base_n, flock};
use rustc_errors::ErrorGuaranteed;
use rustc_fs_util::{link_or_copy, try_canonicalize, LinkOrCopy};
use rustc_middle::bug;
Expand Down Expand Up @@ -333,31 +337,24 @@ pub fn finalize_session_directory(sess: &Session, svh: Option<Svh>) {

debug!("finalize_session_directory() - session directory: {}", incr_comp_session_dir.display());

let old_sub_dir_name = incr_comp_session_dir
let mut sub_dir_name = incr_comp_session_dir
.file_name()
.unwrap()
.to_str()
.expect("malformed session dir name: contains non-Unicode characters");
.expect("malformed session dir name: contains non-Unicode characters")
.to_string();

// Keep the 's-{timestamp}-{random-number}' prefix, but replace the
// '-working' part with the SVH of the crate
let dash_indices: Vec<_> = old_sub_dir_name.match_indices('-').map(|(idx, _)| idx).collect();
if dash_indices.len() != 3 {
bug!(
"Encountered incremental compilation session directory with \
malformed name: {}",
incr_comp_session_dir.display()
)
}

// State: "s-{timestamp}-{random-number}-"
let mut new_sub_dir_name = String::from(&old_sub_dir_name[..=dash_indices[2]]);
// Keep the 's-{timestamp}-{random-number}' prefix, but replace "working" with the SVH of the crate
sub_dir_name.truncate(sub_dir_name.len() - "working".len());
// Double-check that we kept this: "s-{timestamp}-{random-number}-"
assert!(sub_dir_name.ends_with('-'), "{:?}", sub_dir_name);
assert!(sub_dir_name.as_bytes().iter().filter(|b| **b == b'-').count() == 3);

// Append the svh
base_n::push_str(svh.as_u128(), INT_ENCODE_BASE, &mut new_sub_dir_name);
// Append the SVH
sub_dir_name.push_str(&svh.as_u128().to_base_fixed_len(CASE_INSENSITIVE));

// Create the full path
let new_path = incr_comp_session_dir.parent().unwrap().join(new_sub_dir_name);
let new_path = incr_comp_session_dir.parent().unwrap().join(&*sub_dir_name);
debug!("finalize_session_directory() - new path: {}", new_path.display());

match rename_path_with_retry(&*incr_comp_session_dir, &new_path, 3) {
Expand Down Expand Up @@ -453,11 +450,11 @@ fn generate_session_dir_path(crate_dir: &Path) -> PathBuf {
let random_number = thread_rng().next_u32();
debug!("generate_session_dir_path: random_number = {}", random_number);

let directory_name = format!(
"s-{}-{}-working",
timestamp,
base_n::encode(random_number as u128, INT_ENCODE_BASE)
);
// Chop the first 3 characters off the timestamp. Those 3 bytes will be zero for a while.
let (zeroes, timestamp) = timestamp.split_at(3);
assert_eq!(zeroes, "000");
let directory_name =
format!("s-{}-{}-working", timestamp, random_number.to_base_fixed_len(CASE_INSENSITIVE));
debug!("generate_session_dir_path: directory_name = {}", directory_name);
let directory_path = crate_dir.join(directory_name);
debug!("generate_session_dir_path: directory_path = {}", directory_path.display());
Expand Down Expand Up @@ -588,10 +585,10 @@ fn extract_timestamp_from_session_dir(directory_name: &str) -> Result<SystemTime
string_to_timestamp(&directory_name[dash_indices[0] + 1..dash_indices[1]])
}

fn timestamp_to_string(timestamp: SystemTime) -> String {
fn timestamp_to_string(timestamp: SystemTime) -> BaseNString {
let duration = timestamp.duration_since(UNIX_EPOCH).unwrap();
let micros = duration.as_secs() * 1_000_000 + (duration.subsec_nanos() as u64) / 1000;
base_n::encode(micros as u128, INT_ENCODE_BASE)
micros.to_base_fixed_len(CASE_INSENSITIVE)
}

fn string_to_timestamp(s: &str) -> Result<SystemTime, &'static str> {
Expand Down Expand Up @@ -622,9 +619,8 @@ fn crate_path(sess: &Session) -> PathBuf {
sess.cfg_version,
);

let stable_crate_id = base_n::encode(stable_crate_id.as_u64() as u128, INT_ENCODE_BASE);

let crate_name = format!("{crate_name}-{stable_crate_id}");
let crate_name =
format!("{crate_name}-{}", stable_crate_id.as_u64().to_base_fixed_len(CASE_INSENSITIVE));
incr_dir.join(crate_name)
}

Expand Down
11 changes: 5 additions & 6 deletions compiler/rustc_middle/src/mir/mono.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
use crate::dep_graph::{DepNode, WorkProduct, WorkProductId};
use crate::ty::{GenericArgs, Instance, InstanceDef, SymbolName, TyCtxt};
use rustc_attr::InlineAttr;
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::BaseNString;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::base_n::CASE_INSENSITIVE;
use rustc_data_structures::fingerprint::Fingerprint;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::fx::FxIndexMap;
Expand Down Expand Up @@ -337,14 +339,11 @@ impl<'tcx> CodegenUnit<'tcx> {
self.is_code_coverage_dead_code_cgu = true;
}

pub fn mangle_name(human_readable_name: &str) -> String {
// We generate a 80 bit hash from the name. This should be enough to
// avoid collisions and is still reasonably short for filenames.
pub fn mangle_name(human_readable_name: &str) -> BaseNString {
let mut hasher = StableHasher::new();
human_readable_name.hash(&mut hasher);
let hash: Hash128 = hasher.finish();
let hash = hash.as_u128() & ((1u128 << 80) - 1);
base_n::encode(hash, base_n::CASE_INSENSITIVE)
hash.as_u128().to_base_fixed_len(CASE_INSENSITIVE)
}

pub fn compute_size_estimate(&mut self) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
//!
//! For more information about LLVM CFI and cross-language LLVM CFI support for the Rust compiler,
//! see design document in the tracking issue #89653.
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::base_n::ALPHANUMERIC_ONLY;
use rustc_data_structures::base_n::CASE_INSENSITIVE;
use rustc_data_structures::fx::FxHashMap;
use rustc_hir as hir;
use rustc_middle::bug;
Expand Down Expand Up @@ -736,7 +738,7 @@ fn encode_ty_name(tcx: TyCtxt<'_>, def_id: DefId) -> String {
/// <https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html>).
fn to_disambiguator(num: u64) -> String {
if let Some(num) = num.checked_sub(1) {
format!("s{}_", base_n::encode(num as u128, base_n::ALPHANUMERIC_ONLY))
format!("s{}_", num.to_base(ALPHANUMERIC_ONLY))
} else {
"s_".to_string()
}
Expand All @@ -746,7 +748,7 @@ fn to_disambiguator(num: u64) -> String {
/// <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangle.seq-id>).
fn to_seq_id(num: usize) -> String {
if let Some(num) = num.checked_sub(1) {
base_n::encode(num as u128, base_n::CASE_INSENSITIVE).to_uppercase()
(num as u64).to_base(CASE_INSENSITIVE).to_uppercase()
} else {
"".to_string()
}
Expand Down
4 changes: 2 additions & 2 deletions compiler/rustc_symbol_mangling/src/v0.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use rustc_data_structures::base_n;
use rustc_data_structures::base_n::ToBaseN;
use rustc_data_structures::fx::FxHashMap;
use rustc_data_structures::intern::Interned;
use rustc_hir as hir;
Expand Down Expand Up @@ -832,7 +832,7 @@ impl<'tcx> Printer<'tcx> for SymbolMangler<'tcx> {
/// e.g. `1` becomes `"0_"`, `62` becomes `"Z_"`, etc.
pub(crate) fn push_integer_62(x: u64, output: &mut String) {
if let Some(x) = x.checked_sub(1) {
base_n::push_str(x as u128, base_n::ALPHANUMERIC_ONLY, output);
output.push_str(&x.to_base(62));
}
output.push('_');
}
Expand Down

0 comments on commit 0d7b2fb

Please sign in to comment.