From 075f79fbec2944ae21dd29c1c609560aa4b12c4a Mon Sep 17 00:00:00 2001 From: Jacob Kiesel Date: Wed, 19 Jun 2024 17:13:00 -0600 Subject: [PATCH 01/27] initial version of checksum based freshness --- Cargo.lock | 11 + Cargo.toml | 1 + src/cargo/core/compiler/build_config.rs | 3 + src/cargo/core/compiler/build_runner/mod.rs | 5 +- .../core/compiler/fingerprint/dirty_reason.rs | 44 + src/cargo/core/compiler/fingerprint/mod.rs | 656 +++- src/cargo/core/compiler/mod.rs | 3 + src/cargo/core/features.rs | 2 + src/cargo/util/command_prelude.rs | 5 + src/cargo/util/context/mod.rs | 1 + tests/testsuite/cargo/z_help/stdout.term.svg | 62 +- tests/testsuite/freshness_checksum.rs | 2880 +++++++++++++++++ tests/testsuite/main.rs | 1 + 13 files changed, 3554 insertions(+), 120 deletions(-) create mode 100644 tests/testsuite/freshness_checksum.rs diff --git a/Cargo.lock b/Cargo.lock index aa88e03af98..78f4046d1dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -307,6 +307,7 @@ dependencies = [ "lazycell", "libc", "libgit2-sys", + "md-5", "memchr", "opener", "openssl", @@ -2300,6 +2301,16 @@ dependencies = [ "syn 2.0.72", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "mdman" version = "0.0.0" diff --git a/Cargo.toml b/Cargo.toml index c62dc28df49..1d682930dfb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -207,6 +207,7 @@ unicode-width.workspace = true url.workspace = true walkdir.workspace = true supports-unicode = "3.0.0" +md-5 = "0.10.6" [target.'cfg(target_has_atomic = "64")'.dependencies] tracing-chrome.workspace = true diff --git a/src/cargo/core/compiler/build_config.rs b/src/cargo/core/compiler/build_config.rs index 4c804f27b68..3ce8096fd35 100644 --- a/src/cargo/core/compiler/build_config.rs +++ b/src/cargo/core/compiler/build_config.rs @@ -48,6 +48,8 @@ pub struct BuildConfig { pub future_incompat_report: bool, /// Which kinds of build timings to output (empty if none). pub timing_outputs: Vec, + /// Use checksums rather than mtimes to determine if a crate is fresh. + pub checksum_freshness: bool, } fn default_parallelism() -> CargoResult { @@ -120,6 +122,7 @@ impl BuildConfig { export_dir: None, future_incompat_report: false, timing_outputs: Vec::new(), + checksum_freshness: false, }) } diff --git a/src/cargo/core/compiler/build_runner/mod.rs b/src/cargo/core/compiler/build_runner/mod.rs index ac16ae65479..32651e72a8c 100644 --- a/src/cargo/core/compiler/build_runner/mod.rs +++ b/src/cargo/core/compiler/build_runner/mod.rs @@ -16,7 +16,7 @@ use jobserver::Client; use super::build_plan::BuildPlan; use super::custom_build::{self, BuildDeps, BuildScriptOutputs, BuildScripts}; -use super::fingerprint::Fingerprint; +use super::fingerprint::{Checksum, Fingerprint}; use super::job_queue::JobQueue; use super::layout::Layout; use super::lto::Lto; @@ -50,6 +50,8 @@ pub struct BuildRunner<'a, 'gctx> { pub fingerprints: HashMap>, /// Cache of file mtimes to reduce filesystem hits. pub mtime_cache: HashMap, + /// Cache of file checksums to reduce filesystem reads. + pub checksum_cache: HashMap, /// A set used to track which units have been compiled. /// A unit may appear in the job graph multiple times as a dependency of /// multiple packages, but it only needs to run once. @@ -113,6 +115,7 @@ impl<'a, 'gctx> BuildRunner<'a, 'gctx> { build_script_outputs: Arc::new(Mutex::new(BuildScriptOutputs::default())), fingerprints: HashMap::new(), mtime_cache: HashMap::new(), + checksum_cache: HashMap::new(), compiled: HashSet::new(), build_scripts: HashMap::new(), build_explicit_deps: HashMap::new(), diff --git a/src/cargo/core/compiler/fingerprint/dirty_reason.rs b/src/cargo/core/compiler/fingerprint/dirty_reason.rs index cb6548a41a5..4804f7c2b13 100644 --- a/src/cargo/core/compiler/fingerprint/dirty_reason.rs +++ b/src/cargo/core/compiler/fingerprint/dirty_reason.rs @@ -222,6 +222,13 @@ impl DirtyReason { format_args!("the file `{}` is missing", file.display()), ) } + StaleItem::FailedToReadMetadata(file) => { + let file = file.strip_prefix(root).unwrap_or(&file); + s.dirty_because( + unit, + format_args!("couldn't read metadata for file `{}`", file.display()), + ) + } StaleItem::ChangedFile { stale, stale_mtime, @@ -235,6 +242,43 @@ impl DirtyReason { format_args!("the file `{}` has changed ({after})", file.display()), ) } + StaleItem::ChangedChecksum { + source, + stored_checksum, + new_checksum, + } => { + let file = source.strip_prefix(root).unwrap_or(&source); + s.dirty_because( + unit, + format_args!( + "the file `{}` has changed (checksum didn't match, {} != {})", + file.display(), + stored_checksum, + new_checksum, + ), + ) + } + StaleItem::FileSizeChanged { + path, + old_size, + new_size, + } => { + let file = path.strip_prefix(root).unwrap_or(&path); + s.dirty_because( + unit, + format_args!( + "file size changed ({old_size} != {new_size}) for `{}`", + file.display() + ), + ) + } + StaleItem::MissingChecksum(path) => { + let file = path.strip_prefix(root).unwrap_or(&path); + s.dirty_because( + unit, + format_args!("the checksum for file `{}` is missing", file.display()), + ) + } StaleItem::ChangedEnv { var, .. } => s.dirty_because( unit, format_args!("the environment variable {var} changed"), diff --git a/src/cargo/core/compiler/fingerprint/mod.rs b/src/cargo/core/compiler/fingerprint/mod.rs index e2f61bdfb1f..5a262931cff 100644 --- a/src/cargo/core/compiler/fingerprint/mod.rs +++ b/src/cargo/core/compiler/fingerprint/mod.rs @@ -33,6 +33,12 @@ //! details. If any input files are missing, or are newer than the //! dep-info, then the unit is dirty. //! +//! - Alternatively if you're using the unstable feature `checksum-freshness` +//! mtimes are ignored entirely in favor of comparing first the file size, and +//! then the checksum with a known prior value emitted by rustc. Only nightly +//! rustc will emit the needed metadata at the time of writing. This is dependent +//! on the unstable feature `-Z checksum-hash-algorithm`. +//! //! Note: Fingerprinting is not a perfect solution. Filesystem mtime tracking //! is notoriously imprecise and problematic. Only a small part of the //! environment is captured. This is a balance of performance, simplicity, and @@ -358,19 +364,24 @@ mod dirty_reason; use std::collections::hash_map::{Entry, HashMap}; use std::env; +use std::fmt::{self, Display}; +use std::fs::File; use std::hash::{self, Hash, Hasher}; -use std::io; +use std::io::{self, Read}; use std::path::{Path, PathBuf}; -use std::str; +use std::str::{self, from_utf8, FromStr}; use std::sync::{Arc, Mutex}; use std::time::SystemTime; use anyhow::{bail, format_err, Context as _}; -use cargo_util::{paths, ProcessBuilder}; +use cargo_util::{paths, ProcessBuilder, Sha256}; use filetime::FileTime; +use itertools::Either; +use md5::Md5; use serde::de; use serde::ser; use serde::{Deserialize, Serialize}; +use sha1::{Digest, Sha1}; use tracing::{debug, info}; use crate::core::compiler::unit_graph::UnitDep; @@ -727,6 +738,16 @@ enum LocalFingerprint { /// we need to recompile. CheckDepInfo { dep_info: PathBuf }, + /// This is used for crate compilations. The `dep_info` file is a relative + /// path anchored at `target_root(...)` to the dep-info file that Cargo + /// generates (which is a custom serialization after parsing rustc's own + /// `dep-info` output). + /// + /// The `dep_info` file, when present, also lists a number of other files + /// for us to look at. If any of those files have a different checksum then + /// we need to recompile. + CheckDepInfoChecksums { dep_info: PathBuf }, + /// This represents a nonempty set of `rerun-if-changed` annotations printed /// out by a build script. The `output` file is a relative file anchored at /// `target_root(...)` which is the actual output of the build script. That @@ -752,12 +773,24 @@ enum LocalFingerprint { #[derive(Clone, Debug)] pub enum StaleItem { MissingFile(PathBuf), + FailedToReadMetadata(PathBuf), + FileSizeChanged { + path: PathBuf, + old_size: u64, + new_size: u64, + }, ChangedFile { reference: PathBuf, reference_mtime: FileTime, stale: PathBuf, stale_mtime: FileTime, }, + ChangedChecksum { + source: PathBuf, + stored_checksum: Checksum, + new_checksum: Checksum, + }, + MissingChecksum(PathBuf), ChangedEnv { var: String, previous: Option, @@ -793,6 +826,7 @@ impl LocalFingerprint { fn find_stale_item( &self, mtime_cache: &mut HashMap, + checksum_cache: &mut HashMap, pkg_root: &Path, target_root: &Path, cargo_exe: &Path, @@ -807,43 +841,50 @@ impl LocalFingerprint { // rustc. LocalFingerprint::CheckDepInfo { dep_info } => { let dep_info = target_root.join(dep_info); - let Some(info) = parse_dep_info(pkg_root, target_root, &dep_info)? else { - return Ok(Some(StaleItem::MissingFile(dep_info))); + let info = match dep_info_shared(pkg_root, target_root, &dep_info, cargo_exe, gctx)? + { + Either::Left(stale) => { + return Ok(Some(stale)); + } + Either::Right(info) => info, }; - for (key, previous) in info.env.iter() { - let current = if key == CARGO_ENV { - Some( - cargo_exe - .to_str() - .ok_or_else(|| { - format_err!( - "cargo exe path {} must be valid UTF-8", - cargo_exe.display() - ) - })? - .to_string(), - ) - } else { - gctx.get_env(key).ok() - }; - if current == *previous { - continue; + Ok(find_stale_file( + mtime_cache, + checksum_cache, + &dep_info, + info.files.iter().map(|p| (p, None)), + false, + )) + } + + LocalFingerprint::CheckDepInfoChecksums { dep_info } => { + let dep_info = target_root.join(dep_info); + let info = match dep_info_shared(pkg_root, target_root, &dep_info, cargo_exe, gctx)? + { + Either::Left(stale) => { + return Ok(Some(stale)); } - return Ok(Some(StaleItem::ChangedEnv { - var: key.clone(), - previous: previous.clone(), - current, - })); - } - Ok(find_stale_file(mtime_cache, &dep_info, info.files.iter())) + Either::Right(info) => info, + }; + Ok(find_stale_file( + mtime_cache, + checksum_cache, + &dep_info, + info.files + .iter() + .map(|file| (file.clone(), info.checksum.get(file).cloned())), + true, + )) } // We need to verify that no paths listed in `paths` are newer than // the `output` path itself, or the last time the build script ran. LocalFingerprint::RerunIfChanged { output, paths } => Ok(find_stale_file( mtime_cache, + checksum_cache, &target_root.join(output), - paths.iter().map(|p| pkg_root.join(p)), + paths.iter().map(|p| (pkg_root.join(p), None)), + false, )), // These have no dependencies on the filesystem, and their values @@ -858,12 +899,48 @@ impl LocalFingerprint { match self { LocalFingerprint::Precalculated(..) => "precalculated", LocalFingerprint::CheckDepInfo { .. } => "dep-info", + LocalFingerprint::CheckDepInfoChecksums { .. } => "dep-info-checksums", LocalFingerprint::RerunIfChanged { .. } => "rerun-if-changed", LocalFingerprint::RerunIfEnvChanged { .. } => "rerun-if-env-changed", } } } +fn dep_info_shared( + pkg_root: &Path, + target_root: &Path, + dep_info: &PathBuf, + cargo_exe: &Path, + gctx: &GlobalContext, +) -> Result, anyhow::Error> { + let Some(info) = parse_dep_info(pkg_root, target_root, dep_info)? else { + return Ok(Either::Left(StaleItem::MissingFile(dep_info.clone()))); + }; + for (key, previous) in info.env.iter() { + let current = if key == CARGO_ENV { + Some( + cargo_exe + .to_str() + .ok_or_else(|| { + format_err!("cargo exe path {} must be valid UTF-8", cargo_exe.display()) + })? + .to_string(), + ) + } else { + gctx.get_env(key).ok() + }; + if current == *previous { + continue; + } + return Ok(Either::Left(StaleItem::ChangedEnv { + var: key.clone(), + previous: previous.clone(), + current, + })); + } + Ok(Either::Right(info)) +} + impl Fingerprint { fn new() -> Fingerprint { Fingerprint { @@ -975,6 +1052,17 @@ impl Fingerprint { }; } } + ( + LocalFingerprint::CheckDepInfoChecksums { dep_info: adep }, + LocalFingerprint::CheckDepInfoChecksums { dep_info: bdep }, + ) => { + if adep != bdep { + return DirtyReason::DepInfoOutputChanged { + old: bdep.clone(), + new: adep.clone(), + }; + } + } ( LocalFingerprint::RerunIfChanged { output: aout, @@ -1077,6 +1165,7 @@ impl Fingerprint { fn check_filesystem( &mut self, mtime_cache: &mut HashMap, + checksum_cache: &mut HashMap, pkg_root: &Path, target_root: &Path, cargo_exe: &Path, @@ -1181,9 +1270,14 @@ impl Fingerprint { // files for this package itself. If we do find something log a helpful // message and bail out so we stay stale. for local in self.local.get_mut().unwrap().iter() { - if let Some(item) = - local.find_stale_item(mtime_cache, pkg_root, target_root, cargo_exe, gctx)? - { + if let Some(item) = local.find_stale_item( + mtime_cache, + checksum_cache, + pkg_root, + target_root, + cargo_exe, + gctx, + )? { item.log(); self.fs_status = FsStatus::StaleItem(item); return Ok(()); @@ -1293,6 +1387,9 @@ impl StaleItem { StaleItem::MissingFile(path) => { info!("stale: missing {:?}", path); } + StaleItem::FailedToReadMetadata(path) => { + info!("stale: couldn't read metadata {:?}", path); + } StaleItem::ChangedFile { reference, reference_mtime, @@ -1303,6 +1400,27 @@ impl StaleItem { info!(" (vs) {:?}", reference); info!(" {:?} < {:?}", reference_mtime, stale_mtime); } + StaleItem::FileSizeChanged { + path, + new_size, + old_size, + } => { + info!("stale: changed {:?}", path); + info!("prior file size {old_size}"); + info!(" new file size {new_size}"); + } + StaleItem::ChangedChecksum { + source, + stored_checksum, + new_checksum, + } => { + info!("stale: changed {:?}", source); + info!("prior checksum {stored_checksum}"); + info!(" new checksum {new_checksum}"); + } + StaleItem::MissingChecksum(path) => { + info!("stale: no prior checksum {:?}", path); + } StaleItem::ChangedEnv { var, previous, @@ -1347,6 +1465,7 @@ fn calculate(build_runner: &mut BuildRunner<'_, '_>, unit: &Unit) -> CargoResult let cargo_exe = build_runner.bcx.gctx.cargo_exe()?; fingerprint.check_filesystem( &mut build_runner.mtime_cache, + &mut build_runner.checksum_cache, unit.pkg.root(), &target_root, cargo_exe, @@ -1399,7 +1518,11 @@ fn calculate_normal( } else { let dep_info = dep_info_loc(build_runner, unit); let dep_info = dep_info.strip_prefix(&target_root).unwrap().to_path_buf(); - vec![LocalFingerprint::CheckDepInfo { dep_info }] + if build_runner.bcx.gctx.cli_unstable().checksum_freshness { + vec![LocalFingerprint::CheckDepInfoChecksums { dep_info }] + } else { + vec![LocalFingerprint::CheckDepInfo { dep_info }] + } }; // Figure out what the outputs of our unit is, and we'll be storing them @@ -1843,16 +1966,32 @@ pub fn parse_dep_info( }; let mut ret = RustcDepInfo::default(); ret.env = info.env; - ret.files.extend(info.files.into_iter().map(|(ty, path)| { - match ty { - DepInfoPathType::PackageRootRelative => pkg_root.join(path), - // N.B. path might be absolute here in which case the join will have no effect - DepInfoPathType::TargetRootRelative => target_root.join(path), - } - })); + ret.files.extend( + info.files + .into_iter() + .map(|(ty, path)| make_absolute_path(ty, pkg_root, path, target_root)), + ); + for (ty, path, file_len, checksum) in info.checksum { + let path = make_absolute_path(ty, pkg_root, path, target_root); + ret.checksum + .insert(path, (file_len, Checksum::from_str(&checksum)?)); + } Ok(Some(ret)) } +fn make_absolute_path( + ty: DepInfoPathType, + pkg_root: &Path, + path: PathBuf, + target_root: &Path, +) -> PathBuf { + match ty { + DepInfoPathType::PackageRootRelative => pkg_root.join(path), + // N.B. path might be absolute here in which case the join will have no effect + DepInfoPathType::TargetRootRelative => target_root.join(path), + } +} + /// Calculates the fingerprint of a unit thats contains no dep-info files. fn pkg_fingerprint(bcx: &BuildContext<'_, '_>, pkg: &Package) -> CargoResult { let source_id = pkg.package_id().source_id(); @@ -1865,14 +2004,16 @@ fn pkg_fingerprint(bcx: &BuildContext<'_, '_>, pkg: &Package) -> CargoResult( +fn find_stale_file( mtime_cache: &mut HashMap, + checksum_cache: &mut HashMap, reference: &Path, paths: I, + use_checksums: bool, ) -> Option where - I: IntoIterator, - I::Item: AsRef, + I: IntoIterator)>, + P: AsRef, { let Ok(reference_mtime) = paths::mtime(reference) else { return Some(StaleItem::MissingFile(reference.to_path_buf())); @@ -1887,8 +2028,7 @@ where } else { None }; - - for path in paths { + for (path, prior_checksum) in paths { let path = path.as_ref(); // Assuming anything in cargo_home/{git, registry} is immutable @@ -1900,44 +2040,82 @@ where continue; } } - let path_mtime = match mtime_cache.entry(path.to_path_buf()) { - Entry::Occupied(o) => *o.get(), - Entry::Vacant(v) => { - let Ok(mtime) = paths::mtime_recursive(path) else { - return Some(StaleItem::MissingFile(path.to_path_buf())); - }; - *v.insert(mtime) + if use_checksums { + let Some((file_len, prior_checksum)) = prior_checksum else { + return Some(StaleItem::MissingChecksum(path.to_path_buf())); + }; + let path_buf = path.to_path_buf(); + + let path_checksum = match checksum_cache.entry(path_buf) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let Ok(file) = File::open(path) else { + return Some(StaleItem::MissingFile(path.to_path_buf())); + }; + let Ok(current_file_len) = file.metadata().map(|m| m.len()) else { + return Some(StaleItem::FailedToReadMetadata(path.to_path_buf())); + }; + if current_file_len != file_len { + return Some(StaleItem::FileSizeChanged { + path: path.to_path_buf(), + new_size: current_file_len, + old_size: file_len, + }); + } + let Ok(checksum) = Checksum::compute(prior_checksum.algo, file) else { + return Some(StaleItem::MissingFile(path.to_path_buf())); + }; + *v.insert(checksum) + } + }; + if path_checksum == prior_checksum { + continue; } - }; + return Some(StaleItem::ChangedChecksum { + source: path.to_path_buf(), + stored_checksum: prior_checksum, + new_checksum: path_checksum, + }); + } else { + let path_mtime = match mtime_cache.entry(path.to_path_buf()) { + Entry::Occupied(o) => *o.get(), + Entry::Vacant(v) => { + let Ok(mtime) = paths::mtime_recursive(path) else { + return Some(StaleItem::MissingFile(path.to_path_buf())); + }; + *v.insert(mtime) + } + }; - // TODO: fix #5918. - // Note that equal mtimes should be considered "stale". For filesystems with - // not much timestamp precision like 1s this is would be a conservative approximation - // to handle the case where a file is modified within the same second after - // a build starts. We want to make sure that incremental rebuilds pick that up! - // - // For filesystems with nanosecond precision it's been seen in the wild that - // its "nanosecond precision" isn't really nanosecond-accurate. It turns out that - // kernels may cache the current time so files created at different times actually - // list the same nanosecond precision. Some digging on #5919 picked up that the - // kernel caches the current time between timer ticks, which could mean that if - // a file is updated at most 10ms after a build starts then Cargo may not - // pick up the build changes. - // - // All in all, an equality check here would be a conservative assumption that, - // if equal, files were changed just after a previous build finished. - // Unfortunately this became problematic when (in #6484) cargo switch to more accurately - // measuring the start time of builds. - if path_mtime <= reference_mtime { - continue; - } + // TODO: fix #5918. + // Note that equal mtimes should be considered "stale". For filesystems with + // not much timestamp precision like 1s this is would be a conservative approximation + // to handle the case where a file is modified within the same second after + // a build starts. We want to make sure that incremental rebuilds pick that up! + // + // For filesystems with nanosecond precision it's been seen in the wild that + // its "nanosecond precision" isn't really nanosecond-accurate. It turns out that + // kernels may cache the current time so files created at different times actually + // list the same nanosecond precision. Some digging on #5919 picked up that the + // kernel caches the current time between timer ticks, which could mean that if + // a file is updated at most 10ms after a build starts then Cargo may not + // pick up the build changes. + // + // All in all, an equality check here would be a conservative assumption that, + // if equal, files were changed just after a previous build finished. + // Unfortunately this became problematic when (in #6484) cargo switch to more accurately + // measuring the start time of builds. + if path_mtime <= reference_mtime { + continue; + } - return Some(StaleItem::ChangedFile { - reference: reference.to_path_buf(), - reference_mtime, - stale: path.to_path_buf(), - stale_mtime: path_mtime, - }); + return Some(StaleItem::ChangedFile { + reference: reference.to_path_buf(), + reference_mtime, + stale: path.to_path_buf(), + stale_mtime: path_mtime, + }); + } } debug!( @@ -1949,7 +2127,8 @@ where /// Tells the associated path in [`EncodedDepInfo::files`] is relative to package root, /// target root, or absolute. -enum DepInfoPathType { +#[derive(Debug, Eq, PartialEq, Hash, Copy, Clone)] +pub enum DepInfoPathType { /// src/, e.g. src/lib.rs PackageRootRelative, /// target/debug/deps/lib... @@ -2028,7 +2207,7 @@ pub fn translate_dep_info( .env .retain(|(key, _)| !rustc_cmd.get_envs().contains_key(key) || key == CARGO_ENV); - for file in depinfo.files { + let serialize_path = |file| { // The path may be absolute or relative, canonical or not. Make sure // it is canonicalized so we are comparing the same kinds of paths. let abs_file = rustc_cwd.join(file); @@ -2041,7 +2220,7 @@ pub fn translate_dep_info( (DepInfoPathType::TargetRootRelative, stripped) } else if let Ok(stripped) = canon_file.strip_prefix(&pkg_root) { if !allow_package { - continue; + return None; } (DepInfoPathType::PackageRootRelative, stripped) } else { @@ -2050,7 +2229,25 @@ pub fn translate_dep_info( // effect. (DepInfoPathType::TargetRootRelative, &*abs_file) }; - on_disk_info.files.push((ty, path.to_owned())); + Some((ty, path.to_owned())) + }; + + for file in depinfo.files { + let Some(serializable_path) = serialize_path(file) else { + continue; + }; + on_disk_info.files.push(serializable_path); + } + for (file, (file_len, checksum)) in depinfo.checksum { + let Some(serializable_path) = serialize_path(file) else { + continue; + }; + on_disk_info.checksum.push(( + serializable_path.0, + serializable_path.1, + file_len, + checksum.to_string(), + )); } paths::write(cargo_dep_info, on_disk_info.serialize()?)?; Ok(()) @@ -2069,6 +2266,10 @@ pub struct RustcDepInfo { /// means that the env var wasn't actually set and the compilation depends /// on it not being set. pub env: Vec<(String, Option)>, + + /// If provided by rustc, a mapping that ties a file to the checksum and file size + /// at the time rustc ingested it. + pub checksum: HashMap, } /// Same as [`RustcDepInfo`] except avoids absolute paths as much as possible to @@ -2080,13 +2281,14 @@ pub struct RustcDepInfo { struct EncodedDepInfo { files: Vec<(DepInfoPathType, PathBuf)>, env: Vec<(String, Option)>, + checksum: Vec<(DepInfoPathType, PathBuf, u64, String)>, } impl EncodedDepInfo { fn parse(mut bytes: &[u8]) -> Option { let bytes = &mut bytes; let nfiles = read_usize(bytes)?; - let mut files = Vec::with_capacity(nfiles as usize); + let mut files = Vec::with_capacity(nfiles); for _ in 0..nfiles { let ty = match read_u8(bytes)? { 0 => DepInfoPathType::PackageRootRelative, @@ -2098,7 +2300,7 @@ impl EncodedDepInfo { } let nenv = read_usize(bytes)?; - let mut env = Vec::with_capacity(nenv as usize); + let mut env = Vec::with_capacity(nenv); for _ in 0..nenv { let key = str::from_utf8(read_bytes(bytes)?).ok()?.to_string(); let val = match read_u8(bytes)? { @@ -2108,7 +2310,29 @@ impl EncodedDepInfo { }; env.push((key, val)); } - return Some(EncodedDepInfo { files, env }); + let nchecksum = read_usize(bytes)?; + let mut checksum = Vec::with_capacity(nchecksum); + for _ in 0..nchecksum { + let ty = match read_u8(bytes)? { + 0 => DepInfoPathType::PackageRootRelative, + 1 => DepInfoPathType::TargetRootRelative, + _ => return None, + }; + let path_bytes = read_bytes(bytes)?; + let file_len = read_u64(bytes)?; + let checksum_bytes = read_bytes(bytes)?; + checksum.push(( + ty, + paths::bytes2path(path_bytes).ok()?, + file_len, + from_utf8(checksum_bytes).ok()?.to_string(), + )); + } + return Some(EncodedDepInfo { + files, + env, + checksum, + }); fn read_usize(bytes: &mut &[u8]) -> Option { let ret = bytes.get(..4)?; @@ -2116,6 +2340,12 @@ impl EncodedDepInfo { Some(u32::from_le_bytes(ret.try_into().unwrap()) as usize) } + fn read_u64(bytes: &mut &[u8]) -> Option { + let ret = bytes.get(..8)?; + *bytes = &bytes[8..]; + Some(u64::from_le_bytes(ret.try_into().unwrap())) + } + fn read_u8(bytes: &mut &[u8]) -> Option { let ret = *bytes.get(0)?; *bytes = &bytes[1..]; @@ -2153,6 +2383,17 @@ impl EncodedDepInfo { } } } + + write_usize(dst, self.checksum.len()); + for (ty, file, file_len, checksum) in self.checksum.iter() { + match ty { + DepInfoPathType::PackageRootRelative => dst.push(0), + DepInfoPathType::TargetRootRelative => dst.push(1), + } + write_bytes(dst, paths::path2bytes(file)?); + write_u64(dst, *file_len); + write_bytes(dst, checksum); + } return Ok(ret); fn write_bytes(dst: &mut Vec, val: impl AsRef<[u8]>) { @@ -2164,6 +2405,10 @@ impl EncodedDepInfo { fn write_usize(dst: &mut Vec, val: usize) { dst.extend(&u32::to_le_bytes(val as u32)); } + + fn write_u64(dst: &mut Vec, val: u64) { + dst.extend(&u64::to_le_bytes(val)); + } } } @@ -2184,6 +2429,22 @@ pub fn parse_rustc_dep_info(rustc_dep_info: &Path) -> CargoResult None => None, }; ret.env.push((unescape_env(env_var)?, env_val)); + } else if let Some(rest) = line.strip_prefix("# checksum:") { + let mut parts = rest.splitn(3, ' '); + let Some(checksum) = parts.next().map(Checksum::from_str).transpose()? else { + continue; + }; + let Some(Ok(file_len)) = parts + .next() + .and_then(|s| s.strip_prefix("file_len:").map(|s| s.parse::())) + else { + continue; + }; + let Some(path) = parts.next().map(PathBuf::from) else { + continue; + }; + + ret.checksum.insert(path, (file_len, checksum)); } else if let Some(pos) = line.find(": ") { if found_deps { continue; @@ -2228,3 +2489,220 @@ pub fn parse_rustc_dep_info(rustc_dep_info: &Path) -> CargoResult Ok(ret) } } + +/// Some algorithms are here to ensure compatibility with possible rustc outputs. +/// The presence of an algorithm here is not a suggestion that it's fit for use. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ChecksumAlgo { + Sha256, + Sha1, + Md5, +} + +impl ChecksumAlgo { + fn hash_len(&self) -> usize { + match self { + ChecksumAlgo::Sha256 => 32, + ChecksumAlgo::Sha1 => 20, + ChecksumAlgo::Md5 => 16, + } + } +} + +impl FromStr for ChecksumAlgo { + type Err = InvalidChecksumAlgo; + + fn from_str(s: &str) -> Result { + match s { + "sha256" => Ok(Self::Sha256), + "sha1" => Ok(Self::Sha1), + "md5" => Ok(Self::Md5), + _ => Err(InvalidChecksumAlgo {}), + } + } +} + +impl Display for ChecksumAlgo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(match self { + ChecksumAlgo::Sha256 => "sha256", + ChecksumAlgo::Sha1 => "sha1", + ChecksumAlgo::Md5 => "md5", + }) + } +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct InvalidChecksumAlgo {} + +impl Display for InvalidChecksumAlgo { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "expected `sha256`, `sha1`, or `md5`") + } +} + +impl std::error::Error for InvalidChecksumAlgo {} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct Checksum { + algo: ChecksumAlgo, + /// If the algorithm uses fewer than 32 bytes, then the remaining bytes will be zero. + value: [u8; 32], +} + +impl Checksum { + pub fn new(algo: ChecksumAlgo, value: [u8; 32]) -> Self { + Self { algo, value } + } + + pub fn compute(algo: ChecksumAlgo, contents: impl Read) -> Result { + // Buffer size is the same as default for std::io::BufReader. + // This was completely arbitrary and can probably be improved upon. + // + // Mostly I just don't want to read the entire file into memory at once if it's massive. + let mut buf = vec![0; 8 * 1024]; + let mut ret = Self { + algo, + value: [0; 32], + }; + let len = algo.hash_len(); + let value = &mut ret.value[..len]; + + fn digest( + mut hasher: T, + mut update: impl FnMut(&mut T, &[u8]), + finish: impl FnOnce(T, &mut [u8]), + mut contents: impl Read, + buf: &mut [u8], + value: &mut [u8], + ) -> Result<(), io::Error> { + loop { + let bytes_read = contents.read(buf)?; + if bytes_read == 0 { + break; + } + update(&mut hasher, &buf[0..bytes_read]); + } + finish(hasher, value); + Ok(()) + } + + match algo { + ChecksumAlgo::Sha256 => { + digest( + Sha256::new(), + |h, b| { + h.update(b); + }, + |mut h, out| out.copy_from_slice(&h.finish()), + contents, + &mut buf, + value, + )?; + } + ChecksumAlgo::Sha1 => { + digest( + Sha1::new(), + |h, b| { + h.update(b); + }, + |h, out| out.copy_from_slice(&h.finalize()), + contents, + &mut buf, + value, + )?; + } + ChecksumAlgo::Md5 => { + digest( + Md5::new(), + |h, b| { + h.update(b); + }, + |h, out| out.copy_from_slice(&h.finalize()), + contents, + &mut buf, + value, + )?; + } + } + Ok(ret) + } + + pub fn algo(&self) -> ChecksumAlgo { + self.algo + } + + pub fn value(&self) -> &[u8; 32] { + &self.value + } +} + +impl FromStr for Checksum { + type Err = InvalidChecksum; + + fn from_str(s: &str) -> Result { + let mut parts = s.split('='); + let Some(algo) = parts.next().map(ChecksumAlgo::from_str).transpose()? else { + return Err(InvalidChecksum::InvalidFormat); + }; + let Some(checksum) = parts.next() else { + return Err(InvalidChecksum::InvalidFormat); + }; + let mut value = [0; 32]; + if hex::decode_to_slice(checksum, &mut value[0..algo.hash_len()]).is_err() { + return Err(InvalidChecksum::InvalidChecksum(algo)); + } + Ok(Self { algo, value }) + } +} + +impl Display for Checksum { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut checksum = [0; 64]; + let hash_len = self.algo.hash_len(); + hex::encode_to_slice(&self.value[0..hash_len], &mut checksum[0..(hash_len * 2)]) + .map_err(|_| fmt::Error)?; + write!( + f, + "{}={}", + self.algo, + from_utf8(&checksum[0..(hash_len * 2)]).unwrap_or_default() + ) + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum InvalidChecksum { + InvalidChecksumAlgo(InvalidChecksumAlgo), + InvalidChecksum(ChecksumAlgo), + InvalidFormat, +} + +impl Display for InvalidChecksum { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + InvalidChecksum::InvalidChecksumAlgo(e) => { + write!(f, "algorithm portion incorrect, {e}") + } + InvalidChecksum::InvalidChecksum(algo) => { + let expected_len = algo.hash_len() * 2; + write!( + f, + "expected {expected_len} hexadecimal digits in checksum portion" + ) + } + InvalidChecksum::InvalidFormat => write!( + f, + "expected a string with format \"algorithm=hex_checksum\"" + ), + } + } +} + +impl std::error::Error for InvalidChecksum {} + +impl From for InvalidChecksum { + fn from(value: InvalidChecksumAlgo) -> Self { + InvalidChecksum::InvalidChecksumAlgo(value) + } +} diff --git a/src/cargo/core/compiler/mod.rs b/src/cargo/core/compiler/mod.rs index 45df3004589..2aa157bc93f 100644 --- a/src/cargo/core/compiler/mod.rs +++ b/src/cargo/core/compiler/mod.rs @@ -704,6 +704,9 @@ fn prepare_rustc(build_runner: &BuildRunner<'_, '_>, unit: &Unit) -> CargoResult if build_runner.bcx.gctx.cli_unstable().binary_dep_depinfo { base.arg("-Z").arg("binary-dep-depinfo"); } + if build_runner.bcx.gctx.cli_unstable().checksum_freshness { + base.arg("-Z").arg("checksum-hash-algorithm=sha256"); + } if is_primary { base.env("CARGO_PRIMARY_PACKAGE", "1"); diff --git a/src/cargo/core/features.rs b/src/cargo/core/features.rs index 3dc2a1d8e9c..2638ff95234 100644 --- a/src/cargo/core/features.rs +++ b/src/cargo/core/features.rs @@ -760,6 +760,7 @@ unstable_cli_options!( build_std: Option> = ("Enable Cargo to compile the standard library itself as part of a crate graph compilation"), build_std_features: Option> = ("Configure features enabled for the standard library itself when building the standard library"), cargo_lints: bool = ("Enable the `[lints.cargo]` table"), + checksum_freshness: bool = ("Use a checksum to determine if output is fresh rather than filesystem mtime"), codegen_backend: bool = ("Enable the `codegen-backend` option in profiles in .cargo/config.toml file"), config_include: bool = ("Enable the `include` key in config files"), direct_minimal_versions: bool = ("Resolve minimal dependency versions instead of maximum (direct dependencies only)"), @@ -1289,6 +1290,7 @@ impl CliUnstable { "rustdoc-map" => self.rustdoc_map = parse_empty(k, v)?, "rustdoc-scrape-examples" => self.rustdoc_scrape_examples = parse_empty(k, v)?, "separate-nightlies" => self.separate_nightlies = parse_empty(k, v)?, + "checksum-freshness" => self.checksum_freshness = parse_empty(k, v)?, "skip-rustdoc-fingerprint" => self.skip_rustdoc_fingerprint = parse_empty(k, v)?, "script" => self.script = parse_empty(k, v)?, "target-applies-to-host" => self.target_applies_to_host = parse_empty(k, v)?, diff --git a/src/cargo/util/command_prelude.rs b/src/cargo/util/command_prelude.rs index 247143319f9..8c01585d91e 100644 --- a/src/cargo/util/command_prelude.rs +++ b/src/cargo/util/command_prelude.rs @@ -759,6 +759,7 @@ Run `{cmd}` to see possible targets." build_config.build_plan = self.flag("build-plan"); build_config.unit_graph = self.flag("unit-graph"); build_config.future_incompat_report = self.flag("future-incompat-report"); + build_config.checksum_freshness = self.flag("checksum-freshness"); if self._contains("timings") { for timing_output in self._values_of("timings") { @@ -793,6 +794,10 @@ Run `{cmd}` to see possible targets." gctx.cli_unstable() .fail_if_stable_opt("--unit-graph", 8002)?; } + if build_config.checksum_freshness { + gctx.cli_unstable() + .fail_if_stable_opt("--checksum-freshness", 14136)?; + } let opts = CompileOptions { build_config, diff --git a/src/cargo/util/context/mod.rs b/src/cargo/util/context/mod.rs index c38fd6fd55a..4e8687b3e79 100644 --- a/src/cargo/util/context/mod.rs +++ b/src/cargo/util/context/mod.rs @@ -91,6 +91,7 @@ use serde::Deserialize; use serde_untagged::UntaggedEnumVisitor; use time::OffsetDateTime; use toml_edit::Item; +use tracing::warn; use url::Url; mod de; diff --git a/tests/testsuite/cargo/z_help/stdout.term.svg b/tests/testsuite/cargo/z_help/stdout.term.svg index e5386620e46..a429e92d58b 100644 --- a/tests/testsuite/cargo/z_help/stdout.term.svg +++ b/tests/testsuite/cargo/z_help/stdout.term.svg @@ -1,4 +1,4 @@ - +