Skip to content

Commit

Permalink
feat(package): implement vcs status cache
Browse files Browse the repository at this point in the history
It may not be worthy for the majority.
Single file `git status` is generally fast.

On some slow file systems or lower-end machines,
skipping file system access might be helpful.

However, slow file system access usually happen on Window.
And we'll only have large amount of `git status` when
rust-lang#14981 merges and the repo contains lots of symlinks.
Given symlink handling story is crazy in real world Windows,
I doubt anybody will hit the performance issue without this.
  • Loading branch information
weihanglo committed Dec 26, 2024
1 parent 1b82184 commit 1d1ede9
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 21 deletions.
95 changes: 77 additions & 18 deletions src/cargo/ops/cargo_package/vcs.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! Helpers to gather the VCS information for `cargo package`.
use std::collections::HashMap;
use std::path::Path;
use std::path::PathBuf;

Expand Down Expand Up @@ -38,19 +39,71 @@ pub struct GitVcsInfo {
pub struct VcsInfoBuilder<'a, 'gctx> {
ws: &'a Workspace<'gctx>,
opts: &'a PackageOpts<'gctx>,
/// Map each git workdir path to the workdir's git status cache.
caches: HashMap<PathBuf, RepoStatusCache>,
}

impl<'a, 'gctx> VcsInfoBuilder<'a, 'gctx> {
pub fn new(
ws: &'a Workspace<'gctx>,
opts: &'a PackageOpts<'gctx>,
) -> VcsInfoBuilder<'a, 'gctx> {
VcsInfoBuilder { ws, opts }
VcsInfoBuilder {
ws,
opts,
caches: Default::default(),
}
}

/// Builds an [`VcsInfo`] for the given `pkg` and its associated `src_files`.
pub fn build(&mut self, pkg: &Package, src_files: &[PathBuf]) -> CargoResult<Option<VcsInfo>> {
check_repo_state(pkg, src_files, self.ws.gctx(), self.opts)
check_repo_state(pkg, src_files, self.ws.gctx(), self.opts, &mut self.caches)
}
}

/// Cache of git status inquries for a Git workdir.
struct RepoStatusCache {
repo: git2::Repository,
/// Status of each file path relative to the git workdir path.
cache: HashMap<PathBuf, git2::Status>,
}

impl RepoStatusCache {
fn new(repo: git2::Repository) -> RepoStatusCache {
RepoStatusCache {
repo,
cache: Default::default(),
}
}

/// Like [`git2::Repository::status_file`] but with cache.
fn status_file(&mut self, path: &Path) -> CargoResult<git2::Status> {
use std::collections::hash_map::Entry;
match self.cache.entry(path.into()) {
Entry::Occupied(entry) => {
tracing::trace!(
target: "cargo_package_vcs_cache",
"git status cache hit for `{}` at workdir `{}`",
path.display(),
self.repo.workdir().unwrap().display()
);
Ok(*entry.get())
}
Entry::Vacant(entry) => {
tracing::trace!(
target: "cargo_package_vcs_cache",
"git status cache miss for `{}` at workdir `{}`",
path.display(),
self.repo.workdir().unwrap().display()
);
let status = self.repo.status_file(path)?;
Ok(*entry.insert(status))
}
}
}

fn workdir(&self) -> &Path {
self.repo.workdir().unwrap()
}
}

Expand All @@ -67,6 +120,7 @@ pub fn check_repo_state(
src_files: &[PathBuf],
gctx: &GlobalContext,
opts: &PackageOpts<'_>,
caches: &mut HashMap<PathBuf, RepoStatusCache>,
) -> CargoResult<Option<VcsInfo>> {
let Ok(repo) = git2::Repository::discover(p.root()) else {
gctx.shell().verbose(|shell| {
Expand All @@ -86,14 +140,20 @@ pub fn check_repo_state(
};

debug!("found a git repo at `{}`", workdir.display());

let cache = caches
.entry(workdir.to_path_buf())
.or_insert_with(|| RepoStatusCache::new(repo));

let path = p.manifest_path();
let path = paths::strip_prefix_canonical(path, workdir).unwrap_or_else(|_| path.to_path_buf());
let Ok(status) = repo.status_file(&path) else {
let path =
paths::strip_prefix_canonical(path, cache.workdir()).unwrap_or_else(|_| path.to_path_buf());
let Ok(status) = cache.status_file(&path) else {
gctx.shell().verbose(|shell| {
shell.warn(format!(
"no (git) Cargo.toml found at `{}` in workdir `{}`",
path.display(),
workdir.display()
cache.workdir().display()
))
})?;
// No checked-in `Cargo.toml` found. This package may be irrelevant.
Expand All @@ -106,7 +166,7 @@ pub fn check_repo_state(
shell.warn(format!(
"found (git) Cargo.toml ignored at `{}` in workdir `{}`",
path.display(),
workdir.display()
cache.workdir().display()
))
})?;
// An ignored `Cargo.toml` found. This package may be irrelevant.
Expand All @@ -117,14 +177,14 @@ pub fn check_repo_state(
debug!(
"found (git) Cargo.toml at `{}` in workdir `{}`",
path.display(),
workdir.display(),
cache.workdir().display(),
);
let path_in_vcs = path
.parent()
.and_then(|p| p.to_str())
.unwrap_or("")
.replace("\\", "/");
let Some(git) = git(p, gctx, src_files, &repo, &opts)? else {
let Some(git) = git(p, gctx, src_files, cache, &opts)? else {
// If the git repo lacks essensial field like `sha1`, and since this field exists from the beginning,
// then don't generate the corresponding file in order to maintain consistency with past behavior.
return Ok(None);
Expand All @@ -138,7 +198,7 @@ fn git(
pkg: &Package,
gctx: &GlobalContext,
src_files: &[PathBuf],
repo: &git2::Repository,
cache: &mut RepoStatusCache,
opts: &PackageOpts<'_>,
) -> CargoResult<Option<GitVcsInfo>> {
// This is a collection of any dirty or untracked files. This covers:
Expand All @@ -147,10 +207,10 @@ fn git(
// - ignored (in case the user has an `include` directive that
// conflicts with .gitignore).
let mut dirty_files = Vec::new();
collect_statuses(repo, &mut dirty_files)?;
collect_statuses(&cache.repo, &mut dirty_files)?;
// Include each submodule so that the error message can provide
// specifically *which* files in a submodule are modified.
status_submodules(repo, &mut dirty_files)?;
status_submodules(&cache.repo, &mut dirty_files)?;

// Find the intersection of dirty in git, and the src_files that would
// be packaged. This is a lazy n^2 check, but seems fine with
Expand All @@ -159,7 +219,7 @@ fn git(
let mut dirty_src_files: Vec<_> = src_files
.iter()
.filter(|src_file| dirty_files.iter().any(|path| src_file.starts_with(path)))
.chain(dirty_metadata_paths(pkg, repo)?.iter())
.chain(dirty_metadata_paths(pkg, cache)?.iter())
.map(|path| {
pathdiff::diff_paths(path, cwd)
.as_ref()
Expand All @@ -172,10 +232,10 @@ fn git(
if !dirty || opts.allow_dirty {
// Must check whetherthe repo has no commit firstly, otherwise `revparse_single` would fail on bare commit repo.
// Due to lacking the `sha1` field, it's better not record the `GitVcsInfo` for consistency.
if repo.is_empty()? {
if cache.repo.is_empty()? {
return Ok(None);
}
let rev_obj = repo.revparse_single("HEAD")?;
let rev_obj = cache.repo.revparse_single("HEAD")?;
Ok(Some(GitVcsInfo {
sha1: rev_obj.id().to_string(),
dirty,
Expand All @@ -198,9 +258,8 @@ fn git(
/// This is required because those paths may link to a file outside the
/// current package root, but still under the git workdir, affecting the
/// final packaged `.crate` file.
fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult<Vec<PathBuf>> {
fn dirty_metadata_paths(pkg: &Package, repo: &mut RepoStatusCache) -> CargoResult<Vec<PathBuf>> {
let mut dirty_files = Vec::new();
let workdir = repo.workdir().unwrap();
let root = pkg.root();
let meta = pkg.manifest().metadata();
for path in [&meta.license_file, &meta.readme] {
Expand All @@ -212,12 +271,12 @@ fn dirty_metadata_paths(pkg: &Package, repo: &git2::Repository) -> CargoResult<V
// Inside package root. Don't bother checking git status.
continue;
}
if let Ok(rel_path) = paths::strip_prefix_canonical(abs_path.as_path(), workdir) {
if let Ok(rel_path) = paths::strip_prefix_canonical(abs_path.as_path(), repo.workdir()) {
// Outside package root but under git workdir,
if repo.status_file(&rel_path)? != git2::Status::CURRENT {
dirty_files.push(if abs_path.is_symlink() {
// For symlinks, shows paths to symlink sources
workdir.join(rel_path)
repo.workdir().join(rel_path)
} else {
abs_path
});
Expand Down
16 changes: 13 additions & 3 deletions tests/testsuite/package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1185,6 +1185,7 @@ fn vcs_status_check_for_each_workspace_member() {
"#,
)
.file("hobbit", "...")
.file("README.md", "")
.file(
"isengard/Cargo.toml",
r#"
Expand All @@ -1194,6 +1195,7 @@ fn vcs_status_check_for_each_workspace_member() {
homepage = "saruman"
description = "saruman"
license = "MIT"
readme = "../README.md"
"#,
)
.file("isengard/src/lib.rs", "")
Expand All @@ -1206,6 +1208,7 @@ fn vcs_status_check_for_each_workspace_member() {
homepage = "sauron"
description = "sauron"
license = "MIT"
readme = "../README.md"
"#,
)
.file("mordor/src/lib.rs", "")
Expand All @@ -1228,10 +1231,15 @@ fn vcs_status_check_for_each_workspace_member() {

// Ensure dirty files be reported only for one affected package.
p.cargo("package --workspace --no-verify")
.env("CARGO_LOG", "cargo_package_vcs_cache=trace")
.with_status(101)
.with_stderr_data(str![[r#"
[..] TRACE cargo_package_vcs_cache: git status cache miss for `isengard/Cargo.toml` at workdir `[ROOT]/foo/`
[..] TRACE cargo_package_vcs_cache: git status cache miss for `README.md` at workdir `[ROOT]/foo/`
[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard)
[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGED] 6 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[..] TRACE cargo_package_vcs_cache: git status cache miss for `mordor/Cargo.toml` at workdir `[ROOT]/foo/`
[..] TRACE cargo_package_vcs_cache: git status cache hit for `README.md` at workdir `[ROOT]/foo/`
[ERROR] 2 files in the working directory contain changes that were not yet committed into git:
mordor/src/lib.rs
Expand All @@ -1246,9 +1254,9 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
p.cargo("package --workspace --no-verify --allow-dirty")
.with_stderr_data(str![[r#"
[PACKAGING] isengard v0.0.0 ([ROOT]/foo/isengard)
[PACKAGED] 5 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGING] mordor v0.0.0 ([ROOT]/foo/mordor)
[PACKAGED] 6 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
[PACKAGING] mordor v0.0.0 ([ROOT]/foo/mordor)
[PACKAGED] 7 files, [FILE_SIZE]B ([FILE_SIZE]B compressed)
"#]])
.run();
Expand All @@ -1263,6 +1271,7 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
"Cargo.toml.orig",
"src/lib.rs",
"Cargo.lock",
"README.md",
],
[(
".cargo_vcs_info.json",
Expand Down Expand Up @@ -1290,6 +1299,7 @@ to proceed despite this and include the uncommitted changes, pass the `--allow-d
"src/lib.rs",
"src/main.rs",
"Cargo.lock",
"README.md",
],
[(
".cargo_vcs_info.json",
Expand Down

0 comments on commit 1d1ede9

Please sign in to comment.