From 63fa80ec9cc2e0f58c4573553cf6e7f4f6d1af6b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 5 Oct 2023 15:43:30 +0200 Subject: [PATCH 1/7] update crate-status with planned features related to `status` --- crate-status.md | 335 ++++++++++++++++++++---------------------------- 1 file changed, 142 insertions(+), 193 deletions(-) diff --git a/crate-status.md b/crate-status.md index e9983498d5d..a6aa1f119eb 100644 --- a/crate-status.md +++ b/crate-status.md @@ -19,72 +19,153 @@ and itself relies on all `gix-*` crates. It's not meant for consumption, for app * [x] **estimate-hours** - estimate the time invested into a repository by evaluating commit dates. * Based on the [git-hours] algorithm. * See the [discussion][git-hours-discussion] for some performance data. -* **the `gix` program** _(plumbing)_ - lower level commands for use in automation - * **progress** - provide an overview of what works and what doesn't from the perspective of the git configuration. - This is likely to change a lot over time depending on actual needs, but maybe useful for you to see - if particular git-configuration is picked up and where it deviates. - * **config** - list the complete git configuration in human-readable form and optionally filter sections by name. - * **exclude** - * [x] **query** - check if path specs are excluded via gits exclusion rules like `.gitignore`. - * **verify** - validate a whole repository, for now only the object database. 
- * **commit** - * [x] **describe** - identify a commit by its closest tag in its past - * **tree** - * [x] **entries** - list tree entries for a single tree or recursively - * [x] **info** - display tree statistics - * **odb** - * [x] **info** - display odb statistics - * [x] **entries** - display all object ids in the object database - * **mailmap** - * [x] **entries** - display all entries of the aggregated mailmap git would use for substitution - * **revision** - * [x] **list** - list plain revision hashes from a starting point, similar to a very simple version of `git rev-list`. - * [x] **explain** - show what would be done while parsing a revision specification like `HEAD~1` - * [x] **resolve** - show which objects a revspec resolves to, similar to `git rev-parse` but faster and with much better error handling - * [x] **previous-branches** - list all previously checked out branches, powered by the ref-log. - * **remote** - * [x] **refs** - list all references available on the remote based on the current remote configuration. - * [x] **ref-map** - show how remote references relate to their local tracking branches as mapped by refspecs. - * [x] **fetch** - fetch the current remote or the given one, optionally just as dry-run. - * **clone** - * [x] initialize a new **bare** repository and fetch all objects. - * [x] initialize a new repository, fetch all objects and checkout the main worktree. - * **credential** - * [x] **fill/approve/reject** - The same as `git credential`, but implemented in Rust, calling helpers only when from trusted configuration. 
- * **free** - no git repository necessary - * **pack** - * [x] [verify](https://asciinema.org/a/352942) - * [x] [index verify](https://asciinema.org/a/352945) including each object sha1 and statistics - * [x] [explode](https://asciinema.org/a/352951), useful for transforming packs into loose objects for inspection or restoration - * [x] verify written objects (by reading them back from disk) - * [x] [receive](https://asciinema.org/a/359321) - receive a whole pack produced by **pack-send** or _git-upload-pack_, useful for `clone` like operations. - * [x] **create** - create a pack from given objects or tips of the commit graph. - * [ ] **send** - create a pack and send it using the pack protocol to stdout, similar to 'git-upload-pack', - for consumption by **pack-receive** or _git-receive-pack_ - - **multi-index** - * [x] **info** - print information about the file - * [x] **create** - create a multi-index from pack indices - * [x] **verify** - check the file for consistency - * [x] **entries** - list all entries of the file - - **index** - * [x] [create](https://asciinema.org/a/352941) - create an index file by streaming a pack file as done during clone - * [x] support for thin packs (as needed for fetch/pull) - * **commit-graph** - * [x] **verify** - assure that a commit-graph is consistent - * **mailmap** - * [x] **verify** - check entries of a mailmap file for parse errors and display them - * **index** - * [x] **entries** - show detailed entry information for human or machine consumption (via JSON) - * [x] **verify** - check the index for consistency - * [x] **info** - display general information about the index itself, with detailed extension information by default - * [x] detailed information about the TREE extension - * [ ] …other extensions details aren't implemented yet - * [x] **checkout-exclusive** - a predecessor of `git worktree`, providing flexible options to evaluate checkout performance from an index and/or an object database. 
+* **the `gix` program** _(plumbing)_ - lower level commands for use during development + - As its main purpose is to help running the latest improvements in the real world, it's self-documenting without + duplicating its features here. Use `gix --help` to start discovery. [skim]: https://github.com/lotabout/skim [git-hours]: https://github.com/kimmobrunfeldt/git-hours/blob/8aaeee237cb9d9028e7a2592a25ad8468b1f45e4/index.js#L114-L143 [git-hours-discussion]: https://github.com/Byron/gitoxide/discussions/78 +### gix + +The top-level crate that acts as hub to all functionality provided by the `gix-*` plumbing crates. + +* [x] utilities for applications to make long running operations interruptible gracefully and to support timeouts in servers. +* [x] handle `core.repositoryFormatVersion` and extensions +* [x] support for unicode-precomposition of command-line arguments (needs explicit use in parent application) +* [ ] strict object creation (validate objects referenced by newly created objects exist) +* [ ] strict hash verification (validate that objects actually have the hashes they claim to have) +* **Repository** + * [x] discovery + * [x] option to not cross file systems (default) + * [x] handle git-common-dir + * [x] support for `GIT_CEILING_DIRECTORIES` environment variable + * [ ] handle other non-discovery modes and provide control over environment variable usage required in applications + * [x] rev-parse + * [x] rev-walk + * [x] include tips + * [ ] exclude commits + * [x] instantiation + * [x] access to refs and objects + * **credentials** + * [x] run `git credential` directly + * [x] use credential helper configuration and to obtain credentials with `gix_credentials::helper::Cascade` + * **config** + * [ ] facilities to apply the [url-match](https://git-scm.com/docs/git-config#Documentation/git-config.txt-httplturlgt) algorithm and to + [normalize urls](https://github.com/git/git/blob/be1a02a17ede4082a86dfbfee0f54f345e8b43ac/urlmatch.c#L109:L109) before 
comparison. + * **traverse** + * [x] commit graphs + * [ ] make [git-notes](https://git-scm.com/docs/git-notes) accessible + * [x] tree entries + * **diffs/changes** + * [x] tree with other tree + * [ ] respect case-sensitivity of host filesystem. + * [x] a way to access various diff related settings or use them + * [ ] respect `diff.*.textconv`, `diff.*.cachetextconv` and external diff viewers with `diff.*.command`, + [along with support for reading `diff` gitattributes](https://github.com/git/git/blob/73876f4861cd3d187a4682290ab75c9dccadbc56/Documentation/gitattributes.txt#L699:L699). + * **rewrite tracking** + * **deviation** - git keeps up to four candidates whereas we use the first-found candidate that matches the similarity percentage. + This can lead to different sources being found. As such, we also don't consider the filename at all. + * [ ] handle binary files correctly, and apply filters for that matter + * [x] computation limit with observable reduction of precision when it is hit, for copies and renames separately + * **by identity** + * [x] renames (sym-links are only ever compared by identity) + * [x] copies + * **by similarity** - similarity factor controllable separately from renames + * [x] renames + * [x] copies + * [x] 'find-copies-harder' - find copies with the source being the entire tree. + * [ ] tree or index with working tree + - [ ] rename tracking + - [ ] submodule status (recursive) + * [x] diffs between modified blobs with various algorithms + * [ ] tree with index (via index-from-tree and index) + - [ ] rename tracking + - [ ] submodule status (recursive) + * [x] initialize + * [x] Proper configuration depending on platform (e.g. ignorecase, filemode, …) + * **Id** + * [x] short hashes with detection of ambiguity. 
+ * **Commit** + * [x] `git describe` like functionality, with optional commit-graph acceleration + * [x] create new commit from tree + * **Objects** + * [x] lookup + * [x] peel to object kind + * [ ] create [signed commits and tags](https://github.com/Byron/gitoxide/issues/12) + * **trees** + * [x] lookup path + * **references** + * [x] peel to end + * [x] ref-log access + * [x] remote name + * [x] find remote itself + - [ ] respect `branch..merge` in the returned remote. + * **remotes** + * [x] clone + * [x] shallow + * [ ] include-tags when shallow is used (needs separate fetch) + * [ ] prune non-existing shallow commits + * [ ] [bundles](https://git-scm.com/docs/git-bundle) + * [x] fetch + * [x] shallow (remains shallow, options to adjust shallow boundary) + * [ ] a way to auto-explode small packs to avoid them to pile up + * [x] 'ref-in-want' + * [ ] 'wanted-ref' + * [x] standard negotiation algorithms `consecutive`, `skipping` and `noop`. + * [ ] push + * [x] ls-refs + * [x] ls-refs with ref-spec filter + * [x] list, find by name + * [x] create in memory + * [ ] groups + * [ ] [remote and branch files](https://github.com/git/git/blob/master/remote.c#L300) + * [ ] execute hooks + * **refs** + * [ ] run transaction hooks and handle special repository states like quarantine + * [ ] support for different backends like `files` and `reftable` + * **main or linked worktree** + * [ ] add files with `.gitignore` handling + * [ ] checkout with conversions like clean + smudge as in `.gitattributes` + * [ ] _diff_ index with working tree + * [ ] sparse checkout support + * [x] read per-worktree config if `extensions.worktreeConfig` is enabled. 
+ * **index** + * [ ] tree from index + * [x] index from tree + * **worktrees** + * [x] open a repository with worktrees + * [x] read locked state + * [ ] obtain 'prunable' information + * [x] proper handling of worktree related refs + * [x] create a byte stream and create archives for such a stream, including worktree filters and conversions + * [ ] create, move, remove, and repair + * [x] access exclude information + * [x] access attribute information + * [x] respect `core.worktree` configuration + - **deviation** + * The delicate interplay between `GIT_COMMON_DIR` and `GIT_WORK_TREE` isn't implemented. + * **config** + * [x] read the primitive types `boolean`, `integer`, `string` + * [x] read and interpolate trusted paths + * [x] low-level API for more elaborate access to all details of `git-config` files + * [ ] a way to make changes to individual configuration files + * [x] mailmap + * [x] object replacements (`git replace`) + * [x] read git configuration + * [ ] merging + * [ ] stashing + * [ ] Use _Commit Graph_ to speed up certain queries + * [ ] subtree + * [ ] interactive rebase status/manipulation + * **submodules** + * [x] handle 'old' form for reading and detect old form + * [x] list + * [ ] edit +* [ ] API documentation + * [ ] Some examples + ### gix-actor * [x] read and write a signature that uniquely identifies an actor within a git repository * [x] a way to parse `name ` tuples (instead of full signatures) to facilitate parsing @@ -611,138 +692,6 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README. * [x] API documentation * [x] Some examples -### gix -* [x] utilities for applications to make long running operations interruptible gracefully and to support timeouts in servers. 
-* [x] handle `core.repositoryFormatVersion` and extensions -* [x] support for unicode-precomposition of command-line arguments (needs explicit use in parent application) -* [ ] strict object creation (validate objects referenced by newly created objects exist) -* [ ] strict hash verification (validate that objects actually have the hashes they claim to have) -* **Repository** - * [x] discovery - * [x] option to not cross file systems (default) - * [x] handle git-common-dir - * [x] support for `GIT_CEILING_DIRECTORIES` environment variable - * [ ] handle other non-discovery modes and provide control over environment variable usage required in applications - * [x] rev-parse - * [x] rev-walk - * [x] include tips - * [ ] exclude commits - * [x] instantiation - * [x] access to refs and objects - * **credentials** - * [x] run `git credential` directly - * [x] use credential helper configuration and to obtain credentials with `gix_credentials::helper::Cascade` - * **config** - * [ ] facilities to apply the [url-match](https://git-scm.com/docs/git-config#Documentation/git-config.txt-httplturlgt) algorithm and to - [normalize urls](https://github.com/git/git/blob/be1a02a17ede4082a86dfbfee0f54f345e8b43ac/urlmatch.c#L109:L109) before comparison. - * **traverse** - * [x] commit graphs - * [ ] make [git-notes](https://git-scm.com/docs/git-notes) accessible - * [x] tree entries - * **diffs/changes** - * [x] tree with other tree - * [ ] respect case-sensitivity of host filesystem. - * [x] a way to access various diff related settings or use them - * [ ] respect `diff.*.textconv`, `diff.*.cachetextconv` and external diff viewers with `diff.*.command`, - [along with support for reading `diff` gitattributes](https://github.com/git/git/blob/73876f4861cd3d187a4682290ab75c9dccadbc56/Documentation/gitattributes.txt#L699:L699). - * **rewrite tracking** - * **deviation** - git keeps up to four candidates whereas we use the first-found candidate that matches the similarity percentage. 
- This can lead to different sources being found. As such, we also don't consider the filename at all. - * [ ] handle binary files correctly, and apply filters for that matter - * [x] computation limit with observable reduction of precision when it is hit, for copies and renames separately - * **by identity** - * [x] renames (sym-links are only ever compared by identity) - * [x] copies - * **by similarity** - similarity factor controllable separately from renames - * [x] renames - * [x] copies - * [x] 'find-copies-harder' - find copies with the source being the entire tree. - * [ ] tree or index with working tree - * [x] diffs between modified blobs with various algorithms - * [ ] tree with index - * [x] initialize - * [x] Proper configuration depending on platform (e.g. ignorecase, filemode, …) - * **Id** - * [x] short hashes with detection of ambiguity. - * **Commit** - * [x] `git describe` like functionality, with optional commit-graph acceleration - * [x] create new commit from tree - * **Objects** - * [x] lookup - * [x] peel to object kind - * [ ] create [signed commits and tags](https://github.com/Byron/gitoxide/issues/12) - * **trees** - * [x] lookup path - * **references** - * [x] peel to end - * [x] ref-log access - * [x] remote name - * [x] find remote itself - - [ ] respect `branch..merge` in the returned remote. - * **remotes** - * [x] clone - * [x] shallow - * [ ] include-tags when shallow is used (needs separate fetch) - * [ ] prune non-existing shallow commits - * [ ] [bundles](https://git-scm.com/docs/git-bundle) - * [x] fetch - * [x] shallow (remains shallow, options to adjust shallow boundary) - * [ ] a way to auto-explode small packs to avoid them to pile up - * [x] 'ref-in-want' - * [ ] 'wanted-ref' - * [x] standard negotiation algorithms `consecutive`, `skipping` and `noop`. 
- * [ ] push - * [x] ls-refs - * [x] ls-refs with ref-spec filter - * [x] list, find by name - * [x] create in memory - * [ ] groups - * [ ] [remote and branch files](https://github.com/git/git/blob/master/remote.c#L300) - * [ ] execute hooks - * **refs** - * [ ] run transaction hooks and handle special repository states like quarantine - * [ ] support for different backends like `files` and `reftable` - * **main or linked worktree** - * [ ] add files with `.gitignore` handling - * [ ] checkout with conversions like clean + smudge as in `.gitattributes` - * [ ] _diff_ index with working tree - * [ ] sparse checkout support - * [x] read per-worktree config if `extensions.worktreeConfig` is enabled. - * **index** - * [ ] tree from index - * [x] index from tree - * **worktrees** - * [x] open a repository with worktrees - * [x] read locked state - * [ ] obtain 'prunable' information - * [x] proper handling of worktree related refs - * [x] create a byte stream and create archives for such a stream, including worktree filters and conversions - * [ ] create, move, remove, and repair - * [x] access exclude information - * [x] access attribute information - * [x] respect `core.worktree` configuration - - **deviation** - * The delicate interplay between `GIT_COMMON_DIR` and `GIT_WORK_TREE` isn't implemented. 
- * **config** - * [x] read the primitive types `boolean`, `integer`, `string` - * [x] read and interpolate trusted paths - * [x] low-level API for more elaborate access to all details of `git-config` files - * [ ] a way to make changes to individual configuration files - * [x] mailmap - * [x] object replacements (`git replace`) - * [x] read git configuration - * [ ] merging - * [ ] stashing - * [ ] Use _Commit Graph_ to speed up certain queries - * [ ] subtree - * [ ] interactive rebase status/manipulation - * **submodules** - * [x] handle 'old' form for reading and detect old form - * [x] list - * [ ] edit -* [ ] API documentation - * [ ] Some examples - ### gix-worktree-stream * [x] encode git-tree as stream of bytes (with large file support and actual streaming) From 3c8421f003bc3a5f2f51cee1b5cb6a526d5e0f38 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 24 Oct 2023 14:07:48 +0200 Subject: [PATCH 2/7] feat!: Add git-style metadata support. As opposed to the Rust standard library, this one will get the ctime from the file itself, instead of from the inode. That way, the index file written by `gix` will not continuously be expensively rewritten by `git`, and vice versa. 
--- Cargo.lock | 12 +-- gix-index/Cargo.toml | 4 + gix-index/src/entry/mode.rs | 10 +-- gix-index/src/entry/stat.rs | 23 +++-- gix-index/src/fs.rs | 166 ++++++++++++++++++++++++++++++++++++ gix-index/src/lib.rs | 2 + 6 files changed, 194 insertions(+), 23 deletions(-) create mode 100644 gix-index/src/fs.rs diff --git a/Cargo.lock b/Cargo.lock index 909ae597fd5..7d574a99c88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1795,7 +1795,9 @@ dependencies = [ "gix-object 0.38.0", "gix-traverse 0.34.0", "itoa", + "libc", "memmap2 0.7.1", + "rustix 0.38.20", "serde", "smallvec", "thiserror", @@ -2082,7 +2084,7 @@ dependencies = [ "gix-config-value", "gix-testtools", "parking_lot", - "rustix 0.38.19", + "rustix 0.38.20", "serial_test", "thiserror", ] @@ -2933,7 +2935,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.19", + "rustix 0.38.20", "windows-sys", ] @@ -3859,9 +3861,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.19" +version = "0.38.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745ecfa778e66b2b63c88a61cb36e0eea109e803b0b86bf9879fbc77c70e86ed" +checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" dependencies = [ "bitflags 2.4.0", "errno", @@ -4289,7 +4291,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.1", "redox_syscall", - "rustix 0.38.19", + "rustix 0.38.20", "windows-sys", ] diff --git a/gix-index/Cargo.toml b/gix-index/Cargo.toml index 9320daa51b0..cd845b326b7 100644 --- a/gix-index/Cargo.toml +++ b/gix-index/Cargo.toml @@ -41,6 +41,10 @@ bitflags = "2" document-features = { version = "0.2.0", optional = true } +[target.'cfg(not(windows))'.dependencies] +rustix = { version = "0.38.20", default-features = false, features = ["std", "fs"] } +libc = { version = "0.2.149" } + [package.metadata.docs.rs] features = ["document-features", "serde"] 
rustdoc-args = ["--cfg", "docsrs"] diff --git a/gix-index/src/entry/mode.rs b/gix-index/src/entry/mode.rs index 0301df43800..583c295bc7b 100644 --- a/gix-index/src/entry/mode.rs +++ b/gix-index/src/entry/mode.rs @@ -37,7 +37,7 @@ impl Mode { /// can not be committed to git). pub fn change_to_match_fs( self, - stat: &std::fs::Metadata, + stat: &crate::fs::Metadata, has_symlinks: bool, executable_bit: bool, ) -> Option { @@ -46,15 +46,13 @@ impl Mode { Mode::SYMLINK if has_symlinks && !stat.is_symlink() => (), Mode::SYMLINK if !has_symlinks && !stat.is_file() => (), Mode::COMMIT | Mode::DIR if !stat.is_dir() => (), - Mode::FILE if executable_bit && gix_fs::is_executable(stat) => return Some(Change::ExecutableBit), - Mode::FILE_EXECUTABLE if executable_bit && !gix_fs::is_executable(stat) => { - return Some(Change::ExecutableBit) - } + Mode::FILE if executable_bit && stat.is_executable() => return Some(Change::ExecutableBit), + Mode::FILE_EXECUTABLE if executable_bit && !stat.is_executable() => return Some(Change::ExecutableBit), _ => return None, }; let new_mode = if stat.is_dir() { Mode::COMMIT - } else if executable_bit && gix_fs::is_executable(stat) { + } else if executable_bit && stat.is_executable() { Mode::FILE_EXECUTABLE } else { Mode::FILE diff --git a/gix-index/src/entry/stat.rs b/gix-index/src/entry/stat.rs index 5e60f8540be..9e279e784ea 100644 --- a/gix-index/src/entry/stat.rs +++ b/gix-index/src/entry/stat.rs @@ -76,11 +76,11 @@ impl Stat { } /// Creates stat information from the result of `symlink_metadata`. 
- pub fn from_fs(fstat: &std::fs::Metadata) -> Result { - let mtime = fstat.modified().unwrap_or(std::time::UNIX_EPOCH); - let ctime = fstat.created().unwrap_or(std::time::UNIX_EPOCH); + pub fn from_fs(stat: &crate::fs::Metadata) -> Result { + let mtime = stat.modified().unwrap_or(std::time::UNIX_EPOCH); + let ctime = stat.created().unwrap_or(std::time::UNIX_EPOCH); - #[cfg(not(unix))] + #[cfg(windows)] let res = Stat { mtime: mtime.try_into()?, ctime: ctime.try_into()?, @@ -89,11 +89,10 @@ impl Stat { uid: 0, gid: 0, // truncation to 32 bits is on purpose (git does the same). - size: fstat.len() as u32, + size: stat.len() as u32, }; - #[cfg(unix)] + #[cfg(not(windows))] let res = { - use std::os::unix::fs::MetadataExt; Stat { mtime: mtime.try_into().unwrap_or_default(), ctime: ctime.try_into().unwrap_or_default(), @@ -101,12 +100,12 @@ impl Stat { // that's what the linux syscalls returns // just rust upcasts to 64 bits for some reason? // numbers this large are impractical anyway (that's a lot of hard-drives). - dev: fstat.dev() as u32, - ino: fstat.ino() as u32, - uid: fstat.uid(), - gid: fstat.gid(), + dev: stat.dev() as u32, + ino: stat.ino() as u32, + uid: stat.uid(), + gid: stat.gid(), // truncation to 32 bits is on purpose (git does the same). - size: fstat.len() as u32, + size: stat.len() as u32, } }; diff --git a/gix-index/src/fs.rs b/gix-index/src/fs.rs new file mode 100644 index 00000000000..21422f9b804 --- /dev/null +++ b/gix-index/src/fs.rs @@ -0,0 +1,166 @@ +//! This module contains a `Metadata` implementation that must be used instead of `std::fs::Metadata` to assure +//! that the `ctime` information is populated exactly like the one in `git`, which wouldn't be the case on unix. +#![allow(clippy::useless_conversion)] // on some macOS systems conversions are required, but on linux usually not. +#![allow(clippy::unnecessary_cast)] + +// it's allowed for good measure, in case there are systems that use different types for that. 
+use std::path::Path; +use std::time::{Duration, SystemTime}; + +/// A structure to partially mirror [`std::fs::Metadata`]. +#[cfg(not(windows))] +pub struct Metadata(rustix::fs::Stat); + +#[cfg(windows)] +/// A structure to partially mirror [`std::fs::Metadata`]. +pub struct Metadata(std::fs::Metadata); + +/// Lifecycle +impl Metadata { + /// Obtain the metadata at `path` without following symlinks. + pub fn from_path_no_follow(path: &Path) -> Result { + #[cfg(not(windows))] + { + rustix::fs::lstat(path).map(Metadata).map_err(Into::into) + } + #[cfg(windows)] + path.symlink_metadata().map(Metadata) + } + + /// Obtain the metadata of the already opened `file`. + pub fn from_file(file: &std::fs::File) -> Result { + #[cfg(not(windows))] + { + rustix::fs::fstat(file).map(Metadata).map_err(Into::into) + } + #[cfg(windows)] + file.metadata().map(Metadata) + } +} + +/// Access +#[allow(clippy::len_without_is_empty)] +impl Metadata { + /// Return true if the metadata belongs to a directory + pub fn is_dir(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFDIR + } + #[cfg(windows)] + self.0.is_dir() + } + + /// Return the time at which the underlying file was modified. + pub fn modified(&self) -> Option { + #[cfg(not(windows))] + { + Some(system_time_from_secs_nanos( + self.0.st_mtime.try_into().ok()?, + self.0.st_mtime_nsec.try_into().ok()?, + )) + } + #[cfg(windows)] + self.0.modified().ok() + } + + /// Return the time at which the underlying file was created. + /// + /// Note that this differs from [`std::fs::Metadata::created()`] which would return + /// the inode birth time, which is notably different to what `git` does. + pub fn created(&self) -> Option { + #[cfg(not(windows))] + { + Some(system_time_from_secs_nanos( + self.0.st_ctime.try_into().ok()?, + self.0.st_ctime_nsec.try_into().ok()?, + )) + } + #[cfg(windows)] + self.0.created().ok() + } + + /// Return the size of the file in bytes. 
+ pub fn len(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_size as u64 + } + #[cfg(windows)] + self.0.len() + } + + /// Return the device id on which the file is located, or 0 on windows. + pub fn dev(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_dev as u64 + } + #[cfg(windows)] + 0 + } + + /// Return the inode id tracking the file, or 0 on windows. + pub fn ino(&self) -> u64 { + #[cfg(not(windows))] + { + self.0.st_ino as u64 + } + #[cfg(windows)] + 0 + } + + /// Return the user-id of the file or 0 on windows. + pub fn uid(&self) -> u32 { + #[cfg(not(windows))] + { + self.0.st_uid as u32 + } + #[cfg(windows)] + 0 + } + + /// Return the group-id of the file or 0 on windows. + pub fn gid(&self) -> u32 { + #[cfg(not(windows))] + { + self.0.st_gid as u32 + } + #[cfg(windows)] + 0 + } + + /// Return `true` if the file's executable bit is set, or `false` on windows. + pub fn is_executable(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFREG && self.0.st_mode & libc::S_IXUSR == libc::S_IXUSR + } + #[cfg(windows)] + gix_fs::is_executable(&self.0) + } + + /// Return `true` if the file is a symbolic link. + pub fn is_symlink(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFLNK + } + #[cfg(windows)] + self.0.is_symlink() + } + + /// Return `true` if this is a regular file, executable or not. + pub fn is_file(&self) -> bool { + #[cfg(not(windows))] + { + (self.0.st_mode & libc::S_IFMT) == libc::S_IFREG + } + #[cfg(windows)] + self.0.is_file() + } +} + +fn system_time_from_secs_nanos(secs: u64, nanos: u32) -> SystemTime { + std::time::UNIX_EPOCH + Duration::new(secs, nanos) +} diff --git a/gix-index/src/lib.rs b/gix-index/src/lib.rs index 55b332a8280..e54c4aaf1fa 100644 --- a/gix-index/src/lib.rs +++ b/gix-index/src/lib.rs @@ -33,6 +33,8 @@ pub mod verify; /// pub mod write; +pub mod fs; + /// All known versions of a git index file. 
#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] From 81347676707d30d414b3126c5714c005cca576c8 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 24 Oct 2023 18:59:36 +0200 Subject: [PATCH 3/7] adapt to changes in `gix-index` --- gix-status/src/index_as_worktree/function.rs | 2 +- gix-worktree-state/src/checkout/entry.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gix-status/src/index_as_worktree/function.rs b/gix-status/src/index_as_worktree/function.rs index 7e1b9c86465..82b50593091 100644 --- a/gix-status/src/index_as_worktree/function.rs +++ b/gix-status/src/index_as_worktree/function.rs @@ -348,7 +348,7 @@ impl<'index> State<'_, 'index> { Err(err) => return Err(Error::Io(err)), }; self.symlink_metadata_calls.fetch_add(1, Ordering::Relaxed); - let metadata = match worktree_path.symlink_metadata() { + let metadata = match gix_index::fs::Metadata::from_path_no_follow(worktree_path) { Ok(metadata) if metadata.is_dir() => { // index entries are normally only for files/symlinks // if a file turned into a directory it was removed diff --git a/gix-worktree-state/src/checkout/entry.rs b/gix-worktree-state/src/checkout/entry.rs index b913c3bbda3..77db18daa1e 100644 --- a/gix-worktree-state/src/checkout/entry.rs +++ b/gix-worktree-state/src/checkout/entry.rs @@ -161,7 +161,7 @@ where file.close()?; } - entry.stat = Stat::from_fs(&std::fs::symlink_metadata(dest)?)?; + entry.stat = Stat::from_fs(&gix_index::fs::Metadata::from_path_no_follow(dest)?)?; obj.data.len() } gix_index::entry::Mode::DIR => { @@ -285,7 +285,7 @@ pub(crate) fn finalize_entry( } // NOTE: we don't call `file.sync_all()` here knowing that some filesystems don't handle this well. // revisit this once there is a bug to fix. 
- entry.stat = Stat::from_fs(&file.metadata()?)?; + entry.stat = Stat::from_fs(&gix_index::fs::Metadata::from_file(&file)?)?; file.close()?; Ok(()) } From 13ab6291eca79b2a5b538d923f7138c3d755d18d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 23 Oct 2023 09:39:32 +0200 Subject: [PATCH 4/7] fix: remove unused dependency and improve documentation slightly --- Cargo.lock | 1 - gix-status/Cargo.toml | 2 -- gix-status/src/index_as_worktree/function.rs | 8 +++++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7d574a99c88..b593fa99e29 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2284,7 +2284,6 @@ dependencies = [ "gix-index 0.26.0", "gix-object 0.38.0", "gix-path 0.10.0", - "gix-pathspec", "gix-worktree 0.27.0", "thiserror", ] diff --git a/gix-status/Cargo.toml b/gix-status/Cargo.toml index ea3263d0f35..0f7d58ddd59 100644 --- a/gix-status/Cargo.toml +++ b/gix-status/Cargo.toml @@ -20,11 +20,9 @@ gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } gix-path = { version = "^0.10.0", path = "../gix-path" } gix-features = { version = "^0.36.0", path = "../gix-features" } -gix-pathspec = { version = "^0.4.0", path = "../gix-pathspec" } gix-filter = { version = "^0.6.0", path = "../gix-filter" } gix-worktree = { version = "^0.27.0", path = "../gix-worktree", default-features = false, features = ["attributes"] } thiserror = "1.0.26" filetime = "0.2.15" bstr = { version = "1.3.0", default-features = false } - diff --git a/gix-status/src/index_as_worktree/function.rs b/gix-status/src/index_as_worktree/function.rs index 82b50593091..d14b2b7234d 100644 --- a/gix-status/src/index_as_worktree/function.rs +++ b/gix-status/src/index_as_worktree/function.rs @@ -25,9 +25,11 @@ use crate::{ /// `submodule` which can take a look at submodules in detail to produce status information (BASE version if its conflicting). /// `options` are used to configure the operation. 
/// -/// Note that `index` may require changes to be up-to-date with the working tree and avoid expensive computations by updating respective entries -/// with stat information from the worktree, and its timestamp is adjusted to the current time for which it will be considered fresh -/// as long as it is included which depends on `pathspec`. All this is delegated to the caller. +/// Note that `index` may require changes to be up-to-date with the working tree and avoid expensive computations by updating +/// respective entries with stat information from the worktree, and its timestamp is adjusted to the current time for which it +/// will be considered fresh. All changes that would be applied to the index are delegated to the caller, which receives these +/// as [`EntryStatus`]. +/// The `pathspec` is used to determine which index entries to check for status in the first place. /// /// `should_interrupt` can be used to stop all processing. /// `filter` is used to convert worktree files back to their internal git representation. For this to be correct, From e2745fd20203bf26a30b563f2817e342df7c4742 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 11 Nov 2023 17:21:53 +0100 Subject: [PATCH 5/7] feat: provide new rename-tracking facilities. They generalize rename tracking to the point where it can work for different kinds of changes. There is still some way to go until it is truly correct though, as it still lacks worktree conversions and diff filters.
--- Cargo.lock | 1 + gix-diff/Cargo.toml | 4 +- gix-diff/src/blob.rs | 15 + gix-diff/src/lib.rs | 28 ++ gix-diff/src/rewrites/mod.rs | 77 +++++ gix-diff/src/rewrites/tracker.rs | 488 +++++++++++++++++++++++++++++++ gix-diff/src/tree/visit.rs | 40 +++ 7 files changed, 652 insertions(+), 1 deletion(-) create mode 100644 gix-diff/src/rewrites/mod.rs create mode 100644 gix-diff/src/rewrites/tracker.rs diff --git a/Cargo.lock b/Cargo.lock index b593fa99e29..bd526b0380e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1519,6 +1519,7 @@ dependencies = [ name = "gix-diff" version = "0.37.0" dependencies = [ + "bstr", "document-features", "getrandom", "gix-hash 0.13.1", diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index b51eaaaa7da..589b3c34b2d 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -12,7 +12,7 @@ autotests = false [features] default = ["blob"] -## Enable diffing of blobs using imara-diff. +## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. blob = ["dep:imara-diff"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] @@ -25,10 +25,12 @@ doctest = false [dependencies] gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } + thiserror = "1.0.32" imara-diff = { version = "0.1.3", optional = true } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"]} getrandom = { version = "0.2.8", optional = true, default-features = false, features = ["js"] } +bstr = { version = "1.5.0", default-features = false } document-features = { version = "0.2.0", optional = true } diff --git a/gix-diff/src/blob.rs b/gix-diff/src/blob.rs index 27c1a131724..7b2a082bd1e 100644 --- a/gix-diff/src/blob.rs +++ b/gix-diff/src/blob.rs @@ -1,3 +1,18 @@ //! 
For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), //! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). +//! +//! +/// Information about the diff performed to detect similarity. +#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. + pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. + pub after: u32, +} + pub use imara_diff::*; diff --git a/gix-diff/src/lib.rs b/gix-diff/src/lib.rs index 6d94a75919f..b3a61b2b97b 100644 --- a/gix-diff/src/lib.rs +++ b/gix-diff/src/lib.rs @@ -8,6 +8,34 @@ cfg_attr(doc, doc = ::document_features::document_features!()) #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] +/// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. +#[derive(Debug, Copy, Clone, PartialEq)] +#[cfg(feature = "blob")] +pub struct Rewrites { + /// If `Some(…)`, also find copies. `None` is the default which does not try to detect copies at all. + /// + /// Note that this is an even more expensive operation than detecting renames stemming from additions and deletions + /// as the resulting set to search through is usually larger. + pub copies: Option, + /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. + /// This field is similar to `git diff -M50%`. + /// + /// If `None`, files are only considered equal if their content matches 100%. + /// Note that values greater than 1.0 have no different effect than 1.0. + pub percentage: Option, + /// The amount of files to consider for fuzzy rename or copy tracking. 
Defaults to 1000, meaning that only 1000*1000 + /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. + /// If 0, there is no limit. + /// + /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not + /// run the fuzzy version of identity tests at all. That way results are never partial. + pub limit: usize, +} + +/// Contains a [Tracker](rewrites::Tracker) to detect rewrites. +#[cfg(feature = "blob")] +pub mod rewrites; + /// pub mod tree; diff --git a/gix-diff/src/rewrites/mod.rs b/gix-diff/src/rewrites/mod.rs new file mode 100644 index 00000000000..8af13165f6f --- /dev/null +++ b/gix-diff/src/rewrites/mod.rs @@ -0,0 +1,77 @@ +use crate::Rewrites; + +/// Types related to the rename tracker for renames, rewrites and copies. +pub mod tracker; + +/// A type to retain state related to an ongoing tracking operation to retain sets of interesting changes +/// of which some are retained to at a later stage compute the ones that seem to be renames or copies. +pub struct Tracker { + /// The tracked items thus far, which will be used to determine renames/copies and rewrites later. + items: Vec>, + /// A place to store all paths in to reduce amount of allocations. + path_backing: Vec, + /// A buffer for use when fetching objects for similarity tests. + buf1: Vec, + /// Another buffer for use when fetching objects for similarity tests. + buf2: Vec, + /// How to track copies and/or rewrites. + rewrites: Rewrites, + /// The diff algorithm to use when checking for similarity. + diff_algo: crate::blob::Algorithm, +} + +/// Determine in which set of files to search for copies. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq)] +pub enum CopySource { + /// Find copies from the set of modified files only. + #[default] + FromSetOfModifiedFiles, + /// Find copies from the set of modified files, as well as all files known to the source (i.e. 
previous state of the tree). + /// + /// This can be an expensive operation as it scales exponentially with the total amount of files in the set. + FromSetOfModifiedFilesAndAllSources, +} + +/// Under which circumstances we consider a file to be a copy. +#[derive(Debug, Copy, Clone, PartialEq)] +pub struct Copies { + /// The set of files to search when finding the source of copies. + pub source: CopySource, + /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. + /// + /// Useful to have similarity-based rename tracking and cheaper copy tracking. + pub percentage: Option, +} + +impl Default for Copies { + fn default() -> Self { + Copies { + source: CopySource::default(), + percentage: Some(0.5), + } + } +} + +/// Information collected while handling rewrites of files which may be tracked. +#[derive(Default, Clone, Copy, Debug, PartialEq)] +pub struct Outcome { + /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. + pub options: Rewrites, + /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. + pub num_similarity_checks: usize, + /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, + /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. + pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, +} + +/// The default settings for rewrites according to the git configuration defaults. 
+impl Default for Rewrites { + fn default() -> Self { + Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1000, + } + } +} diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs new file mode 100644 index 00000000000..09d3c724608 --- /dev/null +++ b/gix-diff/src/rewrites/tracker.rs @@ -0,0 +1,488 @@ +use std::ops::Range; + +use gix_object::tree::{EntryKind, EntryMode}; + +use crate::blob::DiffLineStats; +use crate::rewrites::{CopySource, Outcome}; +use crate::{rewrites::Tracker, Rewrites}; +use bstr::BStr; +use gix_object::FindExt; + +/// The kind of a change. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] +pub enum ChangeKind { + /// The change represents the *deletion* of an item. + Deletion, + /// The change represents the *modification* of an item. + Modification, + /// The change represents the *addition* of an item. + Addition, +} + +/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. +pub trait Change: Clone { + /// Return the hash of this change for identification. + fn id(&self) -> &gix_hash::oid; + /// Return the kind of this change. + fn kind(&self) -> ChangeKind; + /// Return more information about the kind of entry affected by this change. + fn entry_mode(&self) -> EntryMode; + /// Return the id of the change along with its mode. + fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode); +} + +/// A set of tracked items allows to figure out their relations by figuring out their similarity. +pub(crate) struct Item { + /// The underlying raw change + change: T, + /// That slice into the backing for paths. + path: Range, + /// If true, this item was already emitted, i.e. seen by the caller. 
+ emitted: bool, +} + +impl Item { + fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { + backing[self.path.clone()].as_ref() + } + fn entry_mode_compatible(&self, mode: EntryMode) -> bool { + use EntryKind::*; + matches!( + (mode.kind(), self.change.entry_mode().kind()), + (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) + ) + } + + fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool { + self.entry_mode_compatible(dest_item_mode) + && match kind { + visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion), + visit::SourceKind::Copy => { + matches!(self.change.kind(), ChangeKind::Modification) + } + } + } +} + +/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()). +pub mod visit { + use crate::blob::DiffLineStats; + use bstr::BStr; + use gix_object::tree::EntryMode; + + /// The source of a rewrite, rename or copy. + pub struct Source<'a> { + /// The kind of entry. + pub entry_mode: EntryMode, + /// The hash of the state of the source as seen in the object database. + pub id: gix_hash::ObjectId, + /// Further specify what kind of source this is. + pub kind: SourceKind, + /// The repository-relative location of this entry. + pub location: &'a BStr, + /// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination. + pub diff: Option, + } + + /// Further identify the kind of [Source]. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum SourceKind { + /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. + Rename, + /// This is the source of a copy, as `source` was copied into `destination`. + Copy, + } + + /// A change along with a location. + pub struct Destination<'a, T> { + /// The change at the given `location`. + pub change: T, + /// The repository-relative location of this destination. 
+ pub location: &'a BStr, + } +} + +/// +pub mod emit { + /// The error returned by [Tracker::emit()](super::Tracker::emit()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not find blob for similarity checking")] + FindExistingBlob(#[from] gix_object::find::existing_object::Error), + #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] + GetItemsForExhaustiveCopyDetection(#[source] Box), + } +} + +/// Lifecycle +impl Tracker { + /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing + /// similarity checking. + pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self { + Tracker { + items: vec![], + path_backing: vec![], + buf1: Vec::new(), + buf2: Vec::new(), + rewrites, + diff_algo, + } + } +} + +/// build state and find matches. +impl Tracker { + /// We may refuse the push if that information isn't needed for what we have to track. + pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option { + if !change.entry_mode().is_blob_or_symlink() { + return Some(change); + } + let keep = match (self.rewrites.copies, change.kind()) { + (Some(_find_copies), _) => true, + (None, ChangeKind::Modification { .. }) => false, + (None, _) => true, + }; + + if !keep { + return Some(change); + } + + let start = self.path_backing.len(); + self.path_backing.extend_from_slice(location); + self.items.push(Item { + path: start..self.path_backing.len(), + change, + emitted: false, + }); + None + } + + /// Can only be called once effectively as it alters its own state. + /// + /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's + /// the destination of a copy or rename, or with `None` for source if no relation to other + /// items in the tracked set exist. 
+ /// + /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. + /// Worktree filters and diff conversions will be applied afterwards automatically. + /// + /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source + /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` + /// will panic if `change` is not a modification, and it's valid to not call `push` at all. + pub fn emit( + &mut self, + mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + objects: &dyn gix_object::Find, + mut push_source_tree: PushSourceTreeFn, + ) -> Result + where + PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, + { + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { + a.change + .id() + .cmp(b.change.id()) + .then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end))) + } + self.items.sort_by(by_id_and_location); + + let mut out = Outcome { + options: self.rewrites, + ..Default::default() + }; + out = self.match_pairs_of_kind( + visit::SourceKind::Rename, + &mut cb, + self.rewrites.percentage, + out, + objects, + )?; + + if let Some(copies) = self.rewrites.copies { + out = self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + + match copies.source { + CopySource::FromSetOfModifiedFiles => {} + CopySource::FromSetOfModifiedFilesAndAllSources => { + push_source_tree(&mut |change, location| { + assert!( + self.try_push_change(change, location).is_none(), + "we must accept every change" + ); + // make sure these aren't viable to be emitted anymore. 
+ self.items.last_mut().expect("just pushed").emitted = true; + }) + .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; + self.items.sort_by(by_id_and_location); + + out = + self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + } + } + } + + self.items + .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); + for item in self.items.drain(..).filter(|item| !item.emitted) { + if cb( + visit::Destination { + location: item.location(&self.path_backing), + change: item.change, + }, + None, + ) == crate::tree::visit::Action::Cancel + { + break; + } + } + Ok(out) + } +} + +impl Tracker { + fn match_pairs_of_kind( + &mut self, + kind: visit::SourceKind, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + mut out: Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. + let needs_second_pass = !needs_exact_match(percentage); + if self.match_pairs(cb, None /* by identity */, kind, &mut out, objects)? 
== crate::tree::visit::Action::Cancel + { + return Ok(out); + } + if needs_second_pass { + let is_limited = if self.rewrites.limit == 0 { + false + } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { + match kind { + visit::SourceKind::Rename => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::SourceKind::Copy => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } + } + true + } else { + false + }; + if !is_limited { + self.match_pairs(cb, percentage, kind, &mut out, objects)?; + } + } + Ok(out) + } + + fn match_pairs( + &mut self, + cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, + percentage: Option, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + ) -> Result { + // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` + let mut dest_ofs = 0; + while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { + (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) + }) { + dest_idx += dest_ofs; + dest_ofs = dest_idx + 1; + let src = find_match( + &self.items, + dest, + dest_idx, + percentage.map(|p| (p, self.diff_algo)), + kind, + stats, + objects, + &mut self.buf1, + &mut self.buf2, + )? 
+ .map(|(src_idx, src, diff)| { + let (id, entry_mode) = src.change.id_and_entry_mode(); + let id = id.to_owned(); + let location = src.location(&self.path_backing); + ( + visit::Source { + entry_mode, + id, + kind, + location, + diff, + }, + src_idx, + ) + }); + if src.is_none() { + continue; + } + let location = dest.location(&self.path_backing); + let change = dest.change.clone(); + let dest = visit::Destination { change, location }; + self.items[dest_idx].emitted = true; + if let Some(src_idx) = src.as_ref().map(|t| t.1) { + self.items[src_idx].emitted = true; + } + if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel { + return Ok(crate::tree::visit::Action::Cancel); + } + } + Ok(crate::tree::visit::Action::Continue) + } +} + +fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::SourceKind) -> Option { + let (sources, destinations) = items + .iter() + .filter(|item| match kind { + visit::SourceKind::Rename => !item.emitted, + visit::SourceKind::Copy => true, + }) + .fold((0, 0), |(mut src, mut dest), item| { + match item.change.kind() { + ChangeKind::Addition => { + dest += 1; + } + ChangeKind::Deletion => { + if kind == visit::SourceKind::Rename { + src += 1 + } + } + ChangeKind::Modification => { + if kind == visit::SourceKind::Copy { + src += 1 + } + } + } + (src, dest) + }); + let permutations = sources * destinations; + (permutations > limit * limit).then_some(permutations) +} + +fn needs_exact_match(percentage: Option) -> bool { + percentage.map_or(true, |p| p >= 1.0) +} + +/// <`src_idx`, src, possibly diff stat> +type SourceTuple<'a, T> = (usize, &'a Item, Option); + +/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. +/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. +/// We also ignore emitted items entirely. 
+/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or +/// any non-deletion otherwise. +/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set +/// of items to be searched. +#[allow(clippy::too_many_arguments)] +fn find_match<'a, T: Change>( + items: &'a [Item], + item: &Item, + item_idx: usize, + percentage: Option<(f32, crate::blob::Algorithm)>, + kind: visit::SourceKind, + stats: &mut Outcome, + objects: &dyn gix_object::Find, + buf1: &mut Vec, + buf2: &mut Vec, +) -> Result>, emit::Error> { + let (item_id, item_mode) = item.change.id_and_entry_mode(); + if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() { + let first_idx = items.partition_point(|a| a.change.id() < item_id); + let range = match items.get(first_idx..).map(|items| { + let end = items + .iter() + .position(|a| a.change.id() != item_id) + .map_or(items.len(), |idx| first_idx + idx); + first_idx..end + }) { + Some(range) => range, + None => return Ok(None), + }; + if range.is_empty() { + return Ok(None); + } + let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { + src_idx += range.start; + (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) + }); + if let Some(src) = res { + return Ok(Some(src)); + } + } else { + let new = objects.find_blob(item_id, buf1)?; + let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); + debug_assert_eq!( + item.change.entry_mode().kind(), + EntryKind::Blob, + "symlinks are matched exactly, and trees aren't used here" + ); + for (can_idx, src) in items + .iter() + .enumerate() + .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) + { + let old = objects.find_blob(src.change.id(), buf2)?; + // TODO: make sure we get attribute handling/worktree 
conversion and binary skips and filters right here. + let tokens = crate::blob::intern::InternedInput::new( + crate::blob::sources::byte_lines_with_terminator(old.data), + crate::blob::sources::byte_lines_with_terminator(new.data), + ); + let counts = crate::blob::diff( + algo, + &tokens, + crate::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; + stats.num_similarity_checks += 1; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + } + .into(), + ))); + } + } + } + Ok(None) +} + +mod diff { + use std::ops::Range; + + pub struct Statistics<'a, 'data> { + pub removed_bytes: usize, + pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>, + } + + impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> { + type Out = usize; + + fn process_change(&mut self, before: Range, _after: Range) { + self.removed_bytes = self.input.before[before.start as usize..before.end as usize] + .iter() + .map(|token| self.input.interner[*token].len()) + .sum(); + } + + fn finish(self) -> Self::Out { + self.removed_bytes + } + } +} diff --git a/gix-diff/src/tree/visit.rs b/gix-diff/src/tree/visit.rs index 82e38931dc2..c279ed90888 100644 --- a/gix-diff/src/tree/visit.rs +++ b/gix-diff/src/tree/visit.rs @@ -92,6 +92,46 @@ pub trait Visit { fn visit(&mut self, change: Change) -> Action; } +#[cfg(feature = "blob")] +mod change_impls { + use crate::rewrites::tracker::ChangeKind; + use crate::tree::visit::Change; + use gix_hash::oid; + use gix_object::tree::EntryMode; + + impl crate::rewrites::tracker::Change for crate::tree::visit::Change { + fn id(&self) -> &oid { + match self { + 
Change::Addition { oid, .. } | Change::Deletion { oid, .. } | Change::Modification { oid, .. } => oid, + } + } + + fn kind(&self) -> ChangeKind { + match self { + Change::Addition { .. } => ChangeKind::Addition, + Change::Deletion { .. } => ChangeKind::Deletion, + Change::Modification { .. } => ChangeKind::Modification, + } + } + + fn entry_mode(&self) -> EntryMode { + match self { + Change::Addition { entry_mode, .. } + | Change::Deletion { entry_mode, .. } + | Change::Modification { entry_mode, .. } => *entry_mode, + } + } + + fn id_and_entry_mode(&self) -> (&oid, EntryMode) { + match self { + Change::Addition { entry_mode, oid, .. } + | Change::Deletion { entry_mode, oid, .. } + | Change::Modification { entry_mode, oid, .. } => (oid, *entry_mode), + } + } + } +} + #[cfg(test)] mod tests { use super::*; From 089c4dc8b7d323637e5f9a9f7446f2a8e9f51ce1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 1 Nov 2023 10:19:44 +0100 Subject: [PATCH 6/7] feat!: generalize rename-tracking engine for later use with status. Previously the rename tracking engine was integrated with tree-diffs, but already operates in a stand-alone fashion. Now it's officially generalized which allows it to be tested separately and used when tracking renames for diffs between index and tree, index and index, and index and worktree. 
--- gix/src/config/cache/access.rs | 8 +- gix/src/config/mod.rs | 2 +- gix/src/diff.rs | 63 +++ gix/src/object/tree/diff/change.rs | 14 +- gix/src/object/tree/diff/for_each.rs | 102 ++++- gix/src/object/tree/diff/mod.rs | 31 +- gix/src/object/tree/diff/rewrites.rs | 108 ------ gix/src/object/tree/diff/tracked.rs | 554 --------------------------- gix/tests/object/tree/diff.rs | 9 +- 9 files changed, 157 insertions(+), 734 deletions(-) delete mode 100644 gix/src/object/tree/diff/rewrites.rs delete mode 100644 gix/src/object/tree/diff/tracked.rs diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index e8363e1f6b7..ec3e7e1b424 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -93,13 +93,9 @@ impl Cache { } #[cfg(feature = "blob-diff")] - pub(crate) fn diff_renames( - &self, - ) -> Result, crate::object::tree::diff::rewrites::Error> { + pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { self.diff_renames - .get_or_try_init(|| { - crate::object::tree::diff::Rewrites::try_from_config(&self.resolved, self.lenient_config) - }) + .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) .copied() } diff --git a/gix/src/config/mod.rs b/gix/src/config/mod.rs index 438c54378a9..f48575c174f 100644 --- a/gix/src/config/mod.rs +++ b/gix/src/config/mod.rs @@ -515,7 +515,7 @@ pub(crate) struct Cache { pub(crate) url_rewrite: OnceCell, /// The lazy-loaded rename information for diffs. 
#[cfg(feature = "blob-diff")] - pub(crate) diff_renames: OnceCell>, + pub(crate) diff_renames: OnceCell>, /// A lazily loaded mapping to know which url schemes to allow #[cfg(any(feature = "blocking-network-client", feature = "async-network-client"))] pub(crate) url_scheme: OnceCell, diff --git a/gix/src/diff.rs b/gix/src/diff.rs index b1081929394..445698cea39 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -15,3 +15,66 @@ pub mod rename { RenamesAndCopies, } } + +/// +#[cfg(feature = "blob-diff")] +mod utils { + use crate::config::cache::util::ApplyLeniency; + use crate::config::tree::Diff; + use crate::diff::rename::Tracking; + use gix_diff::rewrites::Copies; + use gix_diff::Rewrites; + + /// + pub mod new_rewrites { + /// The error returned by [`new_rewrites()`](super::new_rewrites()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + ConfigDiffRenames(#[from] crate::config::key::GenericError), + #[error(transparent)] + ConfigDiffRenameLimit(#[from] crate::config::unsigned_integer::Error), + } + } + + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. + /// Returns `Ok(None)` if nothing is configured. + /// + /// Note that missing values will be defaulted similar to what git does. + #[allow(clippy::result_large_err)] + pub fn new_rewrites( + config: &gix_config::File<'static>, + lenient: bool, + ) -> Result, new_rewrites::Error> { + let key = "diff.renames"; + let copies = match config + .boolean_by_key(key) + .map(|value| Diff::RENAMES.try_into_renames(value)) + .transpose() + .with_leniency(lenient)? 
+ { + Some(renames) => match renames { + Tracking::Disabled => return Ok(None), + Tracking::Renames => None, + Tracking::RenamesAndCopies => Some(Copies::default()), + }, + None => return Ok(None), + }; + + let default = Rewrites::default(); + Ok(Rewrites { + copies, + limit: config + .integer_by_key("diff.renameLimit") + .map(|value| Diff::RENAME_LIMIT.try_into_usize(value)) + .transpose() + .with_leniency(lenient)? + .unwrap_or(default.limit), + ..default + } + .into()) + } +} +#[cfg(feature = "blob-diff")] +pub use utils::new_rewrites; diff --git a/gix/src/object/tree/diff/change.rs b/gix/src/object/tree/diff/change.rs index e6826d6ed32..a95770d6656 100644 --- a/gix/src/object/tree/diff/change.rs +++ b/gix/src/object/tree/diff/change.rs @@ -1,18 +1,6 @@ +use crate::diff::blob::DiffLineStats; use crate::{bstr::BStr, Id}; -/// Information about the diff performed to detect similarity of a [Rewrite][Event::Rewrite]. -#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] -pub struct DiffLineStats { - /// The amount of lines to remove from the source to get to the destination. - pub removals: u32, - /// The amount of lines to add to the source to get to the destination. - pub insertions: u32, - /// The amount of lines of the previous state, in the source. - pub before: u32, - /// The amount of lines of the new state, in the destination. - pub after: u32, -} - /// An event emitted when finding differences between two trees. 
#[derive(Debug, Clone, Copy)] pub enum Event<'a, 'old, 'new> { diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index cd9c60f547d..404e804327d 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -1,15 +1,8 @@ use gix_object::TreeRefIter; use super::{change, Action, Change, Platform}; -use crate::{ - bstr::BStr, - ext::ObjectIdExt, - object::tree::{ - diff, - diff::{rewrites, tracked}, - }, - Repository, Tree, -}; +use crate::diff::rewrites::tracker; +use crate::{bstr::BStr, diff::rewrites, ext::ObjectIdExt, object::tree::diff, Repository, Tree}; /// The error return by methods on the [diff platform][Platform]. #[derive(Debug, thiserror::Error)] @@ -19,12 +12,10 @@ pub enum Error { Diff(#[from] gix_diff::tree::changes::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box), - #[error("Could not find blob for similarity checking")] - FindExistingBlob(#[from] crate::object::find::existing::Error), #[error("Could not configure diff algorithm prior to checking similarity")] ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), - #[error("Could not traverse tree to obtain possible sources for copies")] - TraverseTreeForExhaustiveCopyDetection(#[from] gix_traverse::tree::breadthfirst::Error), + #[error("Failure during rename tracking")] + RenameTracking(#[from] tracker::emit::Error), } /// @@ -49,12 +40,14 @@ impl<'a, 'old> Platform<'a, 'old> { E: std::error::Error + Sync + Send + 'static, { let repo = self.lhs.repo; + let diff_algo = repo.config.diff_algorithm()?; let mut delegate = Delegate { src_tree: self.lhs, other_repo: other.repo, recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), visit: for_each, - tracked: self.rewrites.map(|r| tracked::State::new(r, self.tracking)), + location: self.tracking, + tracked: self.rewrites.map(|r| rewrites::Tracker::new(r, diff_algo)), err: None, }; match 
gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( @@ -87,7 +80,8 @@ struct Delegate<'a, 'old, 'new, VisitFn, E> { other_repo: &'new Repository, recorder: gix_diff::tree::Recorder, visit: VisitFn, - tracked: Option, + tracked: Option>, + location: Option, err: Option, } @@ -151,14 +145,14 @@ where location: dest.location, event: diff::change::Event::Rewrite { source_location: source.location, - source_entry_mode: source.mode, + source_entry_mode: source.entry_mode, source_id: source.id.attach(self.src_tree.repo), entry_mode: mode, id: oid.to_owned().attach(self.other_repo), diff: source.diff, copy: match source.kind { - tracked::visit::Kind::RenameTarget => false, - tracked::visit::Kind::CopyDestination => true, + tracker::visit::SourceKind::Rename => false, + tracker::visit::SourceKind::Copy => true, }, }, }; @@ -180,7 +174,12 @@ where &mut self.err, ), }, - self.src_tree, + &self.src_tree.repo.objects, + |push| { + self.src_tree + .traverse() + .breadthfirst(&mut tree_to_changes::Delegate::new(push, self.location)) + }, )?; Ok(Some(outcome)) } @@ -233,3 +232,68 @@ where } } } + +mod tree_to_changes { + use gix_diff::tree::visit::Change; + use gix_object::tree::EntryRef; + + use crate::bstr::BStr; + + pub struct Delegate<'a> { + push: &'a mut dyn FnMut(Change, &BStr), + recorder: gix_traverse::tree::Recorder, + } + + impl<'a> Delegate<'a> { + pub fn new( + push: &'a mut dyn FnMut(Change, &BStr), + location: Option, + ) -> Self { + let location = location.map(|t| match t { + gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, + gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, + }); + Self { + push, + recorder: gix_traverse::tree::Recorder::default().track_location(location), + } + } + } + + impl gix_traverse::tree::Visit for Delegate<'_> { + fn pop_front_tracked_path_and_set_current(&mut self) { + 
self.recorder.pop_front_tracked_path_and_set_current() + } + + fn push_back_tracked_path_component(&mut self, component: &BStr) { + self.recorder.push_back_tracked_path_component(component) + } + + fn push_path_component(&mut self, component: &BStr) { + self.recorder.push_path_component(component) + } + + fn pop_path_component(&mut self) { + self.recorder.pop_path_component(); + } + + fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + gix_traverse::tree::visit::Action::Continue + } + + fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { + if entry.mode.is_blob() { + (self.push)( + Change::Modification { + previous_entry_mode: entry.mode, + previous_oid: gix_hash::ObjectId::null(entry.oid.kind()), + entry_mode: entry.mode, + oid: entry.oid.to_owned(), + }, + self.recorder.path(), + ); + } + gix_traverse::tree::visit::Action::Continue + } + } +} diff --git a/gix/src/object/tree/diff/mod.rs b/gix/src/object/tree/diff/mod.rs index 5f7a041e4df..b5e6c5bae4d 100644 --- a/gix/src/object/tree/diff/mod.rs +++ b/gix/src/object/tree/diff/mod.rs @@ -1,5 +1,6 @@ use gix_diff::tree::recorder::Location; +use crate::diff::Rewrites; use crate::{bstr::BStr, Tree}; /// Returned by the `for_each` function to control flow. @@ -39,7 +40,7 @@ impl<'repo> Tree<'repo> { /// try to access blobs to compute a similarity metric. Thus, it's more compatible to turn rewrite tracking off /// using [`Platform::track_rewrites()`]. #[allow(clippy::result_large_err)] - pub fn changes<'a>(&'a self) -> Result, rewrites::Error> { + pub fn changes<'a>(&'a self) -> Result, crate::diff::new_rewrites::Error> { Ok(Platform { state: Default::default(), lhs: self, @@ -58,34 +59,6 @@ pub struct Platform<'a, 'repo> { rewrites: Option, } -/// A structure to capture how to perform rename and copy tracking -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct Rewrites { - /// If `Some(…)`, do also find copies. 
`None` is the default which does not try to detect copies at all. - /// - /// Note that this is an even more expensive operation than detecting renames as files. - pub copies: Option, - /// The percentage of similarity needed for files to be considered renamed, defaulting to `Some(0.5)`. - /// This field is similar to `git diff -M50%`. - /// - /// If `None`, files are only considered equal if their content matches 100%. - /// Note that values greater than 1.0 have no different effect than 1.0. - pub percentage: Option, - /// The amount of files to consider for fuzzy rename or copy tracking. Defaults to 1000, meaning that only 1000*1000 - /// combinations can be tested for fuzzy matches, i.e. the ones that try to find matches by comparing similarity. - /// If 0, there is no limit. - /// - /// If the limit would not be enough to test the entire set of combinations, the algorithm will trade in precision and not - /// run the fuzzy version of identity tests at all. That way results are never partial. - pub limit: usize, -} - -/// -pub mod rewrites; - -/// types to actually perform rename tracking. -pub(crate) mod tracked; - /// Configuration impl<'a, 'repo> Platform<'a, 'repo> { /// Keep track of file-names, which makes the [`location`][Change::location] field usable with the filename of the changed item. diff --git a/gix/src/object/tree/diff/rewrites.rs b/gix/src/object/tree/diff/rewrites.rs deleted file mode 100644 index e434726d9e6..00000000000 --- a/gix/src/object/tree/diff/rewrites.rs +++ /dev/null @@ -1,108 +0,0 @@ -use crate::{ - config::{cache::util::ApplyLeniency, tree::Diff}, - diff::rename::Tracking, - object::tree::diff::Rewrites, -}; - -/// From where to source copies -#[derive(Debug, Copy, Clone, Eq, PartialEq)] -pub enum CopySource { - /// Find copies from the set of modified files only. - FromSetOfModifiedFiles, - /// Find copies from the set of changed files, as well as all files known to the source (i.e. previous state) of the tree. 
- /// - /// This can be an expensive operation as it scales exponentially with the total amount of files in the tree. - FromSetOfModifiedFilesAndSourceTree, -} - -/// How to determine copied files. -#[derive(Debug, Copy, Clone, PartialEq)] -pub struct Copies { - /// The set of files to search when finding the source of copies. - pub source: CopySource, - /// Equivalent to [`Rewrites::percentage`], but used for copy tracking. - /// - /// Useful to have similarity-based rename tracking and cheaper copy tracking, which also is the default - /// as only identity plays a role. - pub percentage: Option, -} - -impl Default for Copies { - fn default() -> Self { - Copies { - source: CopySource::FromSetOfModifiedFiles, - percentage: Some(0.5), - } - } -} - -/// Information collected while handling rewrites of files which may be tracked. -#[derive(Default, Clone, Copy, Debug, PartialEq)] -pub struct Outcome { - /// The options used to guide the rewrite tracking. Either fully provided by the caller or retrieved from git configuration. - pub options: Rewrites, - /// The amount of similarity checks that have been conducted to find renamed files and potentially copies. - pub num_similarity_checks: usize, - /// Set to the amount of worst-case rename permutations we didn't search as our limit didn't allow it. - pub num_similarity_checks_skipped_for_rename_tracking_due_to_limit: usize, - /// Set to the amount of worst-case copy permutations we didn't search as our limit didn't allow it. - pub num_similarity_checks_skipped_for_copy_tracking_due_to_limit: usize, -} - -/// The error returned by [`Rewrites::try_from_config()`]. -#[derive(Debug, thiserror::Error)] -#[allow(missing_docs)] -pub enum Error { - #[error(transparent)] - ConfigDiffRenames(#[from] crate::config::key::GenericError), - #[error(transparent)] - ConfigDiffRenameLimit(#[from] crate::config::unsigned_integer::Error), -} - -/// The default settings for rewrites according to the git configuration defaults. 
-impl Default for Rewrites { - fn default() -> Self { - Rewrites { - copies: None, - percentage: Some(0.5), - limit: 1000, - } - } -} - -impl Rewrites { - /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. - /// Returns `Ok(None)` if nothing is configured. - /// - /// Note that missing values will be defaulted similar to what git does. - #[allow(clippy::result_large_err)] - pub fn try_from_config(config: &gix_config::File<'static>, lenient: bool) -> Result, Error> { - let key = "diff.renames"; - let copies = match config - .boolean_by_key(key) - .map(|value| Diff::RENAMES.try_into_renames(value)) - .transpose() - .with_leniency(lenient)? - { - Some(renames) => match renames { - Tracking::Disabled => return Ok(None), - Tracking::Renames => None, - Tracking::RenamesAndCopies => Some(Copies::default()), - }, - None => return Ok(None), - }; - - let default = Self::default(); - Ok(Rewrites { - copies, - limit: config - .integer_by_key("diff.renameLimit") - .map(|value| Diff::RENAME_LIMIT.try_into_usize(value)) - .transpose() - .with_leniency(lenient)? - .unwrap_or(default.limit), - ..default - } - .into()) - } -} diff --git a/gix/src/object/tree/diff/tracked.rs b/gix/src/object/tree/diff/tracked.rs deleted file mode 100644 index 318ce295063..00000000000 --- a/gix/src/object/tree/diff/tracked.rs +++ /dev/null @@ -1,554 +0,0 @@ -use std::ops::Range; - -use gix_diff::tree::visit::Change; -use gix_object::tree::{EntryKind, EntryMode}; - -use crate::{ - bstr::BStr, - ext::ObjectIdExt, - object::tree::diff::{ - change::DiffLineStats, - rewrites::{CopySource, Outcome}, - Rewrites, - }, - Repository, Tree, -}; - -/// A set of tracked items allows to figure out their relations by figuring out their similarity. -pub struct Item { - /// The underlying raw change - change: Change, - /// That slice into the backing for paths. - location: Range, - /// If true, this item was already emitted, i.e. seen by the caller. 
- emitted: bool, -} - -impl Item { - fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr { - backing[self.location.clone()].as_ref() - } - fn entry_mode_compatible(&self, mode: EntryMode) -> bool { - use EntryKind::*; - matches!( - (mode.kind(), self.change.entry_mode().kind()), - (Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link) - ) - } - - fn is_source_for_destination_of(&self, kind: visit::Kind, dest_item_mode: EntryMode) -> bool { - self.entry_mode_compatible(dest_item_mode) - && match kind { - visit::Kind::RenameTarget => !self.emitted && matches!(self.change, Change::Deletion { .. }), - visit::Kind::CopyDestination => { - matches!(self.change, Change::Modification { .. }) - } - } - } -} - -pub struct State { - items: Vec, - path_backing: Vec, - rewrites: Rewrites, - tracking: Option, -} - -pub mod visit { - use crate::{bstr::BStr, object::tree::diff::change::DiffLineStats}; - - pub struct Source<'a> { - pub mode: gix_object::tree::EntryMode, - pub id: gix_hash::ObjectId, - pub kind: Kind, - pub location: &'a BStr, - pub diff: Option, - } - - #[derive(Debug, Copy, Clone, Eq, PartialEq)] - pub enum Kind { - RenameTarget, - CopyDestination, - } - - pub struct Destination<'a> { - pub change: gix_diff::tree::visit::Change, - pub location: &'a BStr, - } -} - -impl State { - pub(crate) fn new(renames: Rewrites, tracking: Option) -> Self { - State { - items: vec![], - path_backing: vec![], - rewrites: renames, - tracking, - } - } -} - -/// build state and find matches. -impl State { - /// We may refuse the push if that information isn't needed for what we have to track. - pub fn try_push_change(&mut self, change: Change, location: &BStr) -> Option { - if !change.entry_mode().is_blob_or_symlink() { - return Some(change); - } - let keep = match (self.rewrites.copies, &change) { - (Some(_find_copies), _) => true, - (None, Change::Modification { .. 
}) => false, - (None, _) => true, - }; - - if !keep { - return Some(change); - } - - let start = self.path_backing.len(); - self.path_backing.extend_from_slice(location); - self.items.push(Item { - location: start..self.path_backing.len(), - change, - emitted: false, - }); - None - } - - /// Can only be called once effectively as it alters its own state. - /// - /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's - /// the destination of a copy or rename, or with `None` for source if no relation to other - /// items in the tracked set exist. - pub fn emit( - &mut self, - mut cb: impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - src_tree: &Tree<'_>, - ) -> Result { - fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { - a.change.oid().cmp(b.change.oid()).then_with(|| { - a.location - .start - .cmp(&b.location.start) - .then(a.location.end.cmp(&b.location.end)) - }) - } - self.items.sort_by(by_id_and_location); - - let mut out = Outcome { - options: self.rewrites, - ..Default::default() - }; - out = self.match_pairs_of_kind( - visit::Kind::RenameTarget, - &mut cb, - self.rewrites.percentage, - out, - src_tree.repo, - )?; - - if let Some(copies) = self.rewrites.copies { - out = self.match_pairs_of_kind( - visit::Kind::CopyDestination, - &mut cb, - copies.percentage, - out, - src_tree.repo, - )?; - - match copies.source { - CopySource::FromSetOfModifiedFiles => {} - CopySource::FromSetOfModifiedFilesAndSourceTree => { - src_tree - .traverse() - .breadthfirst(&mut tree_to_events::Delegate::new(self))?; - self.items.sort_by(by_id_and_location); - - out = self.match_pairs_of_kind( - visit::Kind::CopyDestination, - &mut cb, - copies.percentage, - out, - src_tree.repo, - )?; - } - } - } - - self.items - .sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing))); - for item in self.items.drain(..).filter(|item| !item.emitted) { - if cb( - visit::Destination { - 
location: item.location(&self.path_backing), - change: item.change, - }, - None, - ) == gix_diff::tree::visit::Action::Cancel - { - break; - } - } - Ok(out) - } - - fn match_pairs_of_kind( - &mut self, - kind: visit::Kind, - cb: &mut impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - percentage: Option, - mut out: Outcome, - repo: &Repository, - ) -> Result { - // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. - let needs_second_pass = !needs_exact_match(percentage); - if self.match_pairs(cb, None /* by identity */, kind, repo, &mut out)? == gix_diff::tree::visit::Action::Cancel - { - return Ok(out); - } - if needs_second_pass { - let is_limited = if self.rewrites.limit == 0 { - false - } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { - match kind { - visit::Kind::RenameTarget => { - out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; - } - visit::Kind::CopyDestination => { - out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; - } - } - true - } else { - false - }; - if !is_limited { - self.match_pairs(cb, self.rewrites.percentage, kind, repo, &mut out)?; - } - } - Ok(out) - } - - fn match_pairs( - &mut self, - cb: &mut impl FnMut(visit::Destination<'_>, Option>) -> gix_diff::tree::visit::Action, - percentage: Option, - kind: visit::Kind, - repo: &Repository, - stats: &mut Outcome, - ) -> Result { - // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` - let mut dest_ofs = 0; - while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { - (!item.emitted && matches!(item.change, Change::Addition { .. 
})).then_some((idx, item)) - }) { - dest_idx += dest_ofs; - dest_ofs = dest_idx + 1; - let src = - find_match(&self.items, dest, dest_idx, percentage, kind, repo, stats)?.map(|(src_idx, src, diff)| { - let (id, mode) = src.change.oid_and_entry_mode(); - let id = id.to_owned(); - let location = src.location(&self.path_backing); - ( - visit::Source { - mode, - id, - kind, - location, - diff, - }, - src_idx, - ) - }); - if src.is_none() { - continue; - } - let location = dest.location(&self.path_backing); - let change = dest.change.clone(); - let dest = visit::Destination { change, location }; - self.items[dest_idx].emitted = true; - if let Some(src_idx) = src.as_ref().map(|t| t.1) { - self.items[src_idx].emitted = true; - } - if cb(dest, src.map(|t| t.0)) == gix_diff::tree::visit::Action::Cancel { - return Ok(gix_diff::tree::visit::Action::Cancel); - } - } - Ok(gix_diff::tree::visit::Action::Continue) - } -} - -fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::Kind) -> Option { - let (sources, destinations) = items - .iter() - .filter(|item| match kind { - visit::Kind::RenameTarget => !item.emitted, - visit::Kind::CopyDestination => true, - }) - .fold((0, 0), |(mut src, mut dest), item| { - match item.change { - Change::Addition { .. } => { - dest += 1; - } - Change::Deletion { .. } => { - if kind == visit::Kind::RenameTarget { - src += 1 - } - } - Change::Modification { .. } => { - if kind == visit::Kind::CopyDestination { - src += 1 - } - } - } - (src, dest) - }); - let permutations = sources * destinations; - (permutations > limit * limit).then_some(permutations) -} - -fn needs_exact_match(percentage: Option) -> bool { - percentage.map_or(true, |p| p >= 1.0) -} - -/// <`src_idx`, src, possibly diff stat> -type SourceTuple<'a> = (usize, &'a Item, Option); - -/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`. 
-/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity. -/// We also ignore emitted items entirely. -/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or -/// any non-deletion otherwise. -/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set -/// of items to be searched. -fn find_match<'a>( - items: &'a [Item], - item: &Item, - item_idx: usize, - percentage: Option, - kind: visit::Kind, - repo: &Repository, - stats: &mut Outcome, -) -> Result>, crate::object::tree::diff::for_each::Error> { - let (item_id, item_mode) = item.change.oid_and_entry_mode(); - if needs_exact_match(percentage) || item_mode.is_link() { - let first_idx = items.partition_point(|a| a.change.oid() < item_id); - let range = match items.get(first_idx..).map(|items| { - let end = items - .iter() - .position(|a| a.change.oid() != item_id) - .map_or(items.len(), |idx| first_idx + idx); - first_idx..end - }) { - Some(range) => range, - None => return Ok(None), - }; - if range.is_empty() { - return Ok(None); - } - let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| { - src_idx += range.start; - (src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None)) - }); - if let Some(src) = res { - return Ok(Some(src)); - } - } else { - let new = item_id.to_owned().attach(repo).object()?; - let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); - debug_assert!( - item.change.entry_mode().is_blob(), - "symlinks are matched exactly, and trees aren't used here" - ); - let algo = repo.config.diff_algorithm()?; - for (can_idx, src) in items - .iter() - .enumerate() - .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) - { - let old = 
src.change.oid().to_owned().attach(repo).object()?; - // TODO: make sure we get attribute handling and binary skips and filters right here. There is crate::object::blob::diff::Platform - // which should have facilities for that one day, but we don't use it because we need newlines in our tokens. - let tokens = gix_diff::blob::intern::InternedInput::new( - gix_diff::blob::sources::byte_lines_with_terminator(&old.data), - gix_diff::blob::sources::byte_lines_with_terminator(&new.data), - ); - let counts = gix_diff::blob::diff( - algo, - &tokens, - gix_diff::blob::sink::Counter::new(diff::Statistics { - removed_bytes: 0, - input: &tokens, - }), - ); - let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; - stats.num_similarity_checks += 1; - if similarity >= percentage { - return Ok(Some(( - can_idx, - src, - DiffLineStats { - removals: counts.removals, - insertions: counts.insertions, - before: tokens.before.len().try_into().expect("interner handles only u32"), - after: tokens.after.len().try_into().expect("interner handles only u32"), - } - .into(), - ))); - } - } - } - Ok(None) -} - -mod diff { - use std::ops::Range; - - pub struct Statistics<'a, 'data> { - pub removed_bytes: usize, - pub input: &'a gix_diff::blob::intern::InternedInput<&'data [u8]>, - } - - impl<'a, 'data> gix_diff::blob::Sink for Statistics<'a, 'data> { - type Out = usize; - - fn process_change(&mut self, before: Range, _after: Range) { - self.removed_bytes += self.input.before[before.start as usize..before.end as usize] - .iter() - .map(|token| self.input.interner[*token].len()) - .sum::(); - } - - fn finish(self) -> Self::Out { - self.removed_bytes - } - } - - #[cfg(test)] - mod tests { - fn removed_bytes(before: &[u8], after: &[u8]) -> usize { - let input = gix_diff::blob::intern::InternedInput::new(before, after); - gix_diff::blob::diff( - gix_diff::blob::Algorithm::Myers, - &input, - super::Statistics { - removed_bytes: 0, - input: &input, - }, 
- ) - } - - #[test] - fn counts_removed_bytes_correctly() { - assert_eq!(1, removed_bytes(b"a", b"")); - assert_eq!(0, removed_bytes(b"", b"a")); - - // need the inputs to have more than one "hunk" - // of differences to stress the fact that - // process_change is called multiple times - let before = b" -a -a -a -b -b -b -b -a -a -a -a -a -b -b -a -a -a -a -a -"; - // it's `before`, with the "b" lines removed - let after = b" -a -a -a -a -a -a -a -a -a -a -a -a -a - "; - - assert_eq!(6, removed_bytes(before, after)); - } - } -} - -mod tree_to_events { - use gix_diff::tree::visit::Change; - use gix_object::tree::EntryRef; - - use crate::bstr::BStr; - - pub struct Delegate<'a> { - parent: &'a mut super::State, - recorder: gix_traverse::tree::Recorder, - } - - impl<'a> Delegate<'a> { - pub fn new(parent: &'a mut super::State) -> Self { - let tracking = parent.tracking.map(|t| match t { - gix_diff::tree::recorder::Location::FileName => gix_traverse::tree::recorder::Location::FileName, - gix_diff::tree::recorder::Location::Path => gix_traverse::tree::recorder::Location::Path, - }); - Self { - parent, - recorder: gix_traverse::tree::Recorder::default().track_location(tracking), - } - } - } - - impl gix_traverse::tree::Visit for Delegate<'_> { - fn pop_front_tracked_path_and_set_current(&mut self) { - self.recorder.pop_front_tracked_path_and_set_current() - } - - fn push_back_tracked_path_component(&mut self, component: &BStr) { - self.recorder.push_back_tracked_path_component(component) - } - - fn push_path_component(&mut self, component: &BStr) { - self.recorder.push_path_component(component) - } - - fn pop_path_component(&mut self) { - self.recorder.pop_path_component(); - } - - fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { - gix_traverse::tree::visit::Action::Continue - } - - fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> gix_traverse::tree::visit::Action { - if entry.mode.is_blob() { - self.parent.try_push_change( - 
Change::Modification { - previous_entry_mode: entry.mode, - previous_oid: gix_hash::ObjectId::null(entry.oid.kind()), - entry_mode: entry.mode, - oid: entry.oid.to_owned(), - }, - self.recorder.path(), - ); - // make sure these aren't viable to be emitted anymore. - self.parent.items.last_mut().expect("just pushed").emitted = true; - } - gix_traverse::tree::visit::Action::Continue - } - } -} diff --git a/gix/tests/object/tree/diff.rs b/gix/tests/object/tree/diff.rs index 75a6549d8ea..c3304965b3f 100644 --- a/gix/tests/object/tree/diff.rs +++ b/gix/tests/object/tree/diff.rs @@ -104,11 +104,12 @@ fn tree_named(repo: &gix::Repository, rev_spec: impl AsRef) -> gix::Tree { mod track_rewrites { use std::convert::Infallible; - use gix::object::tree::diff::{ - change::{DiffLineStats, Event}, + use gix::diff::blob::DiffLineStats; + use gix::diff::{ rewrites::{Copies, CopySource}, Rewrites, }; + use gix::object::tree::diff::change::Event; use gix_ref::bstr::BStr; use crate::{ @@ -484,7 +485,7 @@ mod track_rewrites { .track_rewrites( Rewrites { copies: Some(Copies { - source: CopySource::FromSetOfModifiedFilesAndSourceTree, + source: CopySource::FromSetOfModifiedFilesAndAllSources, ..Default::default() }), ..Default::default() @@ -555,7 +556,7 @@ mod track_rewrites { .track_rewrites( Rewrites { copies: Some(Copies { - source: CopySource::FromSetOfModifiedFilesAndSourceTree, + source: CopySource::FromSetOfModifiedFilesAndAllSources, ..Default::default() }), limit: 2, // similarity checks can't be made that way From a28bf90aef49d1c51308884df84bb50567aeb501 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 1 Nov 2023 12:39:07 +0100 Subject: [PATCH 7/7] adapt to changes in `gix` related rename tracking --- gitoxide-core/src/query/engine/update.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gitoxide-core/src/query/engine/update.rs b/gitoxide-core/src/query/engine/update.rs index 0e8281cf950..1dcf57ace7b 100644 --- 
a/gitoxide-core/src/query/engine/update.rs +++ b/gitoxide-core/src/query/engine/update.rs @@ -9,8 +9,8 @@ use anyhow::{anyhow, bail}; use gix::objs::find::Error; use gix::{ bstr::{BStr, BString, ByteSlice}, + diff::rewrites::CopySource, features::progress, - object::tree::diff::rewrites::CopySource, parallel::{InOrderIter, SequenceId}, prelude::ObjectIdExt, Count, Progress, @@ -139,11 +139,10 @@ pub fn update( }); let rewrites = { - let mut r = - gix::object::tree::diff::Rewrites::try_from_config(&repo.config_snapshot(), true)?.unwrap_or_default(); - r.copies = Some(gix::object::tree::diff::rewrites::Copies { + let mut r = gix::diff::new_rewrites(&repo.config_snapshot(), true)?.unwrap_or_default(); + r.copies = Some(gix::diff::rewrites::Copies { source: if find_copies_harder { - CopySource::FromSetOfModifiedFilesAndSourceTree + CopySource::FromSetOfModifiedFilesAndAllSources } else { CopySource::FromSetOfModifiedFiles },