From 1838cbd36f4c19e07080529ec05ecf6c81218686 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 18 Nov 2023 13:45:55 +0100 Subject: [PATCH 01/20] feat: support for querying only the header of an object with the `FindHeader` trait. That way one can know its decompressed size and its kind. We also add a `FindObjectOrHeader` trait for use as `dyn` trait object that can find objects and access their headers. --- gix-object/src/find.rs | 6 ++++ gix-object/src/lib.rs | 11 ++++++- gix-object/src/traits.rs | 63 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/gix-object/src/find.rs b/gix-object/src/find.rs index 271079eb412..8471592cbc9 100644 --- a/gix-object/src/find.rs +++ b/gix-object/src/find.rs @@ -66,6 +66,12 @@ pub mod existing_iter { #[derive(Debug, Copy, Clone)] pub struct Never; +impl super::FindHeader for Never { + fn try_header(&self, _id: &gix_hash::oid) -> Result, Error> { + Ok(None) + } +} + impl super::Find for Never { fn try_find<'a>(&self, _id: &gix_hash::oid, _buffer: &'a mut Vec) -> Result>, Error> { Ok(None) diff --git a/gix-object/src/lib.rs b/gix-object/src/lib.rs index 661397ca36d..85ff8708e11 100644 --- a/gix-object/src/lib.rs +++ b/gix-object/src/lib.rs @@ -34,7 +34,7 @@ pub mod data; pub mod find; mod traits; -pub use traits::{Exists, Find, FindExt, WriteTo}; +pub use traits::{Exists, Find, FindExt, FindObjectOrHeader, Header as FindHeader, HeaderExt, WriteTo}; pub mod encode; pub(crate) mod parse; @@ -257,6 +257,15 @@ pub struct Data<'a> { pub data: &'a [u8], } +/// Information about an object, which includes its kind and the amount of bytes it would have when obtained. +#[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone, Copy)] +pub struct Header { + /// The kind of object. + pub kind: Kind, + /// The object's size in bytes, or the size of the buffer when it's retrieved in full. + pub size: u64, +} + /// pub mod decode { #[cfg(feature = "verbose-object-parsing-errors")] diff --git a/gix-object/src/traits.rs b/gix-object/src/traits.rs index d48bc195d1a..e12215bef4c 100644 --- a/gix-object/src/traits.rs +++ b/gix-object/src/traits.rs @@ -71,6 +71,17 @@ mod find { ) -> Result>, find::Error>; } + /// Find the header of an object in the object store. + pub trait Header { + /// Find the header of the object matching `id` in the database. + /// + /// Returns `Some` header if it was present, or the error that occurred during lookup. + fn try_header(&self, id: &gix_hash::oid) -> Result, find::Error>; + } + + /// A combination of [`Find`] and [`Header`] traits to help with `dyn` trait objects. + pub trait FindObjectOrHeader: Find + Header {} + mod _impls { use std::{ops::Deref, rc::Rc, sync::Arc}; @@ -86,6 +97,8 @@ mod find { } } + impl crate::FindObjectOrHeader for T where T: crate::Find + crate::FindHeader {} + impl crate::Find for &T where T: crate::Find, @@ -95,6 +108,15 @@ mod find { } } + impl crate::FindHeader for &T + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + (*self).try_header(id) + } + } + impl crate::Exists for Box where T: crate::Exists, @@ -122,6 +144,15 @@ mod find { } } + impl crate::FindHeader for Rc + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } + impl crate::Find for Box where T: crate::Find, @@ -131,6 +162,15 @@ mod find { } } + impl crate::FindHeader for Box + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } + impl crate::Exists for Arc where T: crate::Exists, @@ -148,6 +188,15 @@ mod find { self.deref().try_find(id, buffer) } } + + impl crate::FindHeader for Arc + where + T: crate::FindHeader, + { + fn try_header(&self, id: &gix_hash::oid) -> Result, crate::find::Error> { + self.deref().try_header(id) + } + } } mod ext { @@ -214,9 +263,19 @@ mod find { }; } + /// An extension trait with convenience functions. + pub trait HeaderExt: super::Header { + /// Like [`try_header(…)`](super::Header::try_header()), but flattens the `Result>` into a single `Result` making a non-existing header an error. + fn header(&self, id: &gix_hash::oid) -> Result { + self.try_header(id) + .map_err(find::existing::Error::Find)? + .ok_or_else(|| find::existing::Error::NotFound { oid: id.to_owned() }) + } + } + /// An extension trait with convenience functions. pub trait FindExt: super::Find { - /// Like [`try_find(…)`][super::Find::try_find()], but flattens the `Result>` into a single `Result` making a non-existing object an error. + /// Like [`try_find(…)`](super::Find::try_find()), but flattens the `Result>` into a single `Result` making a non-existing object an error. fn find<'a>( &self, id: &gix_hash::oid, @@ -238,6 +297,6 @@ mod find { impl FindExt for T {} } - pub use ext::FindExt; + pub use ext::{FindExt, HeaderExt}; } pub use find::*; From 336d161a4a291b5cb03c08e637c6f47a05731996 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 18 Nov 2023 13:57:02 +0100 Subject: [PATCH 02/20] adapt to changes in `gix-object` (Find::try_header()) --- gix-odb/src/cache.rs | 9 +++++++++ gix-odb/src/store_impls/dynamic/find.rs | 23 ++++++++++++++++++++--- gix-odb/src/store_impls/dynamic/header.rs | 2 +- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/gix-odb/src/cache.rs b/gix-odb/src/cache.rs index aaa99756301..cc737041e63 100644 --- a/gix-odb/src/cache.rs +++ b/gix-odb/src/cache.rs @@ -177,6 +177,15 @@ mod impls { } } + impl gix_object::FindHeader for Cache + where + S: gix_object::FindHeader, + { + fn try_header(&self, id: &oid) -> Result, gix_object::find::Error> { + self.inner.try_header(id) + } + } + impl gix_pack::Find for Cache where S: gix_pack::Find, diff --git a/gix-odb/src/store_impls/dynamic/find.rs b/gix-odb/src/store_impls/dynamic/find.rs index b57b5fe830e..99b58d7477f 100644 --- a/gix-odb/src/store_impls/dynamic/find.rs +++ b/gix-odb/src/store_impls/dynamic/find.rs @@ -78,7 +78,6 @@ pub use error::Error; use gix_features::zlib; use crate::store::types::PackId; -use gix_object::{Exists, Find}; impl super::Handle where @@ -499,7 +498,7 @@ where } } -impl Find for super::Handle +impl gix_object::Find for super::Handle where S: Deref + Clone, Self: gix_pack::Find, @@ -513,7 +512,25 @@ where } } -impl Exists for super::Handle +impl gix_object::FindHeader for super::Handle +where + S: Deref + Clone, +{ + fn try_header(&self, id: &gix_hash::oid) -> Result, gix_object::find::Error> { + let mut snapshot = self.snapshot.borrow_mut(); + let mut inflate = self.inflate.borrow_mut(); + self.try_header_inner(id, &mut inflate, &mut snapshot, None) + .map(|maybe_header| { + maybe_header.map(|hdr| gix_object::Header { + kind: hdr.kind(), + size: hdr.size(), + }) + }) + .map_err(|err| Box::new(err) as _) + } +} + +impl gix_object::Exists for super::Handle where S: Deref + Clone, Self: gix_pack::Find, diff --git a/gix-odb/src/store_impls/dynamic/header.rs b/gix-odb/src/store_impls/dynamic/header.rs index 42fd51c85ba..2a331a094c5 100644 --- a/gix-odb/src/store_impls/dynamic/header.rs +++ b/gix-odb/src/store_impls/dynamic/header.rs @@ -13,7 +13,7 @@ impl super::Handle where S: Deref + Clone, { - fn try_header_inner<'b>( + pub(crate) fn try_header_inner<'b>( &'b self, mut id: &'b gix_hash::oid, inflate: &mut zlib::Inflate, From ee47fbab9ff995bc76a4c75185fe8e6e7c1d4ba3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 17 Nov 2023 09:23:04 +0100 Subject: [PATCH 03/20] fix: specify minimum required `url` version of v2.2.0 (#1119). Note that this is also the minimal required version that is resolved with `cargo +nightly update -Z minimal-versions`, but it's nothing I could validate or reproduce myself just yet. --- gix-url/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gix-url/Cargo.toml b/gix-url/Cargo.toml index 0dd0861b023..a14947405ec 100644 --- a/gix-url/Cargo.toml +++ b/gix-url/Cargo.toml @@ -22,7 +22,7 @@ gix-path = { version = "^0.10.0", path = "../gix-path" } serde = { version = "1.0.114", optional = true, default-features = false, features = ["std", "derive"]} thiserror = "1.0.32" -url = "2.1.1" +url = "2.2.0" bstr = { version = "1.3.0", default-features = false, features = ["std"] } home = "0.5.3" From 86cdb42df7c564c6fd267f744a67f88ceb4c674f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 17 Nov 2023 11:13:32 +0100 Subject: [PATCH 04/20] feat: Add `Buffers` type. It allows to more easily manage a form of 'double buffering' to better manage conditional alteration of a source buffer, and to implement conversion pipelines which conditionally transform an input over multiple steps. --- gix-utils/src/buffers.rs | 54 ++++++++++++++++++++++++++++++++++ gix-utils/src/lib.rs | 17 +++++++++++ gix-utils/tests/buffers/mod.rs | 33 +++++++++++++++++++++ gix-utils/tests/utils.rs | 1 + 4 files changed, 105 insertions(+) create mode 100644 gix-utils/src/buffers.rs create mode 100644 gix-utils/tests/buffers/mod.rs diff --git a/gix-utils/src/buffers.rs b/gix-utils/src/buffers.rs new file mode 100644 index 00000000000..2c0544ea3da --- /dev/null +++ b/gix-utils/src/buffers.rs @@ -0,0 +1,54 @@ +use crate::Buffers; + +/// Lifecycle +impl Buffers { + /// Use this if there is an input buffer `src` which isn't owned by you, but which should be used as source when + /// asking for [`src_and_dest()`](WithForeignSource::src_and_dest()). + pub fn use_foreign_src<'a, 'src>(&'a mut self, src: &'src [u8]) -> WithForeignSource<'src, 'a> { + self.clear(); + WithForeignSource { + ro_src: Some(src), + src: &mut self.src, + dest: &mut self.dest, + } + } +} + +impl Buffers { + /// Clear all buffers, which should be called once processing is done. + pub fn clear(&mut self) { + self.src.clear(); + self.dest.clear(); + } + + /// Must be called after every change (i.e. when it's known that `dest` was written. + pub fn swap(&mut self) { + std::mem::swap(&mut self.src, &mut self.dest); + } +} + +/// A utility to do buffer-swapping with, similar to [`Buffers`], but with support for a +/// read-only one-time buffer as source. +pub struct WithForeignSource<'src, 'bufs> { + /// The original source buffer, or `None` if already altered. + pub ro_src: Option<&'src [u8]>, + /// The source buffer that will be used after the first call to `swap`. + pub src: &'bufs mut Vec, + dest: &'bufs mut Vec, +} + +impl<'bufs> WithForeignSource<'_, 'bufs> { + /// Must be called after every change (i.e. when it's known that `dest` was written. + pub fn swap(&mut self) { + self.ro_src.take(); + std::mem::swap(&mut self.src, &mut self.dest); + self.dest.clear(); + } + /// Obtain `(source, destination)`, which reads from the read-only source exactly once. + pub fn src_and_dest(&mut self) -> (&[u8], &mut Vec) { + match self.ro_src { + Some(src) => (src, &mut self.dest), + None => (self.src, &mut self.dest), + } + } +} diff --git a/gix-utils/src/lib.rs b/gix-utils/src/lib.rs index 4c9d99fa423..f5951b10451 100644 --- a/gix-utils/src/lib.rs +++ b/gix-utils/src/lib.rs @@ -6,3 +6,20 @@ /// pub mod backoff; + +/// +pub mod buffers; + +/// A utility to do buffer-swapping with. +/// +/// Use `src` to read from and `dest` to write to, and after actually changing data, call [Buffers::swap()]. +/// To be able to repeat the process, this time using what was `dest` as `src`, freeing up `dest` for writing once more. +/// +/// Note that after each [`Buffers::swap()`], `src` is the most recent version of the data, just like before each swap. +#[derive(Default, Clone)] +pub struct Buffers { + /// The source data, as basis for processing. + pub src: Vec, + /// The data produced after processing `src`. + pub dest: Vec, +} diff --git a/gix-utils/tests/buffers/mod.rs b/gix-utils/tests/buffers/mod.rs new file mode 100644 index 00000000000..7d7eb9f8dad --- /dev/null +++ b/gix-utils/tests/buffers/mod.rs @@ -0,0 +1,33 @@ +use gix_utils::Buffers; + +#[test] +fn lifecycle() { + let mut bufs = Buffers::default(); + let mut bufs = bufs.use_foreign_src(b"a"); + + assert_eq!(bufs.ro_src.unwrap(), b"a"); + + let (src, dest) = bufs.src_and_dest(); + + assert_eq!(src, b"a"); + assert_eq!(dest.len(), 0); + dest.push(b'b'); + + bufs.swap(); + + assert_eq!(bufs.src, b"b"); + assert!(bufs.ro_src.is_none()); + + let (src, dest) = bufs.src_and_dest(); + assert_eq!(src, b"b"); + assert_eq!( + dest.len(), + 0, + "the original previously empty source buffer was swapped in " + ); + dest.push(b'c'); + bufs.swap(); + let (src, dest) = bufs.src_and_dest(); + assert_eq!(src, b"c"); + assert_eq!(dest.len(), 0, "dest always starting out empty"); +} diff --git a/gix-utils/tests/utils.rs b/gix-utils/tests/utils.rs index ed38d3e98c6..8face990aab 100644 --- a/gix-utils/tests/utils.rs +++ b/gix-utils/tests/utils.rs @@ -1 +1,2 @@ mod backoff; +mod buffers; From 1ed74f7bf56a01e20cd2cb9dcd8fa830c3d41e3b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 17 Nov 2023 14:03:02 +0100 Subject: [PATCH 05/20] use `Buffers` implementation from `gix-utils` --- Cargo.lock | 1 + gix-filter/Cargo.toml | 1 + gix-filter/src/lib.rs | 2 +- gix-filter/src/pipeline/convert.rs | 2 +- gix-filter/src/pipeline/mod.rs | 3 -- gix-filter/src/pipeline/tests.rs | 39 ------------------------ gix-filter/src/pipeline/util.rs | 48 ------------------------------ 7 files changed, 4 insertions(+), 92 deletions(-) delete mode 100644 gix-filter/src/pipeline/tests.rs diff --git a/Cargo.lock b/Cargo.lock index 62c8e3102b2..6646336f11d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1633,6 +1633,7 @@ dependencies = [ "gix-quote 0.4.7", "gix-testtools", "gix-trace 0.1.3", + "gix-utils 0.1.5", "gix-worktree 0.27.0", "once_cell", "smallvec", diff --git a/gix-filter/Cargo.toml b/gix-filter/Cargo.toml index ebf2682e18c..8a047b67e75 100644 --- a/gix-filter/Cargo.toml +++ b/gix-filter/Cargo.toml @@ -18,6 +18,7 @@ gix-trace = { version = "^0.1.3", path = "../gix-trace" } gix-object = { version = "^0.38.0", path = "../gix-object" } gix-command = { version = "^0.2.10", path = "../gix-command" } gix-quote = { version = "^0.4.7", path = "../gix-quote" } +gix-utils = { version = "^0.1.5", path = "../gix-utils" } gix-path = { version = "^0.10.0", path = "../gix-path" } gix-packetline = { package = "gix-packetline-blocking", version = "^0.16.6", path = "../gix-packetline-blocking" } gix-attributes = { version = "^0.20.0", path = "../gix-attributes" } diff --git a/gix-filter/src/lib.rs b/gix-filter/src/lib.rs index 5fba895fdb9..9c7d44873d7 100644 --- a/gix-filter/src/lib.rs +++ b/gix-filter/src/lib.rs @@ -43,7 +43,7 @@ pub struct Pipeline { /// State needed to keep running filter processes. processes: driver::State, /// A utility to handle multiple buffers to keep results of various filters. - bufs: pipeline::util::Buffers, + bufs: gix_utils::Buffers, } /// A declaration of a driver program. diff --git a/gix-filter/src/pipeline/convert.rs b/gix-filter/src/pipeline/convert.rs index 85f6a9b1861..0d93acdc38a 100644 --- a/gix-filter/src/pipeline/convert.rs +++ b/gix-filter/src/pipeline/convert.rs @@ -190,7 +190,7 @@ impl Pipeline { self.options.eol_config, )?; - let mut bufs = self.bufs.with_src(src); + let mut bufs = self.bufs.use_foreign_src(src); let (src, dest) = bufs.src_and_dest(); if apply_ident_filter && ident::apply(src, self.options.object_hash, dest) { bufs.swap(); diff --git a/gix-filter/src/pipeline/mod.rs b/gix-filter/src/pipeline/mod.rs index c5efbb5a019..e1d7c37f3b7 100644 --- a/gix-filter/src/pipeline/mod.rs +++ b/gix-filter/src/pipeline/mod.rs @@ -114,6 +114,3 @@ impl Pipeline { pub mod convert; pub(crate) mod util; - -#[cfg(test)] -mod tests; diff --git a/gix-filter/src/pipeline/tests.rs b/gix-filter/src/pipeline/tests.rs deleted file mode 100644 index e41a040c604..00000000000 --- a/gix-filter/src/pipeline/tests.rs +++ /dev/null @@ -1,39 +0,0 @@ -mod buffers { - use bstr::ByteSlice; - - use crate::pipeline::util::Buffers; - - #[test] - fn usage() { - let mut backing = Buffers::default(); - let mut bufs = backing.with_src(b"a"); - - { - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "a"); - assert!(dest.is_empty()); - dest.push(b'b'); - } - assert!(bufs.ro_src.is_some(), "read-only source remains until swap"); - bufs.swap(); - assert!( - bufs.ro_src.is_none(), - "after swap it's not used anymore as recent data is in owned buffers" - ); - - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "b", "buffers were swapped"); - assert_eq!(dest.as_bstr(), "", "destination is new and cleared"); - dest.push(b'c'); - bufs.swap(); - - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "c"); - assert_eq!(dest.as_bstr(), "", "destination is cleared"); - - let mut bufs = backing.with_src(b"z"); - let (src, dest) = bufs.src_and_dest(); - assert_eq!(src.as_bstr(), "z"); - assert_eq!(dest.as_bstr(), "", "output buffer was cleared by `with_src()`") - } -} diff --git a/gix-filter/src/pipeline/util.rs b/gix-filter/src/pipeline/util.rs index ab10db08ec9..f83c37357bf 100644 --- a/gix-filter/src/pipeline/util.rs +++ b/gix-filter/src/pipeline/util.rs @@ -11,54 +11,6 @@ use crate::{ Driver, }; -/// A utility to do buffer-swapping with. -#[derive(Default, Clone)] -pub(crate) struct Buffers { - pub src: Vec, - pub dest: Vec, -} - -/// A utility to do buffer-swapping with. -pub(crate) struct BuffersWithSource<'src, 'bufs> { - pub ro_src: Option<&'src [u8]>, - pub src: &'bufs mut Vec, - pub dest: &'bufs mut Vec, -} - -impl Buffers { - pub fn with_src<'a, 'src>(&'a mut self, src: &'src [u8]) -> BuffersWithSource<'src, 'a> { - self.clear(); - BuffersWithSource { - ro_src: Some(src), - src: &mut self.src, - dest: &mut self.dest, - } - } - pub fn clear(&mut self) { - self.src.clear(); - self.dest.clear(); - } - - pub fn swap(&mut self) { - std::mem::swap(&mut self.src, &mut self.dest); - } -} - -impl BuffersWithSource<'_, '_> { - /// Must be called after every change (i.e. when it's known that `dest` was written. - pub fn swap(&mut self) { - self.ro_src.take(); - std::mem::swap(&mut self.src, &mut self.dest); - self.dest.clear(); - } - pub fn src_and_dest(&mut self) -> (&[u8], &mut Vec) { - match self.ro_src { - Some(src) => (src, &mut self.dest), - None => (self.src, &mut self.dest), - } - } -} - pub(crate) struct Configuration<'a> { pub(crate) driver: Option<&'a Driver>, /// What attributes say about CRLF handling. From c752f67090265dac78e43c132de1d6cced61be9c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 13 Nov 2023 14:08:42 +0100 Subject: [PATCH 06/20] fix: Use `gix-objet::Find` error type. --- gix-filter/src/pipeline/convert.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gix-filter/src/pipeline/convert.rs b/gix-filter/src/pipeline/convert.rs index 0d93acdc38a..dac54940958 100644 --- a/gix-filter/src/pipeline/convert.rs +++ b/gix-filter/src/pipeline/convert.rs @@ -22,8 +22,7 @@ pub mod configuration { /// pub mod to_git { /// A function that fills `buf` `fn(&mut buf)` with the data stored in the index of the file that should be converted. - pub type IndexObjectFn<'a> = - dyn FnMut(&mut Vec) -> Result, Box> + 'a; + pub type IndexObjectFn<'a> = dyn FnMut(&mut Vec) -> Result, gix_object::find::Error> + 'a; /// The error returned by [Pipeline::convert_to_git()][super::Pipeline::convert_to_git()]. #[derive(Debug, thiserror::Error)] From c74c7feea9b4404c3e0f403e22cf64561c8d12ee Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 15 Nov 2023 17:13:25 +0100 Subject: [PATCH 07/20] feat!: simplify `Pipeline::new()` by reomving the metadata collection. It's required, but in practice has no effect as it's initialized at just the right time anyway, which is when it does matter. Also, re-export `gix_attributes as attributes` to allow using the types it mentions in the public API. --- gix-filter/src/lib.rs | 3 +++ gix-filter/src/pipeline/mod.rs | 14 ++++---------- gix-filter/tests/driver/mod.rs | 21 +++++++++++---------- gix-filter/tests/pipeline/mod.rs | 1 - 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/gix-filter/src/lib.rs b/gix-filter/src/lib.rs index 9c7d44873d7..7c0a0c37ba2 100644 --- a/gix-filter/src/lib.rs +++ b/gix-filter/src/lib.rs @@ -10,6 +10,9 @@ //! or not to apply such a filter. #![deny(rust_2018_idioms, missing_docs, unsafe_code)] +/// The `gix-attributes` crate whose types are mentioned in the public API of [Pipeline::convert_to_worktree()]. +pub use gix_attributes as attributes; + use bstr::BString; /// A forwarding of the `encoding_rs` crate for its types and convenience. pub use encoding_rs as encoding; diff --git a/gix-filter/src/pipeline/mod.rs b/gix-filter/src/pipeline/mod.rs index e1d7c37f3b7..76e7b0ef742 100644 --- a/gix-filter/src/pipeline/mod.rs +++ b/gix-filter/src/pipeline/mod.rs @@ -52,18 +52,13 @@ const ATTRS: [&str; 6] = ["crlf", "ident", "filter", "eol", "text", "working-tre /// Lifecycle impl Pipeline { - /// Create a new pipeline with configured `drivers` (which should be considered safe to invoke) with `context` as well as - /// a way to initialize our attributes with `collection`. + /// Create a new pipeline with configured `drivers` (which should be considered safe to invoke), which are passed `context`. /// `eol_config` serves as fallback to understand how to convert line endings if no line-ending attributes are present. /// `crlf_roundtrip_check` corresponds to the git-configuration of `core.safecrlf`. /// `object_hash` is relevant for the `ident` filter. - pub fn new( - collection: &gix_attributes::search::MetadataCollection, - context: gix_command::Context, - options: Options, - ) -> Self { + pub fn new(context: gix_command::Context, options: Options) -> Self { let mut attrs = gix_attributes::search::Outcome::default(); - attrs.initialize_with_selection(collection, ATTRS); + attrs.initialize_with_selection(&Default::default(), ATTRS); Pipeline { attrs, context: Context::default(), @@ -83,8 +78,7 @@ impl Pipeline { impl Default for Pipeline { fn default() -> Self { - let collection = Default::default(); - Pipeline::new(&collection, Default::default(), Default::default()) + Pipeline::new(Default::default(), Default::default()) } } diff --git a/gix-filter/tests/driver/mod.rs b/gix-filter/tests/driver/mod.rs index c5a38847e3b..adca2d06095 100644 --- a/gix-filter/tests/driver/mod.rs +++ b/gix-filter/tests/driver/mod.rs @@ -138,17 +138,18 @@ pub(crate) mod apply { fn a_crashing_process_can_restart_it() -> crate::Result { let mut state = gix_filter::driver::State::default(); let driver = driver_with_process(); + let err = match state.apply( + &driver, + &mut std::io::empty(), + Operation::Smudge, + context_from_path("fail"), + ) { + Ok(_) => panic!("expecting an error as invalid context was passed"), + Err(err) => err, + }; assert!( - matches!( - state.apply( - &driver, - &mut std::io::empty(), - Operation::Smudge, - context_from_path("fail") - ), - Err(gix_filter::driver::apply::Error::ProcessInvoke { .. }) - ), - "cannot invoke if failure is requested" + matches!(err, gix_filter::driver::apply::Error::ProcessInvoke { .. }), + "{err:?}: cannot invoke if failure is requested" ); let mut filtered = state diff --git a/gix-filter/tests/pipeline/mod.rs b/gix-filter/tests/pipeline/mod.rs index ad84b7fb775..8c9d8fffcb1 100644 --- a/gix-filter/tests/pipeline/mod.rs +++ b/gix-filter/tests/pipeline/mod.rs @@ -58,7 +58,6 @@ fn pipeline( let cache = attribute_cache(name)?; let (drivers, encodings_with_roundtrip_check, crlf_roundtrip_check, eol_config) = init(); let pipe = gix_filter::Pipeline::new( - cache.attributes_collection(), Default::default(), gix_filter::pipeline::Options { drivers, From 7d754cc7600a15acc5421206eb8d90dea432b283 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 15 Nov 2023 17:10:30 +0100 Subject: [PATCH 08/20] more correct initialization order for new attribute search outcomes. An attribute selection affects the initialization, hence it should be added first. --- gix-attributes/src/search/outcome.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gix-attributes/src/search/outcome.rs b/gix-attributes/src/search/outcome.rs index 7cea3bd1970..41d70e5806c 100644 --- a/gix-attributes/src/search/outcome.rs +++ b/gix-attributes/src/search/outcome.rs @@ -54,8 +54,6 @@ impl Outcome { collection: &MetadataCollection, attribute_names: &mut dyn Iterator>, ) { - self.initialize(collection); - self.selected.clear(); self.selected.extend(attribute_names.map(|name| { ( @@ -63,6 +61,8 @@ impl Outcome { collection.name_to_meta.get(name.as_str()).map(|meta| meta.id), ) })); + + self.initialize(collection); self.reset_remaining(); } From 17638628586900d43d730e6ed2a0862d8e408f29 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 15 Nov 2023 17:21:11 +0100 Subject: [PATCH 09/20] adapt to changes in `gix-filter` --- gix-archive/tests/archive.rs | 2 +- gix-worktree-stream/tests/stream.rs | 1 - gix/src/config/cache/access.rs | 8 ++------ gix/src/filter.rs | 6 +----- gix/src/repository/worktree.rs | 6 +----- 5 files changed, 5 insertions(+), 18 deletions(-) diff --git a/gix-archive/tests/archive.rs b/gix-archive/tests/archive.rs index 95c8e09359e..c9a4e8bb281 100644 --- a/gix-archive/tests/archive.rs +++ b/gix-archive/tests/archive.rs @@ -303,6 +303,6 @@ mod from_tree { } fn noop_pipeline() -> gix_filter::Pipeline { - gix_filter::Pipeline::new(&Default::default(), Default::default(), Default::default()) + gix_filter::Pipeline::new(Default::default(), Default::default()) } } diff --git a/gix-worktree-stream/tests/stream.rs b/gix-worktree-stream/tests/stream.rs index 9d341ca15ad..b84859b7cd8 100644 --- a/gix-worktree-stream/tests/stream.rs +++ b/gix-worktree-stream/tests/stream.rs @@ -261,7 +261,6 @@ mod from_tree { fn mutating_pipeline(driver: bool) -> gix_filter::Pipeline { gix_filter::Pipeline::new( - &Default::default(), Default::default(), gix_filter::pipeline::Options { drivers: if driver { vec![driver_with_process()] } else { vec![] }, diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index 5031501d3f1..5f9f71887fa 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -196,12 +196,8 @@ impl Cache { )?; let capabilities = self.fs_capabilities()?; let filters = { - let collection = Default::default(); - let mut filters = gix_filter::Pipeline::new( - &collection, - repo.command_context()?, - crate::filter::Pipeline::options(repo)?, - ); + let mut filters = + gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(repo)?); if let Ok(mut head) = repo.head() { let ctx = filters.driver_context_mut(); ctx.ref_name = head.referent_name().map(|name| name.as_bstr().to_owned()); diff --git a/gix/src/filter.rs b/gix/src/filter.rs index d18fc06fa57..f05e332a255 100644 --- a/gix/src/filter.rs +++ b/gix/src/filter.rs @@ -113,11 +113,7 @@ impl<'repo> Pipeline<'repo> { /// Create a new instance by extracting all necessary information and configuration from a `repo` along with `cache` for accessing /// attributes. The `index` is used for some filters which may access it under very specific circumstances. pub fn new(repo: &'repo Repository, cache: gix_worktree::Stack) -> Result { - let pipeline = gix_filter::Pipeline::new( - cache.attributes_collection(), - repo.command_context()?, - Self::options(repo)?, - ); + let pipeline = gix_filter::Pipeline::new(repo.command_context()?, Self::options(repo)?); Ok(Pipeline { inner: pipeline, cache, diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index 198c1eb5e6e..1b48c7c3735 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -79,11 +79,7 @@ impl crate::Repository { let mut cache = self .attributes_only(&index, gix_worktree::stack::state::attributes::Source::IdMapping)? .detach(); - let pipeline = gix_filter::Pipeline::new( - cache.attributes_collection(), - self.command_context()?, - crate::filter::Pipeline::options(self)?, - ); + let pipeline = gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(self)?); let objects = self.objects.clone().into_arc().expect("TBD error handling"); let stream = gix_worktree_stream::from_tree( id, From 0f03a08e9e07979c476a7def0c045fcdfddcbb7c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Fri, 17 Nov 2023 17:23:56 +0100 Subject: [PATCH 10/20] feat: Allow obtaining mutable pipeline buffers. --- gix-filter/src/pipeline/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gix-filter/src/pipeline/mod.rs b/gix-filter/src/pipeline/mod.rs index 76e7b0ef742..662e14ca15d 100644 --- a/gix-filter/src/pipeline/mod.rs +++ b/gix-filter/src/pipeline/mod.rs @@ -102,6 +102,11 @@ impl Pipeline { pub fn options_mut(&mut self) -> &mut Options { &mut self.options } + + /// Return our double-buffers for re-use by the caller. + pub fn buffers_mut(&mut self) -> &mut gix_utils::Buffers { + &mut self.bufs + } } /// From 552bed2ad56a459b9e0956ece96100b73dce572d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 18 Nov 2023 13:02:13 +0100 Subject: [PATCH 11/20] feat!: Use `&dyn gix_object::Find` where possible. As otherwise, one cannot use `&dyn ` at all in this case as it's unsized.` Additionally, rename top-level `pub use gix_glob` to `glob` to be in-line with other public exports of this kind. --- gix-worktree/src/lib.rs | 3 ++- gix-worktree/src/stack/mod.rs | 20 +++++++------------ .../tests/worktree/stack/attributes.rs | 4 ++-- .../tests/worktree/stack/create_directory.rs | 10 +++++----- gix-worktree/tests/worktree/stack/ignore.rs | 2 +- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/gix-worktree/src/lib.rs b/gix-worktree/src/lib.rs index 3e6fc19d503..26464956615 100644 --- a/gix-worktree/src/lib.rs +++ b/gix-worktree/src/lib.rs @@ -10,7 +10,8 @@ #![cfg_attr(all(doc, feature = "document-features"), feature(doc_cfg, doc_auto_cfg))] #![deny(missing_docs, rust_2018_idioms, unsafe_code)] use bstr::BString; -pub use gix_glob; +/// A way to access the [`Case`](glob::pattern::Case) enum which used throughout this API. +pub use gix_glob as glob; /// A cache for efficiently executing operations on directories and files which are encountered in sorted order. /// That way, these operations can be re-used for subsequent invocations in the same directory. diff --git a/gix-worktree/src/stack/mod.rs b/gix-worktree/src/stack/mod.rs index 387acf2acff..b6c03d175bf 100644 --- a/gix-worktree/src/stack/mod.rs +++ b/gix-worktree/src/stack/mod.rs @@ -108,22 +108,19 @@ impl Stack { /// `objects` maybe used to lookup objects from an [id mapping][crate::stack::State::id_mappings_from_index()], with mappnigs /// /// Provide access to cached information for that `relative` path via the returned platform. - pub fn at_path( + pub fn at_path( &mut self, relative: impl AsRef, is_dir: Option, - objects: Find, - ) -> std::io::Result> - where - Find: gix_object::Find, - { + objects: &dyn gix_object::Find, + ) -> std::io::Result> { self.statistics.platforms += 1; let mut delegate = StackDelegate { state: &mut self.state, buf: &mut self.buf, is_dir: is_dir.unwrap_or(false), id_mappings: &self.id_mappings, - objects: &objects, + objects, case: self.case, statistics: &mut self.statistics, }; @@ -142,15 +139,12 @@ impl Stack { /// ### Panics /// /// on illformed UTF8 in `relative` - pub fn at_entry<'r, Find>( + pub fn at_entry<'r>( &mut self, relative: impl Into<&'r BStr>, is_dir: Option, - objects: Find, - ) -> std::io::Result> - where - Find: gix_object::Find, - { + objects: &dyn gix_object::Find, + ) -> std::io::Result> { let relative = relative.into(); let relative_path = gix_path::from_bstr(relative); diff --git a/gix-worktree/tests/worktree/stack/attributes.rs b/gix-worktree/tests/worktree/stack/attributes.rs index d595a9e2860..e4fda10e905 100644 --- a/gix-worktree/tests/worktree/stack/attributes.rs +++ b/gix-worktree/tests/worktree/stack/attributes.rs @@ -29,7 +29,7 @@ fn baseline() -> crate::Result { let mut actual = cache.attribute_matches(); let input = std::fs::read(base.join("baseline"))?; for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { - let entry = cache.at_entry(rela_path, None, gix_object::find::Never)?; + let entry = cache.at_entry(rela_path, None, &gix_object::find::Never)?; let has_match = entry.matching_attributes(&mut actual); assert_eq!( @@ -50,7 +50,7 @@ fn baseline() -> crate::Result { let mut actual = cache.selected_attribute_matches(["info", "test"]); let input = std::fs::read(base.join("baseline.selected"))?; for (rela_path, expected) in (baseline::Expectations { lines: input.lines() }) { - let entry = cache.at_entry(rela_path, None, gix_object::find::Never)?; + let entry = cache.at_entry(rela_path, None, &gix_object::find::Never)?; let has_match = entry.matching_attributes(&mut actual); assert_eq!( diff --git a/gix-worktree/tests/worktree/stack/create_directory.rs b/gix-worktree/tests/worktree/stack/create_directory.rs index 4ae15cf70f6..19130d6559f 100644 --- a/gix-worktree/tests/worktree/stack/create_directory.rs +++ b/gix-worktree/tests/worktree/stack/create_directory.rs @@ -15,7 +15,7 @@ fn root_is_assumed_to_exist_and_files_in_root_do_not_create_directory() -> crate ); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 0); - let path = cache.at_path("hello", Some(false), gix_object::find::Never)?.path(); + let path = cache.at_path("hello", Some(false), &gix_object::find::Never)?.path(); assert!(!path.parent().unwrap().exists(), "prefix itself is never created"); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 0); Ok(()) @@ -33,7 +33,7 @@ fn directory_paths_are_created_in_full() { ("link", None), ] { let path = cache - .at_path(Path::new("dir").join(name), *is_dir, gix_object::find::Never) + .at_path(Path::new("dir").join(name), *is_dir, &gix_object::find::Never) .unwrap() .path(); assert!(path.parent().unwrap().is_dir(), "dir exists"); @@ -47,7 +47,7 @@ fn existing_directories_are_fine() -> crate::Result { let (mut cache, tmp) = new_cache(); std::fs::create_dir(tmp.path().join("dir"))?; - let path = cache.at_path("dir/file", Some(false), gix_object::find::Never)?.path(); + let path = cache.at_path("dir/file", Some(false), &gix_object::find::Never)?.path(); assert!(path.parent().unwrap().is_dir(), "directory is still present"); assert!(!path.exists(), "it won't create the file"); assert_eq!(cache.statistics().delegate.num_mkdir_calls, 1); @@ -72,7 +72,7 @@ fn symlinks_or_files_in_path_are_forbidden_or_unlinked_when_forced() -> crate::R let relative_path = format!("{dirname}/file"); assert_eq!( cache - .at_path(&relative_path, Some(false), gix_object::find::Never) + .at_path(&relative_path, Some(false), &gix_object::find::Never) .unwrap_err() .kind(), std::io::ErrorKind::AlreadyExists @@ -93,7 +93,7 @@ fn symlinks_or_files_in_path_are_forbidden_or_unlinked_when_forced() -> crate::R } let relative_path = format!("{dirname}/file"); let path = cache - .at_path(&relative_path, Some(false), gix_object::find::Never)? + .at_path(&relative_path, Some(false), &gix_object::find::Never)? .path(); assert!(path.parent().unwrap().is_dir(), "directory was forcefully created"); assert!(!path.exists()); diff --git a/gix-worktree/tests/worktree/stack/ignore.rs b/gix-worktree/tests/worktree/stack/ignore.rs index 6dbcecb5cb8..7de236335a6 100644 --- a/gix-worktree/tests/worktree/stack/ignore.rs +++ b/gix-worktree/tests/worktree/stack/ignore.rs @@ -65,7 +65,7 @@ fn exclude_by_dir_is_handled_just_like_git() { let relative_path = gix_path::from_byte_slice(relative_entry); let is_dir = dir.join(relative_path).metadata().ok().map(|m| m.is_dir()); - let platform = cache.at_entry(relative_entry, is_dir, FindError).unwrap(); + let platform = cache.at_entry(relative_entry, is_dir, &FindError).unwrap(); let match_ = platform.matching_exclude_pattern().expect("match all values"); let _is_excluded = platform.is_excluded(); assert_eq!( From a0e4dec7162ea242d03d11e73c4ce1f761ed2853 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 18 Nov 2023 13:04:31 +0100 Subject: [PATCH 12/20] adapt to changes in `gix-worktree` --- gix-filter/tests/pipeline/convert_to_git.rs | 6 +++--- gix-filter/tests/pipeline/convert_to_worktree.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gix-filter/tests/pipeline/convert_to_git.rs b/gix-filter/tests/pipeline/convert_to_git.rs index 491c6043d6a..79d79993238 100644 --- a/gix-filter/tests/pipeline/convert_to_git.rs +++ b/gix-filter/tests/pipeline/convert_to_git.rs @@ -53,7 +53,7 @@ fn all_stages_mean_streaming_is_impossible() -> gix_testtools::Result { Path::new("any.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -82,7 +82,7 @@ fn only_driver_means_streaming_is_possible() -> gix_testtools::Result { Path::new("subdir/doesnot/matter/any.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -112,7 +112,7 @@ fn no_filter_means_reader_is_returned_unchanged() -> gix_testtools::Result { Path::new("other.txt"), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, diff --git a/gix-filter/tests/pipeline/convert_to_worktree.rs b/gix-filter/tests/pipeline/convert_to_worktree.rs index 8798f5a22c7..2449c50304a 100644 --- a/gix-filter/tests/pipeline/convert_to_worktree.rs +++ b/gix-filter/tests/pipeline/convert_to_worktree.rs @@ -21,7 +21,7 @@ fn all_stages() -> gix_testtools::Result { "any.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -54,7 +54,7 @@ fn all_stages_no_filter() -> gix_testtools::Result { "other.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, @@ -86,7 +86,7 @@ fn no_filter() -> gix_testtools::Result { "other.txt".into(), &mut |path, attrs| { cache - .at_entry(path, Some(false), gix_object::find::Never) + .at_entry(path, Some(false), &gix_object::find::Never) .expect("cannot fail") .matching_attributes(attrs); }, From 20e56b3d635a46eb2f42b9ccfb171fbbf01b96ef Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 27 Nov 2023 16:04:36 +0100 Subject: [PATCH 13/20] feat: add `Oid::is_null()` - previously it was only available on `ObjectId`. --- gix-hash/src/oid.rs | 11 ++++++++++- gix-hash/tests/oid/mod.rs | 6 ++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/gix-hash/src/oid.rs b/gix-hash/src/oid.rs index b105621cb69..f9c74adfec3 100644 --- a/gix-hash/src/oid.rs +++ b/gix-hash/src/oid.rs @@ -1,6 +1,6 @@ use std::{convert::TryInto, fmt, hash}; -use crate::{ObjectId, SIZE_OF_SHA1_DIGEST}; +use crate::{Kind, ObjectId, SIZE_OF_SHA1_DIGEST}; /// A borrowed reference to a hash identifying objects. /// @@ -137,6 +137,15 @@ impl oid { hex_len: self.bytes.len() * 2, } } + + /// Returns `true` if this hash consists of all null bytes. + #[inline] + #[doc(alias = "is_zero", alias = "git2")] + pub fn is_null(&self) -> bool { + match self.kind() { + Kind::Sha1 => &self.bytes == oid::null_sha1().as_bytes(), + } + } } /// Sha1 specific methods diff --git a/gix-hash/tests/oid/mod.rs b/gix-hash/tests/oid/mod.rs index d8449e7a048..7b46e559492 100644 --- a/gix-hash/tests/oid/mod.rs +++ b/gix-hash/tests/oid/mod.rs @@ -13,3 +13,9 @@ mod to_hex_with_len { ); } } + +#[test] +fn is_null() { + assert!(gix_hash::Kind::Sha1.null().is_null()); + assert!(gix_hash::Kind::Sha1.null().as_ref().is_null()); +} From ebe66dbc5720bb47a6c3810333060e17d6b58085 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 13 Nov 2023 13:55:08 +0100 Subject: [PATCH 14/20] feat: add `blob::Platform` and `blob::Pipeline` for diff-content conversions and caching. The `Pipeline` provides ways to obtain content for use with a diffing algorithm, and the `Platform` is a way to cache such content to efficiently perform MxN matrix diffing, and possibly to prepare running external diff programs as well. --- Cargo.lock | 86 +- crate-status.md | 27 +- gix-diff/Cargo.toml | 9 +- gix-diff/src/blob.rs | 18 - gix-diff/src/blob/mod.rs | 134 +++ gix-diff/src/blob/pipeline.rs | 518 ++++++++++ gix-diff/src/blob/platform.rs | 613 ++++++++++++ gix-diff/src/lib.rs | 8 + gix-diff/src/rewrites/tracker.rs | 164 +++- gix-diff/tests/Cargo.toml | 4 + gix-diff/tests/blob/mod.rs | 4 +- gix-diff/tests/blob/pipeline.rs | 925 ++++++++++++++++++ gix-diff/tests/blob/platform.rs | 373 +++++++ gix-diff/tests/diff.rs | 50 + .../generated-archives/make_blob_repo.tar.xz | 3 + .../generated-archives/make_diff_repo.tar.xz | 4 +- gix-diff/tests/fixtures/make_blob_repo.sh | 22 + gix-diff/tests/fixtures/make_diff_repo.sh | 1 - gix-diff/tests/rewrites/mod.rs | 56 ++ gix-diff/tests/rewrites/tracker.rs | 601 ++++++++++++ 20 files changed, 3539 insertions(+), 81 deletions(-) delete mode 100644 gix-diff/src/blob.rs create mode 100644 gix-diff/src/blob/mod.rs create mode 100644 gix-diff/src/blob/pipeline.rs create mode 100644 gix-diff/src/blob/platform.rs create mode 100644 gix-diff/tests/blob/pipeline.rs create mode 100644 gix-diff/tests/blob/platform.rs create mode 100644 gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz create mode 100644 gix-diff/tests/fixtures/make_blob_repo.sh create mode 100644 gix-diff/tests/rewrites/mod.rs create mode 100644 gix-diff/tests/rewrites/tracker.rs diff --git a/Cargo.lock b/Cargo.lock index 6646336f11d..046451adda6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -956,7 +956,7 @@ checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "windows-sys", ] @@ -1525,8 +1525,15 @@ dependencies = [ "bstr", "document-features", "getrandom", + "gix-command", + "gix-filter", + "gix-fs 0.8.0", "gix-hash 0.13.1", "gix-object 0.38.0", + "gix-path 0.10.0", + "gix-tempfile 11.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "gix-trace 0.1.3", + "gix-worktree 0.27.0", "imara-diff", "serde", "thiserror", @@ -1537,11 +1544,15 @@ name = "gix-diff-tests" version = "0.0.0" dependencies = [ "gix-diff", + "gix-filter", + "gix-fs 0.8.0", "gix-hash 0.13.1", "gix-object 0.38.0", "gix-odb", "gix-testtools", "gix-traverse 0.34.0", + "gix-worktree 0.27.0", + "shell-words", ] [[package]] @@ -1614,6 +1625,17 @@ dependencies = [ "walkdir", ] +[[package]] +name = "gix-features" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f4365ba17c4f218d7fd9ec102b8d2d3cb0ca200a835e81151ace7778aec827" +dependencies = [ + "gix-hash 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)", + "gix-trace 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc", +] + [[package]] name = "gix-fetchhead" version = "0.0.0" @@ -1658,6 +1680,15 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-fs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cd171c0cae97cd0dc57e7b4601cb1ebf596450e263ef3c02be9107272c877bd" +dependencies = [ + "gix-features 0.36.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "gix-fsck" version = "0.1.0" @@ -1716,6 +1747,16 @@ dependencies = [ "thiserror", ] +[[package]] +name = "gix-hash" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1884c7b41ea0875217c1be9ce91322f90bde433e91d374d0e1276073a51ccc60" +dependencies = [ + "faster-hex", + "thiserror", +] + [[package]] name = "gix-hashtable" version = "0.2.4" @@ -1803,7 +1844,7 @@ dependencies = [ "itoa", "libc", "memmap2 0.9.0", - "rustix 0.38.20", + "rustix 0.38.21", "serde", "smallvec", "thiserror", @@ -2090,7 +2131,7 @@ dependencies = [ "gix-config-value", "gix-testtools", "parking_lot", - "rustix 0.38.20", + "rustix 0.38.21", "serial_test", "thiserror", ] @@ -2353,6 +2394,20 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-tempfile" +version = "11.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05cc2205cf10d99f70b96e04e16c55d4c7cf33efc151df1f793e29fd12a931f8" +dependencies = [ + "dashmap", + "gix-fs 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "libc", + "once_cell", + "parking_lot", + "tempfile", +] + [[package]] name = "gix-testtools" version = "0.13.0" @@ -2940,7 +2995,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi", - "rustix 0.38.20", + "rustix 0.38.21", "windows-sys", ] @@ -3467,7 +3522,7 @@ checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", "windows-targets 0.48.5", ] @@ -3718,6 +3773,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.0" @@ -3866,9 +3930,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.20" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ "bitflags 2.4.0", "errno", @@ -4295,14 +4359,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.8.0" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ "cfg-if", "fastrand 2.0.1", - "redox_syscall", - "rustix 0.38.20", + "redox_syscall 0.4.1", + "rustix 0.38.21", "windows-sys", ] diff --git a/crate-status.md b/crate-status.md index a6aa1f119eb..406d594f110 100644 --- a/crate-status.md +++ b/crate-status.md @@ -293,16 +293,29 @@ The top-level crate that acts as hub to all functionality provided by the `gix-* Check out the [performance discussion][gix-diff-performance] as well. * **tree** - * [x] changes needed to obtain _other tree_ + * [x] changes needed to obtain _other tree_ * **patches** - * There are various ways to generate a patch from two blobs. - * [ ] any + * There are various ways to generate a patch from two blobs. + * [ ] text + * [ ] binary * **lines** - * [x] Simple line-by-line diffs powered by the `imara-diff` crate. -* diffing, merging, working with hunks of data -* find differences between various states, i.e. index, working tree, commit-tree + * [x] Simple line-by-line diffs powered by the `imara-diff` crate. +* **generic rename tracker to find renames and copies** + * [x] find by exact match + * [x] find by similarity check + * [ ] heuristics to find best candidate + * [ ] find by basename to help detecting simple moves +* **blob** + * [x] a choice of to-worktree, to-git and to-worktree-if-needed conversions + * [x] `textconv` filters + * [x] special handling of files beyond the big-file threshold. + * [x] detection of binary files by looking at header (first 8k bytes) + * [x] caching of diff-able data + * [x] prepare invocation of external diff program + - [ ] pass meta-info +* [ ] working with hunks of data * [x] API documentation - * [ ] Examples + * [ ] Examples [gix-diff-performance]: https://github.com/Byron/gitoxide/discussions/74 diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index 8889f53690b..715f5f96cca 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -13,7 +13,7 @@ autotests = false [features] default = ["blob"] ## Enable diffing of blobs using imara-diff, which also allows for a generic rewrite tracking implementation. -blob = ["dep:imara-diff"] +blob = ["dep:imara-diff", "dep:gix-filter", "dep:gix-worktree", "dep:gix-path", "dep:gix-fs", "dep:gix-command", "dep:gix-tempfile", "dep:gix-trace"] ## Data structures implement `serde::Serialize` and `serde::Deserialize`. serde = ["dep:serde", "gix-hash/serde", "gix-object/serde"] ## Make it possible to compile to the `wasm32-unknown-unknown` target. @@ -25,6 +25,13 @@ doctest = false [dependencies] gix-hash = { version = "^0.13.1", path = "../gix-hash" } gix-object = { version = "^0.38.0", path = "../gix-object" } +gix-filter = { version = "^0.6.0", path = "../gix-filter", optional = true } +gix-worktree = { version = "^0.27.0", path = "../gix-worktree", default-features = false, features = ["attributes"], optional = true } +gix-command = { version = "^0.2.10", path = "../gix-command", optional = true } +gix-path = { version = "^0.10.0", path = "../gix-path", optional = true } +gix-fs = { version = "^0.8.0", path = "../gix-fs", optional = true } +gix-tempfile = { version = "11.0.0", optional = true } +gix-trace = { version = "^0.1.3", path = "../gix-trace", optional = true } thiserror = "1.0.32" imara-diff = { version = "0.1.3", optional = true } diff --git a/gix-diff/src/blob.rs b/gix-diff/src/blob.rs deleted file mode 100644 index 7b2a082bd1e..00000000000 --- a/gix-diff/src/blob.rs +++ /dev/null @@ -1,18 +0,0 @@ -//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), -//! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). -//! -//! -/// Information about the diff performed to detect similarity. -#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)] -pub struct DiffLineStats { - /// The amount of lines to remove from the source to get to the destination. - pub removals: u32, - /// The amount of lines to add to the source to get to the destination. - pub insertions: u32, - /// The amount of lines of the previous state, in the source. - pub before: u32, - /// The amount of lines of the new state, in the destination. - pub after: u32, -} - -pub use imara_diff::*; diff --git a/gix-diff/src/blob/mod.rs b/gix-diff/src/blob/mod.rs new file mode 100644 index 00000000000..2c2453318d0 --- /dev/null +++ b/gix-diff/src/blob/mod.rs @@ -0,0 +1,134 @@ +//! For using text diffs, please have a look at the [`imara-diff` documentation](https://docs.rs/imara-diff), +//! maintained by [Pascal Kuthe](https://github.com/pascalkuthe). +pub use imara_diff::*; +use std::collections::HashMap; +use std::path::PathBuf; + +use bstr::BString; + +/// +pub mod pipeline; + +/// +pub mod platform; + +/// Information about the diff performed to detect similarity. +#[derive(Debug, Default, Clone, Copy, PartialEq, PartialOrd)] +pub struct DiffLineStats { + /// The amount of lines to remove from the source to get to the destination. + pub removals: u32, + /// The amount of lines to add to the source to get to the destination. + pub insertions: u32, + /// The amount of lines of the previous state, in the source. + pub before: u32, + /// The amount of lines of the new state, in the destination. + pub after: u32, + /// A range from 0 to 1.0, where 1.0 is a perfect match and 0.5 is a similarity of 50%. + /// Similarity is the ratio between all lines in the previous blob and the current blob, + /// calculated as `(old_lines_count - new_lines_count) as f32 / old_lines_count.max(new_lines_count) as f32`. + pub similarity: f32, +} + +/// A way to classify a resource suitable for diffing. +#[derive(Copy, Clone, Debug, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub enum ResourceKind { + /// The source of a rewrite, rename or copy operation, or generally the old version of a resource. + OldOrSource, + /// The destination of a rewrite, rename or copy operation, or generally the new version of a resource. + NewOrDestination, +} + +/// A set of values to define how to diff something that is associated with it using `git-attributes`, relevant for regular files. +/// +/// Some values are related to diffing, some are related to conversions. +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub struct Driver { + /// The name of the driver, as referred to by `[diff "name"]` in the git configuration. + pub name: BString, + /// The command to execute to perform the diff entirely like ` old-file old-hex old-mode new-file new-hex new-mode`. + /// + /// Please note that we don't make this call ourselves, but use it to determine that we should not run the our standard + /// built-in algorithm but bail instead as the output of such a program isn't standardized. + pub command: Option, + /// The per-driver algorithm to use. + pub algorithm: Option, + /// The external filter program to call like ` /path/to/blob` which outputs a textual version of the provided + /// binary file. + /// Note that it's invoked with a shell if arguments are given. + /// Further, if present, it will always be executed, whether `is_binary` is set or not. + pub binary_to_text_command: Option, + /// `Some(true)` if this driver deals with binary files, which means that a `binary_to_text_command` should be used to convert binary + /// into a textual representation. + /// Without such a command, anything that is considered binary is not diffed, but only the size of its data is made available. + /// If `Some(false)`, it won't be considered binary, and the its data will not be sampled for the null-byte either. + /// Leaving it to `None` means binary detection is automatic, and is based on the presence of the `0` byte in the first 8kB of the buffer. + pub is_binary: Option, +} + +/// A conversion pipeline to take an object or path from what's stored in `git` to what can be diffed, while +/// following the guidance of git-attributes at the respective path to learn if diffing should happen or if +/// the content is considered binary. +/// +/// There are two different conversion flows, where the target of the flow is a buffer with diffable content: +// TODO: update this with information about possible directions. +/// +/// * `worktree on disk` -> `text conversion` +/// * `object` -> `worktree-filters` -> `text conversion` +#[derive(Clone)] +pub struct Pipeline { + /// A way to read data directly from the worktree. + pub roots: pipeline::WorktreeRoots, + /// A pipeline to convert objects from what's stored in `git` to its worktree version. + pub worktree_filter: gix_filter::Pipeline, + /// Options affecting the way we read files. + pub options: pipeline::Options, + /// Drivers to help customize the conversion behaviour depending on the location of items. + drivers: Vec, + /// Pre-configured attributes to obtain additional diff-related information. + attrs: gix_filter::attributes::search::Outcome, + /// A buffer to manipulate paths + path: PathBuf, +} + +/// A utility for performing a diff of two blobs, including flexible conversions, conversion-caching +/// acquisition of diff information. +/// Note that this instance will not call external filters as their output can't be known programmatically, +/// but it allows to prepare their input if the caller wishes to perform this task. +/// +/// Optimized for NxM lookups with built-in caching. +#[derive(Clone)] +pub struct Platform { + /// The old version of a diff-able blob, if set. + old: Option, + /// The new version of a diff-able blob, if set. + new: Option, + + /// Options to alter how diffs should be performed. + pub options: platform::Options, + /// A way to convert objects into a diff-able format. + pub filter: Pipeline, + /// A way to access .gitattributes + pub attr_stack: gix_worktree::Stack, + + /// The way we convert resources into diffable states. + filter_mode: pipeline::Mode, + /// A continuously growing cache keeping ready-for-diff blobs by their path in the worktree, + /// as that is what affects their final diff-able state. + /// + /// That way, expensive rewrite-checks with NxM matrix checks would be as fast as possible, + /// avoiding duplicate work. + diff_cache: HashMap, +} + +mod impls { + use crate::blob::ResourceKind; + + impl std::fmt::Display for ResourceKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(match self { + ResourceKind::OldOrSource => "old", + ResourceKind::NewOrDestination => "new", + }) + } + } +} diff --git a/gix-diff/src/blob/pipeline.rs b/gix-diff/src/blob/pipeline.rs new file mode 100644 index 00000000000..d0899846d14 --- /dev/null +++ b/gix-diff/src/blob/pipeline.rs @@ -0,0 +1,518 @@ +use crate::blob::{Driver, Pipeline, ResourceKind}; +use bstr::{BStr, ByteSlice}; +use gix_filter::driver::apply::{Delay, MaybeDelayed}; +use gix_filter::pipeline::convert::{ToGitOutcome, ToWorktreeOutcome}; +use gix_object::tree::EntryKind; +use std::io::{Read, Write}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree. +#[derive(Clone, Debug, Default)] +pub struct WorktreeRoots { + /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located. + pub old_root: Option, + /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located. + pub new_root: Option, +} + +impl WorktreeRoots { + /// Return the root path for the given `kind` + pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> { + match kind { + ResourceKind::OldOrSource => self.old_root.as_deref(), + ResourceKind::NewOrDestination => self.new_root.as_deref(), + } + } +} + +/// Data as part of an [Outcome]. +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub enum Data { + /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`]. + Buffer, + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be diffed. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in git, and once it's the size of file + /// in the worktree. + size: u64, + }, +} + +/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). +#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub struct Outcome { + /// If available, an index into the `drivers` field to access more diff-related information of the driver for items + /// at the given path, as previously determined by git-attributes. + /// + /// Note that drivers are queried even if there is no object available. + pub driver_index: Option, + /// The data itself, suitable for diffing, and if the object or worktree item is present at all. + pub data: Option, +} + +/// Options for use in a [`Pipeline`]. +#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] +pub struct Options { + /// The amount of bytes that an object has to reach before being treated as binary. + /// These objects will not be queried, nor will their data be processed in any way. + /// If `0`, no file is ever considered binary due to their size. + /// + /// Note that for files stored in `git`, what counts is their stored, decompressed size, + /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets + /// them + pub large_file_threshold_bytes: u64, + /// Capabilities of the file system which affect how we read worktree files. + pub fs: gix_fs::Capabilities, +} + +/// The specific way to convert a resource. +#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub enum Mode { + /// Always prepare the version of the resource as it would be in the work-tree, and + /// apply binary-to-text filters if present. + /// + /// This is typically free for resources in the worktree, and will apply filters to resources in the + /// object database. + #[default] + ToWorktreeAndBinaryToText, + /// Prepare the version of the resource as it would be in the work-tree if + /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise. + ToGitUnlessBinaryToTextIsPresent, + /// Always prepare resources as they are stored in `git`. + /// + /// This is usually fastest, even though resources in the worktree needed to be converted files. + ToGit, +} + +impl Mode { + fn to_worktree(self) -> bool { + matches!( + self, + Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText + ) + } + + fn to_git(self) -> bool { + matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit) + } +} + +/// +pub mod convert_to_diffable { + use bstr::BString; + use gix_object::tree::EntryKind; + + /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] + InvalidEntryKind { rela_path: BString, actual: EntryKind }, + #[error("Entry at '{rela_path}' could not be read as symbolic link")] + ReadLink { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be opened for reading or read from")] + OpenOrRead { rela_path: BString, source: std::io::Error }, + #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] + StreamCopy { rela_path: BString, source: std::io::Error }, + #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")] + RunTextConvFilter { + rela_path: BString, + cmd: String, + source: std::io::Error, + }, + #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")] + CreateTempfile { rela_path: BString, source: std::io::Error }, + #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")] + TextConvFilterFailed { + rela_path: BString, + cmd: String, + stderr: BString, + }, + #[error(transparent)] + FindObject(#[from] gix_object::find::existing_object::Error), + #[error(transparent)] + ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), + #[error(transparent)] + ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), + } +} + +/// Lifecycle +impl Pipeline { + /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise + /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths. + /// `options` are used to further configure the way we act.. + pub fn new( + roots: WorktreeRoots, + worktree_filter: gix_filter::Pipeline, + mut drivers: Vec, + options: Options, + ) -> Self { + drivers.sort_by(|a, b| a.name.cmp(&b.name)); + Pipeline { + roots, + worktree_filter, + drivers, + options, + attrs: { + let mut out = gix_filter::attributes::search::Outcome::default(); + out.initialize_with_selection(&Default::default(), Some("diff")); + out + }, + path: Default::default(), + } + } +} + +/// Access +impl Pipeline { + /// Return all drivers that this instance was initialized with. + pub fn drivers(&self) -> &[super::Driver] { + &self.drivers + } +} + +/// Conversion +impl Pipeline { + /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. + /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`] + /// contains information on how to use `out`, or if it's filled at all. + /// + /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is + /// a resource in the object database, i.e. has no worktree root available. + /// + /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case + /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. + /// + /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. + /// + /// Use `convert` to control what kind of the resource will be produced. + /// + /// ### About Tempfiles + /// + /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set, + /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that + /// exactly as it would be present in the worktree if checked out. + /// + /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with + /// a signal handler. If they leak, they would remain in the system's `$TMP` directory. + #[allow(clippy::too_many_arguments)] + pub fn convert_to_diffable( + &mut self, + id: &gix_hash::oid, + mode: EntryKind, + rela_path: &BStr, + kind: ResourceKind, + attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), + objects: &dyn gix_object::FindObjectOrHeader, + convert: Mode, + out: &mut Vec, + ) -> Result { + let is_symlink = match mode { + EntryKind::Link if self.options.fs.symlink => true, + EntryKind::Blob | EntryKind::BlobExecutable => false, + _ => { + return Err(convert_to_diffable::Error::InvalidEntryKind { + rela_path: rela_path.to_owned(), + actual: mode, + }) + } + }; + + out.clear(); + attributes(rela_path, &mut self.attrs); + let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); + let driver_index = attr + .assignment + .state + .as_bstr() + .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok()); + let driver = driver_index.map(|idx| &self.drivers[idx]); + let mut is_binary = if let Some(driver) = driver { + driver + .is_binary + .map(|is_binary| is_binary && driver.binary_to_text_command.is_none()) + } else { + attr.assignment.state.is_unset().then_some(true) + }; + match self.roots.by_kind(kind) { + Some(root) => { + self.path.clear(); + self.path.push(root); + self.path.push(gix_path::from_bstr(rela_path)); + let data = if is_symlink { + let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| { + convert_to_diffable::Error::ReadLink { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + target.map(|target| { + out.extend_from_slice(gix_path::into_bstr(target).as_ref()); + Data::Buffer + }) + } else { + let need_size_only = is_binary == Some(true); + let size_in_bytes = (need_size_only + || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0)) + .then(|| { + none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + }) + }) + .transpose()?; + match size_in_bytes { + Some(None) => None, // missing as identified by the size check + Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => { + Some(Data::Binary { size }) + } + _ => { + match driver + .filter(|_| convert.to_worktree()) + .and_then(|d| d.prepare_binary_to_text_cmd(&self.path)) + { + Some(cmd) => { + // Avoid letting the driver program fail if it doesn't exist. + if self.options.large_file_threshold_bytes == 0 + && none_if_missing(std::fs::symlink_metadata(&self.path)) + .map_err(|err| convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + })? + .is_none() + { + None + } else { + run_cmd(rela_path, cmd, out)?; + Some(Data::Buffer) + } + } + None => { + let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + + match file { + Some(mut file) => { + if convert.to_git() { + let res = self.worktree_filter.convert_to_git( + file, + gix_path::from_bstr(rela_path).as_ref(), + attributes, + &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), + )?; + + match res { + ToGitOutcome::Unchanged(mut file) => { + file.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Process(mut stream) => { + stream.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToGitOutcome::Buffer(buf) => { + out.resize(buf.len(), 0); + out.copy_from_slice(buf); + } + } + } else { + file.read_to_end(out).map_err(|err| { + convert_to_diffable::Error::OpenOrRead { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + + Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + }) + } + None => None, + } + } + } + } + } + }; + Ok(Outcome { driver_index, data }) + } + None => { + let data = if id.is_null() { + None + } else { + let header = objects + .try_header(id) + .map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + if is_binary.is_none() && self.options.large_file_threshold_bytes > 0 { + is_binary = Some(header.size > self.options.large_file_threshold_bytes); + }; + let data = if is_binary == Some(true) { + Data::Binary { size: header.size } + } else { + objects + .try_find(id, out) + .map_err(gix_object::find::existing_object::Error::Find)? + .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; + if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) + && convert == Mode::ToWorktreeAndBinaryToText + || (convert == Mode::ToGitUnlessBinaryToTextIsPresent + && driver.map_or(false, |d| d.binary_to_text_command.is_some())) + { + let res = + self.worktree_filter + .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; + + let cmd_and_file = driver + .and_then(|d| { + d.binary_to_text_command.is_some().then(|| { + gix_tempfile::new( + std::env::temp_dir(), + gix_tempfile::ContainingDirectory::Exists, + gix_tempfile::AutoRemove::Tempfile, + ) + .and_then(|mut tmp_file| { + self.path.clear(); + tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?; + Ok(tmp_file) + }) + .map(|tmp_file| { + ( + d.prepare_binary_to_text_cmd(&self.path) + .expect("always get cmd if command is set"), + tmp_file, + ) + }) + }) + }) + .transpose() + .map_err(|err| convert_to_diffable::Error::CreateTempfile { + source: err, + rela_path: rela_path.to_owned(), + })?; + match cmd_and_file { + Some((cmd, mut tmp_file)) => { + tmp_file.write_all(out).map_err(|err| { + convert_to_diffable::Error::CreateTempfile { + source: err, + rela_path: rela_path.to_owned(), + } + })?; + out.clear(); + run_cmd(rela_path, cmd, out)?; + } + None => { + match res { + ToWorktreeOutcome::Unchanged(_) => {} + ToWorktreeOutcome::Buffer(src) => { + out.resize(src.len(), 0); + out.copy_from_slice(src); + } + ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { + std::io::copy(&mut stream, out).map_err(|err| { + convert_to_diffable::Error::StreamCopy { + rela_path: rela_path.to_owned(), + source: err, + } + })?; + } + ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { + unreachable!("we prohibit this") + } + }; + } + } + } + + if driver.map_or(true, |d| d.binary_to_text_command.is_none()) + && is_binary.unwrap_or_else(|| is_binary_buf(out)) + { + let size = out.len() as u64; + out.clear(); + Data::Binary { size } + } else { + Data::Buffer + } + }; + Some(data) + }; + Ok(Outcome { driver_index, data }) + } + } + } +} + +fn is_binary_buf(buf: &[u8]) -> bool { + let buf = &buf[..buf.len().min(8000)]; + buf.contains(&0) +} + +fn none_if_missing(res: std::io::Result) -> std::io::Result> { + match res { + Ok(data) => Ok(Some(data)), + Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(err) => Err(err), + } +} + +fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec) -> Result<(), convert_to_diffable::Error> { + gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command"); + let mut res = cmd + .output() + .map_err(|err| convert_to_diffable::Error::RunTextConvFilter { + rela_path: rela_path.to_owned(), + cmd: format!("{cmd:?}"), + source: err, + })?; + if !res.status.success() { + return Err(convert_to_diffable::Error::TextConvFilterFailed { + rela_path: rela_path.to_owned(), + cmd: format!("{cmd:?}"), + stderr: res.stderr.into(), + }); + } + out.append(&mut res.stdout); + Ok(()) +} + +impl Driver { + /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`. + pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option { + let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref(); + let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned()) + .with_shell() + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .arg(path) + .into(); + Some(cmd) + } +} diff --git a/gix-diff/src/blob/platform.rs b/gix-diff/src/blob/platform.rs new file mode 100644 index 00000000000..29615cd55fe --- /dev/null +++ b/gix-diff/src/blob/platform.rs @@ -0,0 +1,613 @@ +use super::Algorithm; +use crate::blob::{pipeline, Pipeline, Platform, ResourceKind}; +use bstr::{BStr, BString, ByteSlice}; +use std::io::Write; +use std::process::Stdio; + +/// A key to uniquely identify either a location in the worktree, or in the object database. +#[derive(Clone)] +pub(crate) struct CacheKey { + id: gix_hash::ObjectId, + location: BString, + /// If `true`, this is an `id` based key, otherwise it's location based. + use_id: bool, + /// Only relevant when `id` is not null, to further differentiate content and allow us to + /// keep track of both links and blobs with the same content (rare, but possible). + is_link: bool, +} + +/// A stored value representing a diffable resource. +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] +pub(crate) struct CacheValue { + /// The outcome of converting a resource into a diffable format using [Pipeline::convert_to_diffable()]. + conversion: pipeline::Outcome, + /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. + mode: gix_object::tree::EntryKind, + /// A possibly empty buffer, depending on `conversion.data` which may indicate the data is considered binary. + buffer: Vec, +} + +impl std::hash::Hash for CacheKey { + fn hash(&self, state: &mut H) { + if self.use_id { + self.id.hash(state); + self.is_link.hash(state) + } else { + self.location.hash(state) + } + } +} + +impl PartialEq for CacheKey { + fn eq(&self, other: &Self) -> bool { + match (self.use_id, other.use_id) { + (false, false) => self.location.eq(&other.location), + (true, true) => self.id.eq(&other.id) && self.is_link.eq(&other.is_link), + _ => false, + } + } +} + +impl Eq for CacheKey {} + +impl Default for CacheKey { + fn default() -> Self { + CacheKey { + id: gix_hash::Kind::Sha1.null(), + use_id: false, + is_link: false, + location: BString::default(), + } + } +} + +impl CacheKey { + fn set_location(&mut self, rela_path: &BStr) { + self.location.clear(); + self.location.extend_from_slice(rela_path); + } +} + +/// A resource ready to be diffed in one way or another. +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct Resource<'a> { + /// If available, an index into the `drivers` field to access more diff-related information of the driver for items + /// at the given path, as previously determined by git-attributes. + /// + /// Note that drivers are queried even if there is no object available. + pub driver_index: Option, + /// The data itself, suitable for diffing, and if the object or worktree item is present at all. + pub data: resource::Data<'a>, + /// The kind of the resource we are looking at. Only possible values are `Blob`, `BlobExecutable` and `Link`. + pub mode: gix_object::tree::EntryKind, + /// The location of the resource, relative to the working tree. + pub rela_path: &'a BStr, + /// The id of the content as it would be stored in `git`, or `null` if the content doesn't exist anymore at + /// `rela_path` or if it was never computed. This can happen with content read from the worktree, which has to + /// go through a filter to be converted back to what `git` would store. + pub id: &'a gix_hash::oid, +} + +/// +pub mod resource { + use crate::blob::pipeline; + use crate::blob::platform::{CacheKey, CacheValue, Resource}; + + impl<'a> Resource<'a> { + pub(crate) fn new(key: &'a CacheKey, value: &'a CacheValue) -> Self { + Resource { + driver_index: value.conversion.driver_index, + data: value.conversion.data.map_or(Data::Missing, |data| match data { + pipeline::Data::Buffer => Data::Buffer(&value.buffer), + pipeline::Data::Binary { size } => Data::Binary { size }, + }), + mode: value.mode, + rela_path: key.location.as_ref(), + id: &key.id, + } + } + + /// Produce an iterator over lines, separated by LF or CRLF, suitable to create tokens using + /// [`imara_diff::intern::InternedInput`]. + pub fn intern_source(&self) -> imara_diff::sources::ByteLines<'a, true> { + crate::blob::sources::byte_lines_with_terminator(self.data.as_slice().unwrap_or_default()) + } + } + + /// The data of a diffable resource, as it could be determined and computed previously. + #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] + pub enum Data<'a> { + /// The object is missing, either because it didn't exist in the working tree or because its `id` was null. + Missing, + /// The textual data as processed to be in a diffable state. + Buffer(&'a [u8]), + /// The size that the binary blob had at the given revision, without having applied filters, as it's either + /// considered binary or above the big-file threshold. + /// + /// In this state, the binary file cannot be diffed. + Binary { + /// The size of the object prior to performing any filtering or as it was found on disk. + /// + /// Note that technically, the size isn't always representative of the same 'state' of the + /// content, as once it can be the size of the blob in git, and once it's the size of file + /// in the worktree. + size: u64, + }, + } + + impl<'a> Data<'a> { + /// Return ourselves as slice of bytes if this instance stores data. + pub fn as_slice(&self) -> Option<&'a [u8]> { + match self { + Data::Buffer(d) => Some(d), + Data::Binary { .. } | Data::Missing => None, + } + } + } +} + +/// +pub mod set_resource { + use crate::blob::{pipeline, ResourceKind}; + use bstr::BString; + + /// The error returned by [Platform::set_resource](super::Platform::set_resource). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Can only diff blobs and links, not {mode:?}")] + InvalidMode { mode: gix_object::tree::EntryKind }, + #[error("Failed to read {kind} worktree data from '{rela_path}'")] + Io { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error("Failed to obtain attributes for {kind} resource at '{rela_path}'")] + Attributes { + rela_path: BString, + kind: ResourceKind, + source: std::io::Error, + }, + #[error(transparent)] + ConvertToDiffable(#[from] pipeline::convert_to_diffable::Error), + } +} + +/// +pub mod prepare_diff { + use crate::blob::platform::Resource; + use bstr::BStr; + + /// The kind of operation that was performed during the [`diff`](super::Platform::prepare_diff()) operation. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum Operation<'a> { + /// The [internal diff algorithm](imara_diff::diff) should be called with the provided arguments. + /// This only happens if none of the resources are binary, and if there is no external diff program configured via git-attributes + /// *or* [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) + /// is `false`. + /// + /// Use [`Outcome::interned_input()`] to easily obtain an interner for use with [`imara_diff::diff()`], or maintain one yourself + /// for greater re-use. + InternalDiff { + /// The algorithm we determined should be used, which is one of (in order, first set one wins): + /// + /// * the driver's override + /// * the platforms own configuration (typically from git-config) + /// * the default algorithm + algorithm: imara_diff::Algorithm, + }, + /// Run the external diff program according as configured in the `source`-resources driver. + /// This only happens if [Options::skip_internal_diff_if_external_is_configured](super::Options::skip_internal_diff_if_external_is_configured) + /// was `true`, preventing the usage of the internal diff implementation. + ExternalCommand { + /// The command as extracted from [Driver::command](super::super::Driver::command). + /// Use it in [`Platform::prepare_diff_command`](super::Platform::prepare_diff_command()) to easily prepare a compatible invocation. + command: &'a BStr, + }, + /// One of the involved resources, [`old`](Outcome::old) or [`new`](Outcome::new), were binary and thus no diff + /// cannot be performed. + SourceOrDestinationIsBinary, + } + + /// The outcome of a [`prepare_diff`](super::Platform::prepare_diff()) operation. + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub struct Outcome<'a> { + /// The kind of diff that was actually performed. This may include skipping the internal diff as well. + pub operation: Operation<'a>, + /// The old or source of the diff operation. + pub old: Resource<'a>, + /// The new or destination of the diff operation. + pub new: Resource<'a>, + } + + impl<'a> Outcome<'a> { + /// Produce an instance of an interner which `git` would use to perform diffs. + pub fn interned_input(&self) -> imara_diff::intern::InternedInput<&'a [u8]> { + crate::blob::intern::InternedInput::new(self.old.intern_source(), self.new.intern_source()) + } + } + + /// The error returned by [Platform::prepare_diff()](super::Platform::prepare_diff()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Either the source or the destination of the diff operation were not set")] + SourceOrDestinationUnset, + #[error("Tried to diff resources that are both considered removed")] + SourceAndDestinationRemoved, + } +} + +/// +pub mod prepare_diff_command { + use bstr::BString; + use std::ops::{Deref, DerefMut}; + + /// The error returned by [Platform::prepare_diff_command()](super::Platform::prepare_diff_command()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Either the source or the destination of the diff operation were not set")] + SourceOrDestinationUnset, + #[error("Binary resources can't be diffed with an external command (as we don't have the data anymore)")] + SourceOrDestinationBinary, + #[error( + "Tempfile to store content of '{rela_path}' for passing to external diff command could not be created" + )] + CreateTempfile { rela_path: BString, source: std::io::Error }, + #[error("Could not write content of '{rela_path}' to tempfile for passing to external diff command")] + WriteTempfile { rela_path: BString, source: std::io::Error }, + } + + /// The outcome of a [`prepare_diff_command`](super::Platform::prepare_diff_command()) operation. + /// + /// This type acts like [`std::process::Command`], ready to run, with `stdin`, `stdout` and `stderr` set to *inherit* + /// all handles as this is expected to be for visual inspection. + pub struct Command { + pub(crate) cmd: std::process::Command, + /// Possibly a tempfile to be removed after the run, or `None` if there is no old version. + pub(crate) old: Option>, + /// Possibly a tempfile to be removed after the run, or `None` if there is no new version. + pub(crate) new: Option>, + } + + impl Deref for Command { + type Target = std::process::Command; + + fn deref(&self) -> &Self::Target { + &self.cmd + } + } + + impl DerefMut for Command { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.cmd + } + } +} + +/// Options for use in [Platform::new()]. +#[derive(Default, Copy, Clone)] +pub struct Options { + /// The algorithm to use when diffing. + /// If unset, it uses the [default algorithm](Algorithm::default()). + pub algorithm: Option, + /// If `true`, default `false`, then an external `diff` configured using gitattributes and drivers, + /// will cause the built-in diff [to be skipped](prepare_diff::Operation::ExternalCommand). + /// Otherwise, the internal diff is called despite the configured external diff, which is + /// typically what callers expect by default. + pub skip_internal_diff_if_external_is_configured: bool, +} + +/// Lifecycle +impl Platform { + /// Create a new instance with `options`, and a way to `filter` data from the object database to data that is diff-able. + /// `filter_mode` decides how to do that specifically. + /// Use `attr_stack` to access attributes pertaining worktree filters and diff settings. + pub fn new( + options: Options, + filter: Pipeline, + filter_mode: pipeline::Mode, + attr_stack: gix_worktree::Stack, + ) -> Self { + Platform { + old: None, + new: None, + diff_cache: Default::default(), + options, + filter, + filter_mode, + attr_stack, + } + } +} + +/// Conversions +impl Platform { + /// Store enough information about a resource to eventually diff it, where… + /// + /// * `id` is the hash of the resource. If it [is null](gix_hash::ObjectId::is_null()), it should either + /// be a resource in the worktree, or it's considered a non-existing, deleted object. + /// If an `id` is known, as the hash of the object as (would) be stored in `git`, then it should be provided + /// for completeness. + /// * `mode` is the kind of object (only blobs and links are allowed) + /// * `rela_path` is the relative path as seen from the (work)tree root. + /// * `kind` identifies the side of the diff this resource will be used for. + /// A diff needs both `OldOrSource` *and* `NewOrDestination`. + /// * `objects` provides access to the object database in case the resource can't be read from a worktree. + /// + /// Note that it's assumed that either `id + mode (` or `rela_path` can serve as unique identifier for the resource, + /// depending on whether or not a [worktree root](pipeline::WorktreeRoots) is set for the resource of `kind`, + /// with resources with worktree roots using the `rela_path` as unique identifier. + /// + /// ### Important + /// + /// If an error occours, the previous resource of `kind` will be cleared, preventing further diffs + /// unless another attempt succeeds. + pub fn set_resource( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable + ) -> Result<(), set_resource::Error> { + let res = self.set_resource_inner(id, mode, rela_path, kind, objects); + if res.is_err() { + *match kind { + ResourceKind::OldOrSource => &mut self.old, + ResourceKind::NewOrDestination => &mut self.new, + } = None; + } + res + } + + /// Given `diff_command` and `context`, typically obtained from git-configuration, and the currently set diff-resources, + /// prepare the invocation and temporary files needed to launch it according to protocol. + /// `count` / `total` are used for progress indication passed as environment variables `GIT_DIFF_PATH_(COUNTER|TOTAL)` + /// respectively (0-based), so the first path has `count=0` and `total=1` (assuming there is only one path). + /// Returns `None` if at least one resource is unset, see [`set_resource()`](Self::set_resource()). + /// + /// Please note that this is an expensive operation this will always create up to two temporary files to hold the data + /// for the old and new resources. + /// + /// ### Deviation + /// + /// If one of the resources is binary, the operation reports an error as such resources don't make their data available + /// which is required for the external diff to run. + pub fn prepare_diff_command( + &self, + diff_command: BString, + context: gix_command::Context, + count: usize, + total: usize, + ) -> Result { + fn add_resource( + cmd: &mut std::process::Command, + res: Resource<'_>, + ) -> Result>, prepare_diff_command::Error> { + let tmpfile = match res.data { + resource::Data::Missing => { + cmd.args(["/dev/null", ".", "."]); + None + } + resource::Data::Buffer(buf) => { + let mut tmp = gix_tempfile::new( + std::env::temp_dir(), + gix_tempfile::ContainingDirectory::Exists, + gix_tempfile::AutoRemove::Tempfile, + ) + .map_err(|err| prepare_diff_command::Error::CreateTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + tmp.write_all(buf) + .map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + tmp.with_mut(|f| { + cmd.arg(f.path()); + }) + .map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + cmd.arg(res.id.to_string()).arg(res.mode.as_octal_str().to_string()); + let tmp = tmp.close().map_err(|err| prepare_diff_command::Error::WriteTempfile { + rela_path: res.rela_path.to_owned(), + source: err, + })?; + Some(tmp) + } + resource::Data::Binary { .. } => return Err(prepare_diff_command::Error::SourceOrDestinationBinary), + }; + Ok(tmpfile) + } + + let (old, new) = self + .resources() + .ok_or(prepare_diff_command::Error::SourceOrDestinationUnset)?; + let mut cmd: std::process::Command = gix_command::prepare(gix_path::from_bstring(diff_command)) + .with_context(context) + .env("GIT_DIFF_PATH_COUNTER", (count + 1).to_string()) + .env("GIT_DIFF_PATH_TOTAL", total.to_string()) + .stdin(Stdio::inherit()) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()) + .into(); + + cmd.arg(gix_path::from_bstr(old.rela_path).into_owned()); + let mut out = prepare_diff_command::Command { + cmd, + old: None, + new: None, + }; + + out.old = add_resource(&mut out.cmd, old)?; + out.new = add_resource(&mut out.cmd, new)?; + + if old.rela_path != new.rela_path { + out.cmd.arg(gix_path::from_bstr(new.rela_path).into_owned()); + } + + Ok(out) + } + + /// Returns the resource of the given kind if it was set. + pub fn resource(&self, kind: ResourceKind) -> Option> { + let key = match kind { + ResourceKind::OldOrSource => self.old.as_ref(), + ResourceKind::NewOrDestination => self.new.as_ref(), + }?; + Resource::new(key, self.diff_cache.get(key)?).into() + } + + /// Obtain the two resources that were previously set as `(OldOrSource, NewOrDestination)`, if both are set and available. + /// + /// This is useful if one wishes to manually prepare the diff, maybe for invoking external programs, instead of relying on + /// [`Self::prepare_diff()`]. + pub fn resources(&self) -> Option<(Resource<'_>, Resource<'_>)> { + let key = &self.old.as_ref()?; + let value = self.diff_cache.get(key)?; + let old = Resource::new(key, value); + + let key = &self.new.as_ref()?; + let value = self.diff_cache.get(key)?; + let new = Resource::new(key, value); + Some((old, new)) + } + + /// Prepare a diff operation on the [previously set](Self::set_resource()) [old](ResourceKind::OldOrSource) and + /// [new](ResourceKind::NewOrDestination) resources. + /// + /// The returned outcome allows to easily perform diff operations, based on the [`prepare_diff::Outcome::operation`] field, + /// which hints at what should be done. + pub fn prepare_diff(&mut self) -> Result, prepare_diff::Error> { + let old_key = &self.old.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let old = self + .diff_cache + .get(old_key) + .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let new_key = &self.new.as_ref().ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let new = self + .diff_cache + .get(new_key) + .ok_or(prepare_diff::Error::SourceOrDestinationUnset)?; + let mut out = prepare_diff::Outcome { + operation: prepare_diff::Operation::SourceOrDestinationIsBinary, + old: Resource::new(old_key, old), + new: Resource::new(new_key, new), + }; + + match (old.conversion.data, new.conversion.data) { + (None, None) => return Err(prepare_diff::Error::SourceAndDestinationRemoved), + (Some(pipeline::Data::Binary { .. }), _) | (_, Some(pipeline::Data::Binary { .. })) => return Ok(out), + _either_missing_or_non_binary => { + if let Some(command) = old + .conversion + .driver_index + .and_then(|idx| self.filter.drivers[idx].command.as_deref()) + .filter(|_| self.options.skip_internal_diff_if_external_is_configured) + { + out.operation = prepare_diff::Operation::ExternalCommand { + command: command.as_bstr(), + }; + return Ok(out); + } + } + } + + out.operation = prepare_diff::Operation::InternalDiff { + algorithm: old + .conversion + .driver_index + .and_then(|idx| self.filter.drivers[idx].algorithm) + .or(self.options.algorithm) + .unwrap_or_default(), + }; + Ok(out) + } + + /// Every call to [set_resource()](Self::set_resource()) will keep the diffable data in memory, and that will never be cleared. + /// + /// Use this method to clear the cache, releasing memory. Note that this will also loose all information about resources + /// which means diffs would fail unless the resources are set again. + /// + /// Note that this also has to be called if the same resource is going to be diffed in different states, i.e. using different + /// `id`s, but the same `rela_path`. + pub fn clear_resource_cache(&mut self) { + self.old = None; + self.new = None; + self.diff_cache.clear(); + } +} + +impl Platform { + fn set_resource_inner( + &mut self, + id: gix_hash::ObjectId, + mode: gix_object::tree::EntryKind, + rela_path: &BStr, + kind: ResourceKind, + objects: &impl gix_object::FindObjectOrHeader, + ) -> Result<(), set_resource::Error> { + if matches!( + mode, + gix_object::tree::EntryKind::Commit | gix_object::tree::EntryKind::Tree + ) { + return Err(set_resource::Error::InvalidMode { mode }); + } + let storage = match kind { + ResourceKind::OldOrSource => &mut self.old, + ResourceKind::NewOrDestination => &mut self.new, + } + .get_or_insert_with(Default::default); + + storage.id = id; + storage.set_location(rela_path); + storage.is_link = matches!(mode, gix_object::tree::EntryKind::Link); + storage.use_id = self.filter.roots.by_kind(kind).is_none(); + + if self.diff_cache.contains_key(storage) { + return Ok(()); + } + let entry = self + .attr_stack + .at_entry(rela_path, Some(false), objects) + .map_err(|err| set_resource::Error::Attributes { + source: err, + kind, + rela_path: rela_path.to_owned(), + })?; + let mut buf = Vec::new(); + let out = self.filter.convert_to_diffable( + &id, + mode, + rela_path, + kind, + &mut |_, out| { + let _ = entry.matching_attributes(out); + }, + objects, + self.filter_mode, + &mut buf, + )?; + let key = storage.clone(); + assert!( + self.diff_cache + .insert( + key, + CacheValue { + conversion: out, + mode, + buffer: buf, + }, + ) + .is_none(), + "The key impl makes clashes impossible with our usage" + ); + Ok(()) + } +} diff --git a/gix-diff/src/lib.rs b/gix-diff/src/lib.rs index 1ad5b1478db..c055e45ad34 100644 --- a/gix-diff/src/lib.rs +++ b/gix-diff/src/lib.rs @@ -8,6 +8,14 @@ #![deny(missing_docs, rust_2018_idioms)] #![forbid(unsafe_code)] +/// Re-export for use in public API. +#[cfg(feature = "blob")] +pub use gix_command as command; + +/// Re-export for use in public API. +#[cfg(feature = "blob")] +pub use gix_object as object; + /// A structure to capture how to perform rename and copy tracking, used by the [rewrites::Tracker]. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg(feature = "blob")] diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs index 09d3c724608..da8a7bb1149 100644 --- a/gix-diff/src/rewrites/tracker.rs +++ b/gix-diff/src/rewrites/tracker.rs @@ -1,3 +1,9 @@ +//! ### Deviation +//! +//! Note that the algorithm implemented here is in many ways different from what `git` does. +//! +//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match. +//! - the set used for copy-detection is probably smaller by default. use std::ops::Range; use gix_object::tree::{EntryKind, EntryMode}; @@ -22,6 +28,9 @@ pub enum ChangeKind { /// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`]. pub trait Change: Clone { /// Return the hash of this change for identification. + /// + /// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace + /// conversions. fn id(&self) -> &gix_hash::oid; /// Return the kind of this change. fn kind(&self) -> ChangeKind; @@ -71,6 +80,7 @@ pub mod visit { use gix_object::tree::EntryMode; /// The source of a rewrite, rename or copy. + #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Source<'a> { /// The kind of entry. pub entry_mode: EntryMode, @@ -85,7 +95,7 @@ pub mod visit { } /// Further identify the kind of [Source]. - #[derive(Debug, Copy, Clone, Eq, PartialEq)] + #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] pub enum SourceKind { /// This is the source of an entry that was renamed, as `source` was renamed to `destination`. Rename, @@ -94,7 +104,8 @@ pub mod visit { } /// A change along with a location. - pub struct Destination<'a, T> { + #[derive(Clone)] + pub struct Destination<'a, T: Clone> { /// The change at the given `location`. pub change: T, /// The repository-relative location of this destination. @@ -158,14 +169,18 @@ impl Tracker { None } - /// Can only be called once effectively as it alters its own state. + /// Can only be called once effectively as it alters its own state to assure each item is only emitted once. /// /// `cb(destination, source)` is called for each item, either with `Some(source)` if it's /// the destination of a copy or rename, or with `None` for source if no relation to other - /// items in the tracked set exist. + /// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened. /// /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. - /// Worktree filters and diff conversions will be applied afterwards automatically. + /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not* + /// be enabled as caching is implemented internally, after all, the blob that's actually diffed is going through conversion steps. + /// + /// Use `worktree_filter` to obtain working-tree versions of files present on disk before diffing to see if rewrites happened, + /// with text-conversions being applied afterwards. /// /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` @@ -174,6 +189,7 @@ impl Tracker { &mut self, mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, objects: &dyn gix_object::Find, + _worktree_filter: &mut gix_filter::Pipeline, mut push_source_tree: PushSourceTreeFn, ) -> Result where @@ -192,16 +208,16 @@ impl Tracker { options: self.rewrites, ..Default::default() }; - out = self.match_pairs_of_kind( + self.match_pairs_of_kind( visit::SourceKind::Rename, &mut cb, self.rewrites.percentage, - out, + &mut out, objects, )?; if let Some(copies) = self.rewrites.copies { - out = self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?; match copies.source { CopySource::FromSetOfModifiedFiles => {} @@ -217,8 +233,7 @@ impl Tracker { .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; self.items.sort_by(by_id_and_location); - out = - self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, out, objects)?; + self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?; } } } @@ -247,36 +262,40 @@ impl Tracker { kind: visit::SourceKind, cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, percentage: Option, - mut out: Outcome, + out: &mut Outcome, objects: &dyn gix_object::Find, - ) -> Result { + ) -> Result<(), emit::Error> { // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. let needs_second_pass = !needs_exact_match(percentage); - if self.match_pairs(cb, None /* by identity */, kind, &mut out, objects)? == crate::tree::visit::Action::Cancel - { - return Ok(out); + if self.match_pairs(cb, None /* by identity */, kind, out, objects)? == crate::tree::visit::Action::Cancel { + return Ok(()); } if needs_second_pass { let is_limited = if self.rewrites.limit == 0 { false - } else if let Some(permutations) = permutations_over_limit(&self.items, self.rewrites.limit, kind) { - match kind { - visit::SourceKind::Rename => { - out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; - } - visit::SourceKind::Copy => { - out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } else { + let (num_src, num_dst) = + estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind); + let permutations = num_src * num_dst; + if permutations > self.rewrites.limit { + match kind { + visit::SourceKind::Rename => { + out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations; + } + visit::SourceKind::Copy => { + out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations; + } } + true + } else { + false } - true - } else { - false }; if !is_limited { - self.match_pairs(cb, percentage, kind, &mut out, objects)?; + self.match_pairs(cb, percentage, kind, out, objects)?; } } - Ok(out) + Ok(()) } fn match_pairs( @@ -338,17 +357,23 @@ impl Tracker { } } -fn permutations_over_limit(items: &[Item], limit: usize, kind: visit::SourceKind) -> Option { - let (sources, destinations) = items - .iter() - .filter(|item| match kind { - visit::SourceKind::Rename => !item.emitted, +/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation. +fn estimate_involved_items( + items: impl IntoIterator, + kind: visit::SourceKind, +) -> (usize, usize) { + items + .into_iter() + .filter(|(emitted, _)| match kind { + visit::SourceKind::Rename => !*emitted, visit::SourceKind::Copy => true, }) - .fold((0, 0), |(mut src, mut dest), item| { - match item.change.kind() { + .fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| { + match change_kind { ChangeKind::Addition => { - dest += 1; + if kind == visit::SourceKind::Rename || !emitted { + dest += 1; + } } ChangeKind::Deletion => { if kind == visit::SourceKind::Rename { @@ -362,9 +387,7 @@ fn permutations_over_limit(items: &[Item], limit: usize, kind: vis } } (src, dest) - }); - let permutations = sources * destinations; - (permutations > limit * limit).then_some(permutations) + }) } fn needs_exact_match(percentage: Option) -> bool { @@ -417,7 +440,7 @@ fn find_match<'a, T: Change>( return Ok(Some(src)); } } else { - let new = objects.find_blob(item_id, buf1)?; + let mut new = None; let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); debug_assert_eq!( item.change.entry_mode().kind(), @@ -429,6 +452,13 @@ fn find_match<'a, T: Change>( .enumerate() .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) { + let new = match &new { + Some(new) => new, + None => { + new = objects.find_blob(item_id, buf1)?.into(); + new.as_ref().expect("just set") + } + }; let old = objects.find_blob(src.change.id(), buf2)?; // TODO: make sure we get attribute handling/worktree conversion and binary skips and filters right here. let tokens = crate::blob::intern::InternedInput::new( @@ -454,6 +484,7 @@ fn find_match<'a, T: Change>( insertions: counts.insertions, before: tokens.before.len().try_into().expect("interner handles only u32"), after: tokens.after.len().try_into().expect("interner handles only u32"), + similarity, } .into(), ))); @@ -486,3 +517,58 @@ mod diff { } } } + +#[cfg(test)] +mod estimate_involved_items { + use super::estimate_involved_items; + use crate::rewrites::tracker::visit::SourceKind; + use crate::rewrites::tracker::ChangeKind; + + #[test] + fn renames_count_unemitted_as_sources_and_destinations() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Deletion), + (true, ChangeKind::Deletion), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Rename), + (0, 1), + "here we only have one eligible source, hence nothing to do" + ); + assert_eq!( + estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename), + (2, 1), + "now we have more possibilities as renames count un-emitted deletions as source" + ); + } + + #[test] + fn copies_do_not_count_additions_as_sources() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Addition), + (true, ChangeKind::Deletion), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Copy), + (0, 1), + "one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\ + deletions don't count" + ); + } + + #[test] + fn copies_count_modifications_as_sources() { + let items = [ + (false, ChangeKind::Addition), + (true, ChangeKind::Modification), + (false, ChangeKind::Modification), + ]; + assert_eq!( + estimate_involved_items(items, SourceKind::Copy), + (2, 1), + "any modifications is a valid source, emitted or not" + ); + } +} diff --git a/gix-diff/tests/Cargo.toml b/gix-diff/tests/Cargo.toml index 6d07360ae38..e63c3d5a320 100644 --- a/gix-diff/tests/Cargo.toml +++ b/gix-diff/tests/Cargo.toml @@ -17,7 +17,11 @@ path = "diff.rs" [dev-dependencies] gix-diff = { path = ".." } gix-hash = { path = "../../gix-hash" } +gix-fs = { path = "../../gix-fs" } +gix-worktree = { path = "../../gix-worktree" } gix-object = { path = "../../gix-object" } gix-odb = { path = "../../gix-odb" } +gix-filter = { path = "../../gix-filter" } gix-traverse = { path = "../../gix-traverse" } gix-testtools = { path = "../../tests/tools" } +shell-words = "1" diff --git a/gix-diff/tests/blob/mod.rs b/gix-diff/tests/blob/mod.rs index 33b38e69ef5..bb412d412cf 100644 --- a/gix-diff/tests/blob/mod.rs +++ b/gix-diff/tests/blob/mod.rs @@ -1,2 +1,2 @@ -#[test] -fn currently_there_is_no_api_surface_to_test_as_it_is_reexporting_imara_diff() {} +pub(crate) mod pipeline; +mod platform; diff --git a/gix-diff/tests/blob/pipeline.rs b/gix-diff/tests/blob/pipeline.rs new file mode 100644 index 00000000000..e7a8385c7a7 --- /dev/null +++ b/gix-diff/tests/blob/pipeline.rs @@ -0,0 +1,925 @@ +pub(crate) mod convert_to_diffable { + + use crate::util::ObjectDb; + use gix_diff::blob::pipeline::Options; + use gix_diff::blob::{pipeline, pipeline::WorktreeRoots, ResourceKind}; + use gix_filter::eol; + use gix_filter::eol::AutoCrlf; + use gix_object::bstr::ByteSlice; + use gix_object::tree::EntryKind; + use gix_worktree::stack::state::attributes; + + #[test] + fn simple() -> crate::Result { + for mode in [ + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGit, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ] { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a-content"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), a_content, "there is no transformations configured"); + + let link_name = "link"; + gix_fs::symlink::create(a_name.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + mode, + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + a_name, + "links are just files with a different mode, with its content pointing to the target" + ); + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-content"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + mode, + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), b_content, "there is no transformations configured"); + } + + Ok(()) + } + + #[test] + fn above_large_file_threshold() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: None, + new_root: Some(tmp.path().to_owned()), + }, + gix_filter::Pipeline::default(), + vec![], + gix_diff::blob::pipeline::Options { + large_file_threshold_bytes: 4, + ..default_options() + }, + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let large_content = "hello"; + std::fs::write(tmp.path().join(a_name), large_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::BlobExecutable, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 5 })); + assert_eq!(buf.len(), 0, "it should avoid querying that data in the first place"); + + // On windows, this test fails as it needs the link target to exist, and it was + // hard to make it exist with a relative path, strangely enough. + // For worktree checkouts, this works, that's all that matters for now. + if !cfg!(windows) { + let link_name = "link"; + gix_fs::symlink::create(large_content.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!( + out.data, + Some(pipeline::Data::Buffer), + "links are always read and never considered large" + ); + assert_eq!(buf.as_bstr(), large_content); + } + drop(tmp); + + let mut db = ObjectDb::default(); + let id = db.insert(large_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 5 })); + assert_eq!(buf.len(), 0, "it should avoid querying that data in the first place"); + + Ok(()) + } + + #[test] + fn non_existing() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![], + default_options(), + ); + + let null = gix_hash::Kind::Sha1.null(); + let mut buf = vec![1]; + let a_name = "a"; + assert!( + !tmp.path().join(a_name).exists(), + "precondition: worktree file doesn't exist" + ); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "link".into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none()); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + drop(tmp); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + + Ok(()) + } + + #[test] + fn worktree_filter() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let filter = gix_filter::Pipeline::new( + Default::default(), + gix_filter::pipeline::Options { + eol_config: eol::Configuration { + auto_crlf: AutoCrlf::Enabled, + ..Default::default() + }, + ..Default::default() + }, + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + filter, + vec![], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a-content\n"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + a_content, + "worktree files are assumed to be filtered already, and are verbatim" + ); + + let b_name = "b"; + let b_content = "a\r\nb"; + std::fs::write(tmp.path().join(b_name), b_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + b_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "a\nb", + "worktree files are converted back to git if the mode needs it" + ); + + // On windows, this test fails as it needs the link target to exist, and it's kind of impossible + // to test what we want to test apparently. + if !cfg!(windows) { + let link_name = "link"; + let link_content = "hello\n"; + gix_fs::symlink::create(link_content.as_ref(), &tmp.path().join(link_name))?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Link, + link_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none()); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + link_content, + "links aren't put through worktree filters, otherwise it would have its newlines replaced" + ); + } + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-content\n"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "b-content\r\n", "LF to CRLF by worktree filtering"); + + let mut db = ObjectDb::default(); + let b_content = "b\n"; + let id = db.insert(b_content); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::ToGit, + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "b\n", "no filtering was performed at all"); + + Ok(()) + } + + #[test] + fn binary_by_buffer_inspection() -> crate::Result { + let tmp = gix_testtools::tempfile::TempDir::new()?; + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh")?; + let mut attributes = gix_worktree::Stack::new( + root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(tmp.path().to_owned()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![gix_diff::blob::Driver { + name: "c".into(), + binary_to_text_command: Some("printf '\\0'; cat <".into()), + ..Default::default() + }], + default_options(), + ); + + let does_not_matter = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let a_name = "a"; + let a_content = "a\0b"; + std::fs::write(tmp.path().join(a_name), a_content.as_bytes())?; + let out = filter.convert_to_diffable( + &does_not_matter, + EntryKind::Blob, + a_name.into(), + ResourceKind::OldOrSource, + &mut |_, _| {}, + &gix_object::find::Never, + pipeline::Mode::default(), + &mut buf, + )?; + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 3 })); + assert_eq!(buf.len(), 0, "binary files aren't stored, even if we read them"); + + // LINK with null-bytes can't be created, and generally we ignore a lot of checks on links + // for good reason. Hard to test. + drop(tmp); + + let mut db = ObjectDb::default(); + let b_content = "b-co\0ntent\n"; + let id = db.insert(b_content); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + a_name.into(), + ResourceKind::NewOrDestination, + &mut |_, _| {}, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert!(out.driver_index.is_none(), "there was no driver"); + assert_eq!(out.data, Some(pipeline::Data::Binary { size: 11 })); + assert_eq!(buf.len(), 0, "buffers are cleared even if we read them"); + + let platform = attributes.at_entry("c", Some(false), &gix_object::find::Never)?; + + let id = db.insert("b"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "c".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::default(), + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "\0b", + "if binary-text-conversion is set, we don't care if it outputs zero bytes, let everything pass" + ); + + Ok(()) + } + + #[test] + fn with_driver() -> crate::Result { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh")?; + let command = "echo to-text; cat <"; + let mut attributes = gix_worktree::Stack::new( + &root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let mut filter = gix_diff::blob::Pipeline::new( + WorktreeRoots { + old_root: Some(root.clone()), + new_root: None, + }, + gix_filter::Pipeline::default(), + vec![ + gix_diff::blob::Driver { + name: "a".into(), + binary_to_text_command: Some(command.into()), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "b".into(), + is_binary: Some(true), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "c".into(), + binary_to_text_command: Some(command.into()), + is_binary: Some(true), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "d".into(), + binary_to_text_command: Some(command.into()), + ..Default::default() + }, + gix_diff::blob::Driver { + name: "missing".into(), + ..Default::default() + }, + ], + default_options(), + ); + + let mut db = ObjectDb::default(); + let null = gix_hash::Kind::Sha1.null(); + let mut buf = Vec::new(); + let platform = attributes.at_entry("a", Some(false), &gix_object::find::Never)?; + let worktree_modes = [ + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ]; + let all_modes = [ + pipeline::Mode::ToGit, + pipeline::Mode::ToWorktreeAndBinaryToText, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + ]; + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "to-text\na\n", "filter was applied"); + } + + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "a\n", "unconditionally use git according to mode"); + + let id = db.insert("a\n"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "a".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!(buf.as_bstr(), "to-text\na\n", "filter was applied"); + } + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "a".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::ToGit, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(0)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "a\n", + "no filter was applied in this mode, also when using the ODB" + ); + + let platform = attributes.at_entry("missing", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "missing".into(), /* does not actually exist */ + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Link, + "missing".into(), /* does not actually exist */ + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, None); + assert_eq!(buf.len(), 0, "always cleared"); + + buf.push(1); + let id = db.insert("link-target"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Link, + "missing".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(4), "despite missing, we get driver information"); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "link-target", + "no matter what, links always look the same." + ); + } + + let platform = attributes.at_entry("b", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "b".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(1)); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 2 }), + "binary value comes from driver, and it's always respected with worktree source" + ); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + } + + let id = db.insert("b\n"); + for mode in all_modes { + buf.push(1); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "b".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + + assert_eq!(out.driver_index, Some(1)); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 2 }), + "binary value comes from driver, and it's always respected with DB source" + ); + assert_eq!(buf.len(), 0, "it's always cleared before any potential use"); + } + + let platform = attributes.at_entry("c", Some(false), &gix_object::find::Never)?; + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "c".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(2)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nc\n", + "filter was applied, it overrides binary=true" + ); + } + + let id = db.insert("c\n"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "c".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(2)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nc\n", + "filter was applied, it overrides binary=true" + ); + } + + let platform = attributes.at_entry("unset", Some(false), &gix_object::find::Never)?; + for mode in all_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "unset".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!( + out.driver_index, None, + "no driver is associated, as `diff` is explicitly unset" + ); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 6 }), + "unset counts as binary" + ); + assert_eq!(buf.len(), 0); + } + + let id = db.insert("unset\n"); + for mode in all_modes { + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "unset".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!( + out.driver_index, None, + "no driver is associated, as `diff` is explicitly unset" + ); + assert_eq!( + out.data, + Some(pipeline::Data::Binary { size: 6 }), + "unset counts as binary" + ); + assert_eq!(buf.len(), 0); + } + + let platform = attributes.at_entry("d", Some(false), &gix_object::find::Never)?; + let id = db.insert("d-in-db"); + for mode in worktree_modes { + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "d".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(3)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nd\n", + "the worktree + text conversion was triggered for worktree source" + ); + + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "d".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + mode, + &mut buf, + )?; + assert_eq!(out.driver_index, Some(3)); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "to-text\nd-in-db", + "the worktree + text conversion was triggered for db source" + ); + } + + let platform = attributes.at_entry("e-no-attr", Some(false), &gix_object::find::Never)?; + let out = filter.convert_to_diffable( + &null, + EntryKind::Blob, + "e-no-attr".into(), + ResourceKind::OldOrSource, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &gix_object::find::Never, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + &mut buf, + )?; + assert_eq!(out.driver_index, None); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "e\n", + "no text filter, so git conversion was applied for worktree source" + ); + + let id = db.insert("e-in-db"); + let out = filter.convert_to_diffable( + &id, + EntryKind::Blob, + "e-no-attr".into(), + ResourceKind::NewOrDestination, + &mut |_, out| { + let _ = platform.matching_attributes(out); + }, + &db, + pipeline::Mode::ToGitUnlessBinaryToTextIsPresent, + &mut buf, + )?; + assert_eq!(out.driver_index, None); + assert_eq!(out.data, Some(pipeline::Data::Buffer)); + assert_eq!( + buf.as_bstr(), + "e-in-db", + "no text filter, so git conversion was applied for ODB source" + ); + + Ok(()) + } + + pub(crate) fn default_options() -> Options { + Options { + large_file_threshold_bytes: 0, + fs: gix_fs::Capabilities { + precompose_unicode: false, + ignore_case: false, + executable_bit: true, + symlink: true, + }, + } + } +} diff --git a/gix-diff/tests/blob/platform.rs b/gix-diff/tests/blob/platform.rs new file mode 100644 index 00000000000..3a1fb57a9f0 --- /dev/null +++ b/gix-diff/tests/blob/platform.rs @@ -0,0 +1,373 @@ +use crate::blob::pipeline::convert_to_diffable::default_options; +use crate::hex_to_id; +use crate::util::ObjectDb; +use gix_diff::blob::platform::prepare_diff; +use gix_diff::blob::platform::prepare_diff::Operation; +use gix_diff::blob::{pipeline, platform, Algorithm, Platform, ResourceKind}; +use gix_object::bstr::{BString, ByteSlice}; +use gix_object::tree::EntryKind; +use gix_worktree::stack::state::attributes; + +#[test] +fn resources_of_worktree_and_odb_and_check_link() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "a-content"; + let id = db.insert(a_content); + platform.set_resource( + id, + EntryKind::BlobExecutable, + "a".into(), + ResourceKind::NewOrDestination, + &db, + )?; + + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!(old.data.as_slice().expect("present").as_bstr(), "a\n"); + assert_eq!(old.driver_index, Some(0)); + assert_eq!(old.mode, EntryKind::Blob); + assert!(old.id.is_null(), "id is used verbatim"); + assert_eq!(old.rela_path, "a", "location is kept directly as provided"); + assert_eq!(new.data.as_slice().expect("present").as_bstr(), a_content); + assert_eq!(new.driver_index, Some(0)); + assert_eq!(new.mode, EntryKind::BlobExecutable); + assert_eq!(new.id, hex_to_id("4c469b6c8c4486fdc9ded9d597d8f6816a455707")); + assert_eq!(new.rela_path, "a", "location is kept directly as provided"); + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Histogram + }, + "it ends up with the default, as it's not overridden anywhere" + ); + + assert_eq!( + comparable_ext_diff(platform.prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 2, + 3 + )), + format!("{}test a 0000000000000000000000000000000000000000 100644 4c469b6c8c4486fdc9ded9d597d8f6816a455707 100755", (!cfg!(windows)).then_some("GIT_DIFF_PATH_COUNTER=3 GIT_DIFF_PATH_TOTAL=3 GIT_DIR=. ").unwrap_or_default()), + "in this case, there is no rename-to field as last argument, it's based on the resource paths being different" + ); + + platform.set_resource(id, EntryKind::Link, "a".into(), ResourceKind::NewOrDestination, &db)?; + + // Double-inserts are fine. + platform.set_resource(id, EntryKind::Link, "a".into(), ResourceKind::NewOrDestination, &db)?; + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!( + old.data.as_slice().expect("present").as_bstr(), + "a\n", + "the source is still the same" + ); + assert_eq!(old.mode, EntryKind::Blob); + assert_eq!( + new.mode, + EntryKind::Link, + "but the destination has changed as is now a link" + ); + assert_eq!( + new.data.as_slice().expect("present").as_bstr(), + a_content, + "despite the same content" + ); + assert_eq!(new.id, hex_to_id("4c469b6c8c4486fdc9ded9d597d8f6816a455707")); + assert_eq!(new.rela_path, "a"); + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Histogram + }, + "it would still diff, despite this being blob-with-link now. But that's fine." + ); + + assert_eq!( + comparable_ext_diff(platform.prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 0, + 1 + )), + format!("{}test a 0000000000000000000000000000000000000000 100644 4c469b6c8c4486fdc9ded9d597d8f6816a455707 120000", (!cfg!(windows)).then_some(r#"GIT_DIFF_PATH_COUNTER=1 GIT_DIFF_PATH_TOTAL=1 GIT_DIR=. "#).unwrap_or_default()), + "Also obvious that symlinks are definitely special, but it's what git does as well" + ); + + platform.clear_resource_cache(); + assert_eq!( + platform.resources(), + None, + "clearing the cache voids resources and one has to set it up again" + ); + + Ok(()) +} + +fn comparable_ext_diff( + cmd: Result< + gix_diff::blob::platform::prepare_diff_command::Command, + gix_diff::blob::platform::prepare_diff_command::Error, + >, +) -> String { + let cmd = cmd.expect("no error"); + let tokens = shell_words::split(&format!("{:?}", *cmd)).expect("parses fine"); + let ofs = if cfg!(windows) { 3 } else { 0 }; // Windows doesn't show env vars + tokens + .into_iter() + .enumerate() + .filter_map(|(idx, s)| { + (idx != (5 - ofs) && idx != (8 - ofs)) + .then_some(s) + .or_else(|| Some("".into())) + }) + .collect::>() + .join(" ") +} + +#[test] +fn diff_binary() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + is_binary: Some(true), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert!( + matches!(out.operation, Operation::SourceOrDestinationIsBinary), + "one binary resource is enough to skip diffing entirely" + ); + + match platform.prepare_diff_command("test".into(), Default::default(), 0, 1) { + Err(err) => assert_eq!( + err.to_string(), + "Binary resources can't be diffed with an external command (as we don't have the data anymore)" + ), + Ok(_) => unreachable!("must error"), + } + + Ok(()) +} + +#[test] +fn diff_performed_despite_external_command() -> crate::Result { + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + command: Some("something-to-be-ignored".into()), + algorithm: Some(Algorithm::Myers), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert!( + matches!( + out.operation, + Operation::InternalDiff { + algorithm: Algorithm::Myers + } + ), + "by default, we prepare for internal diffs, unless external commands are enabled.\ + The caller could still obtain the command from here if they wanted to, as well.\ + Also, the algorithm is overridden by the source." + ); + Ok(()) +} + +#[test] +fn diff_skipped_due_to_external_command_and_enabled_option() -> crate::Result { + let command: BString = "something-to-be-ignored".into(); + let mut platform = new_platform( + Some(gix_diff::blob::Driver { + name: "a".into(), + command: Some(command.clone()), + algorithm: Some(Algorithm::Myers), + ..Default::default() + }), + gix_diff::blob::pipeline::Mode::default(), + ); + platform.options.skip_internal_diff_if_external_is_configured = true; + + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "a".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + let mut db = ObjectDb::default(); + let a_content = "b"; + let id = db.insert(a_content); + platform.set_resource(id, EntryKind::Blob, "b".into(), ResourceKind::NewOrDestination, &db)?; + + let out = platform.prepare_diff()?; + assert_eq!( + out.operation, + Operation::ExternalCommand { + command: command.as_ref() + }, + "now we provide all information that is needed to run the overridden diff command" + ); + Ok(()) +} + +#[test] +fn source_and_destination_do_not_exist() -> crate::Result { + let mut platform = new_platform(None, pipeline::Mode::default()); + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::Blob, + "missing".into(), + ResourceKind::OldOrSource, + &gix_object::find::Never, + )?; + + platform.set_resource( + gix_hash::Kind::Sha1.null(), + EntryKind::BlobExecutable, + "a".into(), + ResourceKind::NewOrDestination, + &gix_object::find::Never, + )?; + + let (old, new) = platform.resources().expect("previously set source and destination"); + assert_eq!(old.data, platform::resource::Data::Missing); + assert_eq!(old.driver_index, None); + assert_eq!(old.mode, EntryKind::Blob); + assert_eq!(new.data, platform::resource::Data::Missing); + assert_eq!(new.driver_index, None); + assert_eq!(new.mode, EntryKind::BlobExecutable); + + assert!(matches!( + platform.prepare_diff(), + Err(prepare_diff::Error::SourceAndDestinationRemoved) + )); + + assert_eq!( + format!( + "{:?}", + *platform + .prepare_diff_command( + "test".into(), + gix_diff::command::Context { + git_dir: Some(".".into()), + ..Default::default() + }, + 0, + 1 + ) + .expect("resources set") + ), + format!( + r#"{}"test" "missing" "/dev/null" "." "." "/dev/null" "." "." "a""#, + (!cfg!(windows)) + .then_some(r#"GIT_DIFF_PATH_COUNTER="1" GIT_DIFF_PATH_TOTAL="1" GIT_DIR="." "#) + .unwrap_or_default() + ) + ); + Ok(()) +} + +#[test] +fn invalid_resource_types() { + let mut platform = new_platform(None, pipeline::Mode::default()); + for (mode, name) in [(EntryKind::Commit, "Commit"), (EntryKind::Tree, "Tree")] { + assert_eq!( + platform + .set_resource( + gix_hash::Kind::Sha1.null(), + mode, + "a".into(), + ResourceKind::NewOrDestination, + &gix_object::find::Never, + ) + .unwrap_err() + .to_string(), + format!("Can only diff blobs and links, not {name}") + ); + } +} + +fn new_platform( + drivers: impl IntoIterator, + mode: gix_diff::blob::pipeline::Mode, +) -> Platform { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh").expect("valid fixture"); + let attributes = gix_worktree::Stack::new( + &root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + attributes::Source::WorktreeThenIdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let filter = gix_diff::blob::Pipeline::new( + pipeline::WorktreeRoots { + old_root: Some(root.clone()), + new_root: None, + }, + gix_filter::Pipeline::default(), + drivers.into_iter().collect(), + default_options(), + ); + Platform::new(Default::default(), filter, mode, attributes) +} diff --git a/gix-diff/tests/diff.rs b/gix-diff/tests/diff.rs index 3ba4af8e15f..1a230f2575f 100644 --- a/gix-diff/tests/diff.rs +++ b/gix-diff/tests/diff.rs @@ -5,4 +5,54 @@ fn hex_to_id(hex: &str) -> gix_hash::ObjectId { } mod blob; +mod rewrites; mod tree; + +mod util { + use gix_hash::oid; + use gix_object::bstr::BString; + use gix_object::find::Error; + use std::collections::HashMap; + + #[derive(Default)] + pub struct ObjectDb { + data_by_id: HashMap, + } + + impl gix_object::FindHeader for ObjectDb { + fn try_header(&self, id: &oid) -> Result, Error> { + match self.data_by_id.get(&id.to_owned()) { + Some(data) => Ok(Some(gix_object::Header { + kind: gix_object::Kind::Blob, + size: data.len() as u64, + })), + None => Ok(None), + } + } + } + + impl gix_object::Find for ObjectDb { + fn try_find<'a>(&self, id: &oid, buffer: &'a mut Vec) -> Result>, Error> { + match self.data_by_id.get(&id.to_owned()) { + Some(data) => { + buffer.resize(data.len(), 0); + buffer.copy_from_slice(data); + Ok(Some(gix_object::Data { + kind: gix_object::Kind::Blob, + data: buffer.as_slice(), + })) + } + None => Ok(None), + } + } + } + + impl ObjectDb { + /// Insert `data` and return its hash. That can be used to find it again. + pub fn insert(&mut self, data: &str) -> gix_hash::ObjectId { + let id = gix_object::compute_hash(gix_hash::Kind::Sha1, gix_object::Kind::Blob, data.as_bytes()); + self.data_by_id.insert(id, data.into()); + id + } + } +} diff --git a/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz b/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz new file mode 100644 index 00000000000..515ebec8e50 --- /dev/null +++ b/gix-diff/tests/fixtures/generated-archives/make_blob_repo.tar.xz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ada04b2d2df9b375d6d80499fee7eb3ff7ef5d03e9e3784376645fe533aea7 +size 10744 diff --git a/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz b/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz index a94a01f9e26..12b1c3e932f 100644 --- a/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz +++ b/gix-diff/tests/fixtures/generated-archives/make_diff_repo.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c37d85f4467ebe265761ffb989c0d924582e9fa5f660e1b3ad0365aca4599e84 -size 18468 +oid sha256:78430a7c39ab1337d311e3beb8988ffa1a131d40630b4e4662104a194823b596 +size 18440 diff --git a/gix-diff/tests/fixtures/make_blob_repo.sh b/gix-diff/tests/fixtures/make_blob_repo.sh new file mode 100644 index 00000000000..ac0079a68af --- /dev/null +++ b/gix-diff/tests/fixtures/make_blob_repo.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -eu -o pipefail + +git init -q + +echo a > a +echo b > b +echo c > c +echo d > d +echo e > e-no-attr +echo unset > unset + +cat <.gitattributes +a diff=a +b diff=b +c diff=c +d diff=d +missing diff=missing +unset -diff +EOF + +git add . && git commit -m "init" diff --git a/gix-diff/tests/fixtures/make_diff_repo.sh b/gix-diff/tests/fixtures/make_diff_repo.sh index 5b4859e9393..7ae9ccab22a 100755 --- a/gix-diff/tests/fixtures/make_diff_repo.sh +++ b/gix-diff/tests/fixtures/make_diff_repo.sh @@ -1,5 +1,4 @@ #!/bin/bash -set -x set -eu -o pipefail diff --git a/gix-diff/tests/rewrites/mod.rs b/gix-diff/tests/rewrites/mod.rs new file mode 100644 index 00000000000..be44ef39eda --- /dev/null +++ b/gix-diff/tests/rewrites/mod.rs @@ -0,0 +1,56 @@ +use gix_diff::rewrites::tracker::ChangeKind; +use gix_hash::{oid, ObjectId}; +use gix_object::tree::{EntryKind, EntryMode}; + +mod tracker; + +#[derive(Clone)] +pub struct Change { + id: ObjectId, + kind: ChangeKind, + mode: EntryMode, +} + +impl gix_diff::rewrites::tracker::Change for Change { + fn id(&self) -> &oid { + &self.id + } + + fn kind(&self) -> ChangeKind { + self.kind + } + + fn entry_mode(&self) -> EntryMode { + self.mode + } + + fn id_and_entry_mode(&self) -> (&oid, EntryMode) { + (&self.id, self.mode) + } +} + +const NULL_ID: gix_hash::ObjectId = gix_hash::Kind::Sha1.null(); + +impl Change { + fn modification() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Modification, + mode: EntryKind::Blob.into(), + } + } + fn deletion() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Deletion, + mode: EntryKind::Blob.into(), + } + } + fn addition() -> Self { + Change { + id: NULL_ID, + kind: ChangeKind::Addition, + mode: EntryKind::Blob.into(), + } + } +} diff --git a/gix-diff/tests/rewrites/tracker.rs b/gix-diff/tests/rewrites/tracker.rs new file mode 100644 index 00000000000..ce0605348fb --- /dev/null +++ b/gix-diff/tests/rewrites/tracker.rs @@ -0,0 +1,601 @@ +use crate::hex_to_id; +use crate::rewrites::{Change, NULL_ID}; +use gix_diff::blob::DiffLineStats; +use gix_diff::rewrites::tracker::visit::{Source, SourceKind}; +use gix_diff::rewrites::tracker::ChangeKind; +use gix_diff::rewrites::{Copies, CopySource}; +use gix_diff::tree::visit::Action; +use gix_diff::{rewrites, Rewrites}; +use gix_object::tree::EntryKind; + +#[test] +fn rename_by_id() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: None, + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + assert!( + track.try_push_change(Change::modification(), "a".into()).is_some(), + "modifications play no role in rename tracking" + ); + assert!( + track.try_push_change(Change::deletion(), "b".into()).is_none(), + "recorded for later matching" + ); + assert!( + track.try_push_change(Change::addition(), "c".into()).is_none(), + "recorded for later matching" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called, "only one rename pair is expected"); + called = true; + assert_eq!( + src.unwrap(), + Source { + entry_mode: EntryKind::Blob.into(), + id: NULL_ID, + kind: SourceKind::Rename, + location: "b".into(), + diff: None, + } + ); + assert_eq!(dst.location, "c"); + Action::Continue + }); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_similarity_reports_limit_if_encountered() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: Some(0.5), + }), + percentage: None, + limit: 1, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a\n"), + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + (Change::modification(), "d", "ab"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + assert!(src.is_none()); + match calls { + 0 => assert_eq!(dst.location, "a"), + 1 => assert_eq!(dst.location, "a-cpy-1"), + 2 => assert_eq!(dst.location, "a-cpy-2"), + 3 => assert_eq!(dst.location, "d"), + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks_skipped_for_copy_tracking_due_to_limit: 4, + ..Default::default() + }, + "no similarity check was performed at all - all or nothing" + ); + Ok(()) +} + +#[test] +fn copy_by_id() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: None, + }), + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a"), + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + (Change::modification(), "d", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e"), + kind: SourceKind::Copy, + location: "a".into(), + diff: None, + }; + match calls { + 0 => { + assert_eq!(src.unwrap(), source_a); + assert_eq!( + dst.location, "a-cpy-1", + "it just finds the first possible match in order, ignoring other candidates" + ); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "copy-sources can be used multiple times"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_id_search_in_all_sources() -> crate::Result { + // Limits are only applied when doing rewrite-checks + for limit in [0, 1] { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFilesAndAllSources, + percentage: None, + }), + percentage: None, + limit, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::addition(), "a-cpy-1", "a"), + (Change::addition(), "a-cpy-2", "a"), + ], + ); + + let mut calls = 0; + let content_id = hex_to_id("2e65efe2a145dda7ee51d1741299f848e5bf752e"); + let out = util::assert_emit_with_objects_and_sources( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: content_id, + kind: SourceKind::Copy, + location: "a-src".into(), + diff: None, + }; + match calls { + 0 => { + assert_eq!(src.unwrap(), source_a); + assert_eq!( + dst.location, "a-cpy-1", + "it just finds the first possible match in order, ignoring other candidates" + ); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "copy-sources can be used multiple times"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + [( + { + let mut c = Change::modification(); + c.id = content_id; + c + }, + "a-src", + )], + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + } + Ok(()) +} + +#[test] +fn copy_by_50_percent_similarity() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: Some(0.5), + }), + percentage: None, + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a\n"), + (Change::addition(), "a-cpy-1", "a\nb"), + (Change::addition(), "a-cpy-2", "a\nc"), + (Change::modification(), "d", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + let source_a = Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("78981922613b2afb6025042ff6bd878ac1994e85"), + kind: SourceKind::Copy, + location: "a".into(), + diff: Some(DiffLineStats { + removals: 0, + insertions: 1, + before: 1, + after: 2, + similarity: 0.6666667, + }), + }; + match calls { + 0 => { + assert_eq!( + src.unwrap(), + source_a, + "it finds the first possible source, no candidates" + ); + assert_eq!(dst.location, "a-cpy-1"); + } + 1 => { + assert_eq!(src.unwrap(), source_a, "the same source can be reused as well"); + assert_eq!(dst.location, "a-cpy-2"); + } + 2 => { + assert!(src.is_none()); + assert_eq!(dst.location, "d"); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks: 4, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + Ok(()) +} + +#[test] +fn copy_by_id_in_additions_only() -> crate::Result { + let rewrites = Rewrites { + copies: Some(Copies { + source: CopySource::FromSetOfModifiedFiles, + percentage: None, + }), + percentage: None, + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::modification(), "a", "a"), + (Change::modification(), "a-cpy-1", "a"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + match calls { + 0 => { + assert!(src.is_none()); + assert_eq!(dst.location, "a"); + } + 1 => { + assert!(src.is_none()); + assert_eq!( + dst.location, "a-cpy-1", + "copy detection is only done for additions, not within modifications" + ); + } + _ => panic!("too many emissions"), + } + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + ..Default::default() + }, + "no similarity check was performed, it was all matched by id" + ); + Ok(()) +} + +#[test] +fn rename_by_similarity_reports_limit_if_encountered() -> crate::Result { + let rewrites = Rewrites { + copies: None, + percentage: Some(0.5), + limit: 1, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::deletion(), "a", "first\nsecond\n"), + (Change::addition(), "b", "firt\nsecond\n"), + (Change::addition(), "c", "second\nunrelated\n"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + assert!(src.is_none()); + match calls { + 0 => assert_eq!(dst.location, "a"), + 1 => assert_eq!(dst.location, "b"), + 2 => assert_eq!(dst.location, "c"), + _ => panic!("too many elements emitted"), + }; + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks_skipped_for_rename_tracking_due_to_limit: 2, + ..Default::default() + }, + "no similarity check was performed at all - all or nothing" + ); + Ok(()) +} + +#[test] +fn rename_by_50_percent_similarity() -> crate::Result { + let rewrites = Rewrites { + copies: None, + percentage: Some(0.5), + limit: 0, + }; + let mut track = util::new_tracker(rewrites); + let odb = util::add_retained_blobs( + &mut track, + [ + (Change::deletion(), "a", "first\nsecond\n"), + (Change::addition(), "b", "firt\nsecond\n"), + (Change::addition(), "c", "second\nunrelated\n"), + ], + ); + + let mut calls = 0; + let out = util::assert_emit_with_objects( + &mut track, + |dst, src| { + match calls { + 0 => { + assert_eq!( + src.unwrap(), + Source { + entry_mode: EntryKind::Blob.into(), + id: hex_to_id("66a52ee7a1d803dc57859c3e95ac9dcdc87c0164"), + kind: SourceKind::Rename, + location: "a".into(), + diff: Some(DiffLineStats { + removals: 1, + insertions: 1, + before: 2, + after: 2, + similarity: 0.53846157 + }) + } + ); + assert_eq!(dst.location, "b"); + } + 1 => { + assert!(src.is_none(), "pair already found"); + assert_eq!(dst.location, "c"); + } + _ => panic!("too many elements emitted"), + }; + calls += 1; + Action::Continue + }, + odb, + ); + assert_eq!( + out, + rewrites::Outcome { + options: rewrites, + num_similarity_checks: 1, + ..Default::default() + }, + "the first attempt already yields the one pair, so it doesn't participate anymore\ + - we don't have best candidates yet, thus only one check" + ); + Ok(()) +} + +#[test] +fn remove_only() -> crate::Result { + let mut track = util::new_tracker(Default::default()); + assert!( + track.try_push_change(Change::deletion(), "a".into()).is_none(), + "recorded for later matching" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called); + called = true; + assert_eq!(src, None, "there is just a single deletion, no pair"); + assert_eq!(dst.location, "a"); + assert_eq!(dst.change.kind, ChangeKind::Deletion); + Action::Continue + }); + assert_eq!(out, Default::default()); + Ok(()) +} + +#[test] +fn add_only() -> crate::Result { + let mut track = util::new_tracker(Default::default()); + assert!( + track.try_push_change(Change::addition(), "a".into()).is_none(), + "recorded for later matching - note that this is the starting point of a matching run" + ); + let mut called = false; + let out = util::assert_emit(&mut track, |dst, src| { + assert!(!called); + called = true; + assert_eq!(src, None, "there is just a single deletion, no pair"); + assert_eq!(dst.location, "a"); + assert_eq!(dst.change.kind, ChangeKind::Addition); + Action::Continue + }); + assert_eq!(out, Default::default()); + Ok(()) +} + +mod util { + use crate::rewrites::Change; + use crate::util::ObjectDb; + use gix_diff::rewrites::tracker::visit::{Destination, Source}; + use gix_diff::tree::visit::Action; + use gix_diff::{rewrites, Rewrites}; + + /// Add `blobs` `(change, location, data)` to tracker that will all be retained. Note that the `id` of the respective change will be adjusted to match. + pub fn add_retained_blobs<'a>( + tracker: &mut rewrites::Tracker, + blobs: impl IntoIterator, + ) -> ObjectDb { + let mut db = ObjectDb::default(); + for (mut change, location, data) in blobs { + change.id = db.insert(data); + assert!( + tracker.try_push_change(change, location.into()).is_none(), + "input changes must be tracked" + ); + } + db + } + + pub fn assert_emit( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + ) -> rewrites::Outcome { + assert_emit_with_objects(tracker, cb, gix_object::find::Never) + } + + pub fn assert_emit_with_objects( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + objects: impl gix_object::Find, + ) -> rewrites::Outcome { + assert_emit_with_objects_and_sources(tracker, cb, objects, None) + } + + pub fn assert_emit_with_objects_and_sources<'a>( + tracker: &mut rewrites::Tracker, + cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, + objects: impl gix_object::Find, + sources: impl IntoIterator, + ) -> rewrites::Outcome { + let mut sources: Vec<_> = sources.into_iter().collect(); + tracker + .emit( + cb, + &objects, + &mut gix_filter::Pipeline::new(Default::default(), Default::default()), + |cb| -> Result<(), std::io::Error> { + let sources = std::mem::take(&mut sources); + if sources.is_empty() { + panic!("Should not access more sources unless these are specified"); + } + for (src, location) in sources { + cb(src, location.into()); + } + Ok(()) + }, + ) + .expect("emit doesn't fail") + } + + pub fn new_tracker(rewrites: Rewrites) -> rewrites::Tracker { + rewrites::Tracker::new(rewrites, gix_diff::blob::Algorithm::Myers) + } +} From feca5d0c1f56321d8fe15d59003195bf2f276c35 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 28 Nov 2023 10:25:54 +0100 Subject: [PATCH 15/20] feat!: rewrite-tracker now uses blob diff-platform for correct and optimized diffing. Correctness is improved as all necessary transformation are now performed. Performance is improved by avoiding duplicate work by caching transformed diffable data for later reuse. --- gix-diff/src/blob/platform.rs | 2 +- gix-diff/src/rewrites/mod.rs | 6 - gix-diff/src/rewrites/tracker.rs | 175 ++++++++++++++++++----------- gix-diff/tests/rewrites/tracker.rs | 36 +++++- 4 files changed, 143 insertions(+), 76 deletions(-) diff --git a/gix-diff/src/blob/platform.rs b/gix-diff/src/blob/platform.rs index 29615cd55fe..143317ba04e 100644 --- a/gix-diff/src/blob/platform.rs +++ b/gix-diff/src/blob/platform.rs @@ -351,7 +351,7 @@ impl Platform { mode: gix_object::tree::EntryKind, rela_path: &BStr, kind: ResourceKind, - objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable + objects: &impl gix_object::FindObjectOrHeader, // TODO: make this `dyn` once https://github.com/rust-lang/rust/issues/65991 is stable, then also make tracker.rs `objects` dyn ) -> Result<(), set_resource::Error> { let res = self.set_resource_inner(id, mode, rela_path, kind, objects); if res.is_err() { diff --git a/gix-diff/src/rewrites/mod.rs b/gix-diff/src/rewrites/mod.rs index 8af13165f6f..08d6f2cce53 100644 --- a/gix-diff/src/rewrites/mod.rs +++ b/gix-diff/src/rewrites/mod.rs @@ -10,14 +10,8 @@ pub struct Tracker { items: Vec>, /// A place to store all paths in to reduce amount of allocations. path_backing: Vec, - /// A buffer for use when fetching objects for similarity tests. - buf1: Vec, - /// Another buffer for use when fetching objects for similarity tests. - buf2: Vec, /// How to track copies and/or rewrites. rewrites: Rewrites, - /// The diff algorithm to use when checking for similarity. - diff_algo: crate::blob::Algorithm, } /// Determine in which set of files to search for copies. diff --git a/gix-diff/src/rewrites/tracker.rs b/gix-diff/src/rewrites/tracker.rs index da8a7bb1149..fca315a253e 100644 --- a/gix-diff/src/rewrites/tracker.rs +++ b/gix-diff/src/rewrites/tracker.rs @@ -8,11 +8,11 @@ use std::ops::Range; use gix_object::tree::{EntryKind, EntryMode}; -use crate::blob::DiffLineStats; +use crate::blob::platform::prepare_diff::Operation; +use crate::blob::{DiffLineStats, ResourceKind}; use crate::rewrites::{CopySource, Outcome}; use crate::{rewrites::Tracker, Rewrites}; use bstr::BStr; -use gix_object::FindExt; /// The kind of a change. #[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)] @@ -123,21 +123,21 @@ pub mod emit { FindExistingBlob(#[from] gix_object::find::existing_object::Error), #[error("Could not obtain exhaustive item set to use as possible sources for copy detection")] GetItemsForExhaustiveCopyDetection(#[source] Box), + #[error(transparent)] + SetResource(#[from] crate::blob::platform::set_resource::Error), + #[error(transparent)] + PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error), } } /// Lifecycle impl Tracker { - /// Create a new instance with `rewrites` configuration, and the `diff_algo` to use when performing - /// similarity checking. - pub fn new(rewrites: Rewrites, diff_algo: crate::blob::Algorithm) -> Self { + /// Create a new instance with `rewrites` configuration. + pub fn new(rewrites: Rewrites) -> Self { Tracker { items: vec![], path_backing: vec![], - buf1: Vec::new(), - buf2: Vec::new(), rewrites, - diff_algo, } } } @@ -177,10 +177,14 @@ impl Tracker { /// /// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database. /// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not* - /// be enabled as caching is implemented internally, after all, the blob that's actually diffed is going through conversion steps. + /// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going + /// through conversion steps. /// - /// Use `worktree_filter` to obtain working-tree versions of files present on disk before diffing to see if rewrites happened, - /// with text-conversions being applied afterwards. + /// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls + /// the diff-algorithm (provided no user-algorithm is set). + /// Note that we control a few options of `diff_cache` to assure it will ignore external commands. + /// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide + /// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions. /// /// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source /// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push` @@ -188,14 +192,16 @@ impl Tracker { pub fn emit( &mut self, mut cb: impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, - objects: &dyn gix_object::Find, - _worktree_filter: &mut gix_filter::Pipeline, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, mut push_source_tree: PushSourceTreeFn, ) -> Result where PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>, E: std::error::Error + Send + Sync + 'static, { + diff_cache.options.skip_internal_diff_if_external_is_configured = false; + fn by_id_and_location(a: &Item, b: &Item) -> std::cmp::Ordering { a.change .id() @@ -213,11 +219,19 @@ impl Tracker { &mut cb, self.rewrites.percentage, &mut out, + diff_cache, objects, )?; if let Some(copies) = self.rewrites.copies { - self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?; + self.match_pairs_of_kind( + visit::SourceKind::Copy, + &mut cb, + copies.percentage, + &mut out, + diff_cache, + objects, + )?; match copies.source { CopySource::FromSetOfModifiedFiles => {} @@ -233,7 +247,14 @@ impl Tracker { .map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?; self.items.sort_by(by_id_and_location); - self.match_pairs_of_kind(visit::SourceKind::Copy, &mut cb, copies.percentage, &mut out, objects)?; + self.match_pairs_of_kind( + visit::SourceKind::Copy, + &mut cb, + copies.percentage, + &mut out, + diff_cache, + objects, + )?; } } } @@ -263,11 +284,14 @@ impl Tracker { cb: &mut impl FnMut(visit::Destination<'_, T>, Option>) -> crate::tree::visit::Action, percentage: Option, out: &mut Outcome, - objects: &dyn gix_object::Find, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, ) -> Result<(), emit::Error> { // we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively. let needs_second_pass = !needs_exact_match(percentage); - if self.match_pairs(cb, None /* by identity */, kind, out, objects)? == crate::tree::visit::Action::Cancel { + if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)? + == crate::tree::visit::Action::Cancel + { return Ok(()); } if needs_second_pass { @@ -292,7 +316,7 @@ impl Tracker { } }; if !is_limited { - self.match_pairs(cb, percentage, kind, out, objects)?; + self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?; } } Ok(()) @@ -304,9 +328,9 @@ impl Tracker { percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, - objects: &dyn gix_object::Find, + diff_cache: &mut crate::blob::Platform, + objects: &impl gix_object::FindObjectOrHeader, ) -> Result { - // TODO(perf): reuse object data and interner state and interned tokens, make these available to `find_match()` let mut dest_ofs = 0; while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| { (!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item)) @@ -317,12 +341,12 @@ impl Tracker { &self.items, dest, dest_idx, - percentage.map(|p| (p, self.diff_algo)), + percentage, kind, stats, objects, - &mut self.buf1, - &mut self.buf2, + diff_cache, + &self.path_backing, )? .map(|(src_idx, src, diff)| { let (id, entry_mode) = src.change.id_and_entry_mode(); @@ -409,15 +433,15 @@ fn find_match<'a, T: Change>( items: &'a [Item], item: &Item, item_idx: usize, - percentage: Option<(f32, crate::blob::Algorithm)>, + percentage: Option, kind: visit::SourceKind, stats: &mut Outcome, - objects: &dyn gix_object::Find, - buf1: &mut Vec, - buf2: &mut Vec, + objects: &impl gix_object::FindObjectOrHeader, + diff_cache: &mut crate::blob::Platform, + path_backing: &[u8], ) -> Result>, emit::Error> { let (item_id, item_mode) = item.change.id_and_entry_mode(); - if needs_exact_match(percentage.map(|t| t.0)) || item_mode.is_link() { + if needs_exact_match(percentage) || item_mode.is_link() { let first_idx = items.partition_point(|a| a.change.id() < item_id); let range = match items.get(first_idx..).map(|items| { let end = items @@ -440,55 +464,76 @@ fn find_match<'a, T: Change>( return Ok(Some(src)); } } else { - let mut new = None; - let (percentage, algo) = percentage.expect("it's set to something below 1.0 and we assured this"); + let mut has_new = false; + let percentage = percentage.expect("it's set to something below 1.0 and we assured this"); debug_assert_eq!( item.change.entry_mode().kind(), EntryKind::Blob, "symlinks are matched exactly, and trees aren't used here" ); + for (can_idx, src) in items .iter() .enumerate() .filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)) { - let new = match &new { - Some(new) => new, - None => { - new = objects.find_blob(item_id, buf1)?.into(); - new.as_ref().expect("just set") - } - }; - let old = objects.find_blob(src.change.id(), buf2)?; - // TODO: make sure we get attribute handling/worktree conversion and binary skips and filters right here. - let tokens = crate::blob::intern::InternedInput::new( - crate::blob::sources::byte_lines_with_terminator(old.data), - crate::blob::sources::byte_lines_with_terminator(new.data), - ); - let counts = crate::blob::diff( - algo, - &tokens, - crate::blob::sink::Counter::new(diff::Statistics { - removed_bytes: 0, - input: &tokens, - }), - ); - let similarity = (old.data.len() - counts.wrapped) as f32 / old.data.len().max(new.data.len()) as f32; + if !has_new { + diff_cache.set_resource( + item_id.to_owned(), + item_mode.kind(), + item.location(path_backing), + ResourceKind::NewOrDestination, + objects, + )?; + has_new = true; + } + let (src_id, src_mode) = src.change.id_and_entry_mode(); + diff_cache.set_resource( + src_id.to_owned(), + src_mode.kind(), + src.location(path_backing), + ResourceKind::OldOrSource, + objects, + )?; + let prep = diff_cache.prepare_diff()?; stats.num_similarity_checks += 1; - if similarity >= percentage { - return Ok(Some(( - can_idx, - src, - DiffLineStats { - removals: counts.removals, - insertions: counts.insertions, - before: tokens.before.len().try_into().expect("interner handles only u32"), - after: tokens.after.len().try_into().expect("interner handles only u32"), - similarity, + match prep.operation { + Operation::InternalDiff { algorithm } => { + let tokens = + crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source()); + let counts = crate::blob::diff( + algorithm, + &tokens, + crate::blob::sink::Counter::new(diff::Statistics { + removed_bytes: 0, + input: &tokens, + }), + ); + let old_data_len = prep.old.data.as_slice().unwrap_or_default().len(); + let new_data_len = prep.new.data.as_slice().unwrap_or_default().len(); + let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32; + if similarity >= percentage { + return Ok(Some(( + can_idx, + src, + DiffLineStats { + removals: counts.removals, + insertions: counts.insertions, + before: tokens.before.len().try_into().expect("interner handles only u32"), + after: tokens.after.len().try_into().expect("interner handles only u32"), + similarity, + } + .into(), + ))); } - .into(), - ))); - } + } + Operation::ExternalCommand { .. } => { + unreachable!("we have disabled this possibility with an option") + } + Operation::SourceOrDestinationIsBinary => { + // TODO: figure out if git does more here + } + }; } } Ok(None) diff --git a/gix-diff/tests/rewrites/tracker.rs b/gix-diff/tests/rewrites/tracker.rs index ce0605348fb..a50af02f277 100644 --- a/gix-diff/tests/rewrites/tracker.rs +++ b/gix-diff/tests/rewrites/tracker.rs @@ -564,7 +564,7 @@ mod util { pub fn assert_emit_with_objects( tracker: &mut rewrites::Tracker, cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, - objects: impl gix_object::Find, + objects: impl gix_object::FindObjectOrHeader, ) -> rewrites::Outcome { assert_emit_with_objects_and_sources(tracker, cb, objects, None) } @@ -572,15 +572,15 @@ mod util { pub fn assert_emit_with_objects_and_sources<'a>( tracker: &mut rewrites::Tracker, cb: impl FnMut(Destination<'_, Change>, Option>) -> Action, - objects: impl gix_object::Find, + objects: impl gix_object::FindObjectOrHeader, sources: impl IntoIterator, ) -> rewrites::Outcome { let mut sources: Vec<_> = sources.into_iter().collect(); tracker .emit( cb, + &mut new_platform_no_worktree(), &objects, - &mut gix_filter::Pipeline::new(Default::default(), Default::default()), |cb| -> Result<(), std::io::Error> { let sources = std::mem::take(&mut sources); if sources.is_empty() { @@ -596,6 +596,34 @@ mod util { } pub fn new_tracker(rewrites: Rewrites) -> rewrites::Tracker { - rewrites::Tracker::new(rewrites, gix_diff::blob::Algorithm::Myers) + rewrites::Tracker::new(rewrites) + } + + fn new_platform_no_worktree() -> gix_diff::blob::Platform { + let root = gix_testtools::scripted_fixture_read_only_standalone("make_blob_repo.sh").expect("valid fixture"); + let attributes = gix_worktree::Stack::new( + root, + gix_worktree::stack::State::AttributesStack(gix_worktree::stack::state::Attributes::new( + Default::default(), + None, + gix_worktree::stack::state::attributes::Source::IdMapping, + Default::default(), + )), + gix_worktree::glob::pattern::Case::Sensitive, + Vec::new(), + Vec::new(), + ); + let filter = gix_diff::blob::Pipeline::new( + Default::default(), + gix_filter::Pipeline::default(), + Vec::new(), + Default::default(), + ); + gix_diff::blob::Platform::new( + Default::default(), + filter, + gix_diff::blob::pipeline::Mode::ToGit, + attributes, + ) } } From 4aea9b097fb08e504cdfc4a7c3b7511a308dc074 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 28 Nov 2023 16:15:58 +0100 Subject: [PATCH 16/20] feat: add the`diff::resource_cache()` low-level utility for rapid in-memory diffing of combinations of resources. We also add the `object::tree::diff::Platform::for_each_to_obtain_tree_with_cache()` to pass a resource-cache for re-use between multiple invocation for significant savings. --- gix/src/config/cache/access.rs | 98 +++++++++++++++++-- gix/src/config/mod.rs | 30 ++++++ gix/src/config/tree/sections/core.rs | 4 + gix/src/config/tree/sections/diff.rs | 68 ++++++++++++- gix/src/diff.rs | 64 +++++++++++- gix/src/object/tree/diff/for_each.rs | 69 +++++++++++-- gix/tests/config/tree.rs | 31 ++++++ gix/tests/diff/mod.rs | 46 +++++++++ .../generated-archives/make_diff_repo.tar.xz | 4 +- gix/tests/fixtures/make_diff_repo.sh | 18 ++++ gix/tests/gix.rs | 2 + src/plumbing/progress.rs | 4 - 12 files changed, 417 insertions(+), 21 deletions(-) create mode 100644 gix/tests/diff/mod.rs diff --git a/gix/src/config/cache/access.rs b/gix/src/config/cache/access.rs index 5f9f71887fa..feeb241ba30 100644 --- a/gix/src/config/cache/access.rs +++ b/gix/src/config/cache/access.rs @@ -39,6 +39,97 @@ impl Cache { .copied() } + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_drivers(&self) -> Result, config::diff::drivers::Error> { + use crate::config::cache::util::ApplyLeniencyDefault; + let mut out = Vec::::new(); + for section in self + .resolved + .sections_by_name("diff") + .into_iter() + .flatten() + .filter(|s| (self.filter_config_section)(s.meta())) + { + let Some(name) = section.header().subsection_name().filter(|n| !n.is_empty()) else { + continue; + }; + + let driver = match out.iter_mut().find(|d| d.name == name) { + Some(existing) => existing, + None => { + out.push(gix_diff::blob::Driver { + name: name.into(), + ..Default::default() + }); + out.last_mut().expect("just pushed") + } + }; + + if let Some(binary) = section.value_implicit("binary") { + driver.is_binary = config::tree::Diff::DRIVER_BINARY + .try_into_binary(binary) + .with_leniency(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "binary", + source: Box::new(err), + })?; + } + if let Some(command) = section.value(config::tree::Diff::DRIVER_COMMAND.name) { + driver.command = command.into_owned().into(); + } + if let Some(textconv) = section.value(config::tree::Diff::DRIVER_TEXTCONV.name) { + driver.binary_to_text_command = textconv.into_owned().into(); + } + if let Some(algorithm) = section.value("algorithm") { + driver.algorithm = config::tree::Diff::DRIVER_ALGORITHM + .try_into_algorithm(algorithm) + .or_else(|err| match err { + config::diff::algorithm::Error::Unimplemented { .. } if self.lenient_config => { + Ok(gix_diff::blob::Algorithm::Histogram) + } + err => Err(err), + }) + .with_lenient_default(self.lenient_config) + .map_err(|err| config::diff::drivers::Error { + name: driver.name.clone(), + attribute: "algorithm", + source: Box::new(err), + })? + .into(); + } + } + Ok(out) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_pipeline_options( + &self, + ) -> Result { + Ok(gix_diff::blob::pipeline::Options { + large_file_threshold_bytes: self.big_file_threshold()?, + fs: self.fs_capabilities()?, + }) + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { + self.diff_renames + .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) + .copied() + } + + #[cfg(feature = "blob-diff")] + pub(crate) fn big_file_threshold(&self) -> Result { + Ok(self + .resolved + .integer_by_key("core.bigFileThreshold") + .map(|number| Core::BIG_FILE_THRESHOLD.try_into_u64(number)) + .transpose() + .with_leniency(self.lenient_config)? + .unwrap_or(512 * 1024 * 1024)) + } + /// Returns a user agent for use with servers. #[cfg(any(feature = "async-network-client", feature = "blocking-network-client"))] pub(crate) fn user_agent_tuple(&self) -> (&'static str, Option>) { @@ -92,13 +183,6 @@ impl Cache { }) } - #[cfg(feature = "blob-diff")] - pub(crate) fn diff_renames(&self) -> Result, crate::diff::new_rewrites::Error> { - self.diff_renames - .get_or_try_init(|| crate::diff::new_rewrites(&self.resolved, self.lenient_config)) - .copied() - } - /// Returns (file-timeout, pack-refs timeout) pub(crate) fn lock_timeout( &self, diff --git a/gix/src/config/mod.rs b/gix/src/config/mod.rs index 33c97fac687..2db9e1e6bd3 100644 --- a/gix/src/config/mod.rs +++ b/gix/src/config/mod.rs @@ -122,6 +122,36 @@ pub mod diff { Unimplemented { name: BString }, } } + + /// + pub mod pipeline_options { + /// The error produced when obtaining options needed to fill in [gix_diff::blob::pipeline::Options]. + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + FilesystemCapabilities(#[from] crate::config::boolean::Error), + #[error(transparent)] + BigFileThreshold(#[from] crate::config::unsigned_integer::Error), + } + } + + /// + pub mod drivers { + use crate::bstr::BString; + + /// The error produced when obtaining a list of [Drivers](gix_diff::blob::Driver). + #[derive(Debug, thiserror::Error)] + #[error("Failed to parse value of 'diff.{name}.{attribute}'")] + pub struct Error { + /// The name fo the driver. + pub name: BString, + /// The name of the attribute we tried to parse. + pub attribute: &'static str, + /// The actual error that occurred. + pub source: Box, + } + } } /// diff --git a/gix/src/config/tree/sections/core.rs b/gix/src/config/tree/sections/core.rs index 2ec5c279ea3..50df2a77d9c 100644 --- a/gix/src/config/tree/sections/core.rs +++ b/gix/src/config/tree/sections/core.rs @@ -8,6 +8,9 @@ impl Core { pub const ABBREV: Abbrev = Abbrev::new_with_validate("abbrev", &config::Tree::CORE, validate::Abbrev); /// The `core.bare` key. pub const BARE: keys::Boolean = keys::Boolean::new_boolean("bare", &config::Tree::CORE); + /// The `core.bigFileThreshold` key. + pub const BIG_FILE_THRESHOLD: keys::UnsignedInteger = + keys::UnsignedInteger::new_unsigned_integer("bigFileThreshold", &config::Tree::CORE); /// The `core.checkStat` key. pub const CHECK_STAT: CheckStat = CheckStat::new_with_validate("checkStat", &config::Tree::CORE, validate::CheckStat); @@ -95,6 +98,7 @@ impl Section for Core { &[ &Self::ABBREV, &Self::BARE, + &Self::BIG_FILE_THRESHOLD, &Self::CHECK_STAT, &Self::DELTA_BASE_CACHE_LIMIT, &Self::DISAMBIGUATE, diff --git a/gix/src/config/tree/sections/diff.rs b/gix/src/config/tree/sections/diff.rs index 7c467b8f523..df84d9f4c26 100644 --- a/gix/src/config/tree/sections/diff.rs +++ b/gix/src/config/tree/sections/diff.rs @@ -1,3 +1,4 @@ +use crate::config::tree::SubSectionRequirement; use crate::{ config, config::tree::{keys, Diff, Key, Section}, @@ -17,6 +18,20 @@ impl Diff { ); /// The `diff.renames` key. pub const RENAMES: Renames = Renames::new_renames("renames", &config::Tree::DIFF); + + /// The `diff..command` key. + pub const DRIVER_COMMAND: keys::String = keys::String::new_string("command", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..textconv` key. + pub const DRIVER_TEXTCONV: keys::String = keys::String::new_string("textconv", &config::Tree::DIFF) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..algorithm` key. + pub const DRIVER_ALGORITHM: Algorithm = + Algorithm::new_with_validate("algorithm", &config::Tree::DIFF, validate::Algorithm) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + /// The `diff..binary` key. + pub const DRIVER_BINARY: Binary = Binary::new_with_validate("binary", &config::Tree::DIFF, validate::Binary) + .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); } impl Section for Diff { @@ -25,7 +40,15 @@ impl Section for Diff { } fn keys(&self) -> &[&dyn Key] { - &[&Self::ALGORITHM, &Self::RENAME_LIMIT, &Self::RENAMES] + &[ + &Self::ALGORITHM, + &Self::RENAME_LIMIT, + &Self::RENAMES, + &Self::DRIVER_COMMAND, + &Self::DRIVER_TEXTCONV, + &Self::DRIVER_ALGORITHM, + &Self::DRIVER_BINARY, + ] } } @@ -35,6 +58,9 @@ pub type Algorithm = keys::Any; /// The `diff.renames` key. pub type Renames = keys::Any; +/// The `diff..binary` key. +pub type Binary = keys::Any; + mod algorithm { use std::borrow::Cow; @@ -67,6 +93,38 @@ mod algorithm { } } +mod binary { + use crate::config::tree::diff::Binary; + + impl Binary { + /// Convert `value` into a tri-state boolean that can take the special value `auto`, resulting in `None`, or is a boolean. + /// If `None` is given, it's treated as implicit boolean `true`, as this method is made to be used + /// with [`gix_config::file::section::Body::value_implicit()`]. + pub fn try_into_binary( + &'static self, + value: Option>, + ) -> Result, crate::config::key::GenericErrorWithValue> { + Ok(match value { + None => Some(true), + Some(value) => { + if value.as_ref() == "auto" { + None + } else { + Some( + gix_config::Boolean::try_from(value.as_ref()) + .map(|b| b.0) + .map_err(|err| { + crate::config::key::GenericErrorWithValue::from_value(self, value.into_owned()) + .with_source(err) + })?, + ) + } + } + }) + } + } +} + mod renames { use crate::{ bstr::ByteSlice, @@ -125,4 +183,12 @@ mod validate { Ok(()) } } + + pub struct Binary; + impl keys::Validate for Binary { + fn validate(&self, value: &BStr) -> Result<(), Box> { + Diff::DRIVER_BINARY.try_into_binary(Some(value.into()))?; + Ok(()) + } + } } diff --git a/gix/src/diff.rs b/gix/src/diff.rs index 445698cea39..89fd7315e80 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -22,6 +22,7 @@ mod utils { use crate::config::cache::util::ApplyLeniency; use crate::config::tree::Diff; use crate::diff::rename::Tracking; + use crate::Repository; use gix_diff::rewrites::Copies; use gix_diff::Rewrites; @@ -38,6 +39,27 @@ mod utils { } } + /// + pub mod resource_cache { + /// The error returned by [`resource_cache()`](super::resource_cache()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error(transparent)] + DiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + WorktreeFilterOptions(#[from] crate::filter::pipeline::options::Error), + #[error(transparent)] + DiffDrivers(#[from] crate::config::diff::drivers::Error), + #[error(transparent)] + DiffPipelineOptions(#[from] crate::config::diff::pipeline_options::Error), + #[error(transparent)] + CommandContext(#[from] crate::config::command_context::Error), + #[error(transparent)] + AttributeStack(#[from] crate::config::attribute_stack::Error), + } + } + /// Create an instance by reading all relevant information from the `config`uration, while being `lenient` or not. /// Returns `Ok(None)` if nothing is configured. /// @@ -75,6 +97,46 @@ mod utils { } .into()) } + + /// Return a low-level utility to efficiently prepare a the blob-level diff operation between two resources, + /// and cache these diffable versions so that matrix-like MxN diffs are efficient. + /// + /// `repo` is used to obtain the needed configuration values, and `index` is used to potentially read `.gitattributes` + /// files from which may affect the diff operation. + /// `mode` determines how the diffable files will look like, and also how fast, in average, these conversions are. + /// `roots` provide information about where to get diffable data from, so source and destination can either be sourced from + /// a worktree, or from the object database, or both. + pub fn resource_cache( + repo: &Repository, + index: &gix_index::State, + mode: gix_diff::blob::pipeline::Mode, + roots: gix_diff::blob::pipeline::WorktreeRoots, + ) -> Result { + let diff_algo = repo.config.diff_algorithm()?; + let diff_cache = gix_diff::blob::Platform::new( + gix_diff::blob::platform::Options { + algorithm: Some(diff_algo), + skip_internal_diff_if_external_is_configured: false, + }, + gix_diff::blob::Pipeline::new( + roots, + gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(repo)?), + repo.config.diff_drivers()?, + repo.config.diff_pipeline_options()?, + ), + mode, + repo.attributes_only( + // TODO(perf): this could benefit from not having to build an intermediate index, + // and traverse the a tree directly. + index, + // This is an optimization, as we avoid reading files from the working tree, which also + // might not match the index at all depending on what the user passed. + gix_worktree::stack::state::attributes::Source::IdMapping, + )? + .inner, + ); + Ok(diff_cache) + } } #[cfg(feature = "blob-diff")] -pub use utils::new_rewrites; +pub use utils::{new_rewrites, resource_cache}; diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index 404e804327d..85d809fa83b 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -12,10 +12,12 @@ pub enum Error { Diff(#[from] gix_diff::tree::changes::Error), #[error("The user-provided callback failed")] ForEach(#[source] Box), - #[error("Could not configure diff algorithm prior to checking similarity")] - ConfigureDiffAlgorithm(#[from] crate::config::diff::algorithm::Error), + #[error(transparent)] + ResourceCache(#[from] crate::diff::resource_cache::Error), #[error("Failure during rename tracking")] RenameTracking(#[from] tracker::emit::Error), + #[error("Index for use in attribute stack could not be loaded")] + OpenIndex(#[from] crate::repository::index_or_load_from_head::Error), } /// @@ -36,18 +38,51 @@ impl<'a, 'old> Platform<'a, 'old> { other: &Tree<'new>, for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, None) + } + + /// Like [`Self::for_each_to_obtain_tree()`], but with a reusable `resource_cache` which is used to perform + /// diffs fast. + /// + /// Reusing it between multiple invocations saves a lot of IOps as it avoids the creation + /// of a temporary `resource_cache` that triggers reading or checking for multiple gitattribute files. + /// Note that it's recommended to call [`gix_diff::blob::Platform::clear_resource_cache()`] between the calls + /// to avoid runaway memory usage, as the cache isn't limited. + /// + /// Note that to do rename tracking like `git` does, one has to configure the `resource_cache` with + /// a conversion pipeline that uses [`gix_diff::blob::pipeline::Mode::ToGit`]. + pub fn for_each_to_obtain_tree_with_cache<'new, E>( + &mut self, + other: &Tree<'new>, + resource_cache: &mut gix_diff::blob::Platform, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + ) -> Result + where + E: std::error::Error + Sync + Send + 'static, + { + self.for_each_to_obtain_tree_inner(other, for_each, Some(resource_cache)) + } + + fn for_each_to_obtain_tree_inner<'new, E>( + &mut self, + other: &Tree<'new>, + for_each: impl FnMut(Change<'_, 'old, 'new>) -> Result, + resource_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result where E: std::error::Error + Sync + Send + 'static, { let repo = self.lhs.repo; - let diff_algo = repo.config.diff_algorithm()?; let mut delegate = Delegate { src_tree: self.lhs, other_repo: other.repo, recorder: gix_diff::tree::Recorder::default().track_location(self.tracking), visit: for_each, location: self.tracking, - tracked: self.rewrites.map(|r| rewrites::Tracker::new(r, diff_algo)), + tracked: self.rewrites.map(rewrites::Tracker::new), err: None, }; match gix_diff::tree::Changes::from(TreeRefIter::from_bytes(&self.lhs.data)).needed_to_obtain( @@ -58,7 +93,7 @@ impl<'a, 'old> Platform<'a, 'old> { ) { Ok(()) => { let outcome = Outcome { - rewrites: delegate.process_tracked_changes()?, + rewrites: delegate.process_tracked_changes(resource_cache)?, }; match delegate.err { Some(err) => Err(Error::ForEach(Box::new(err))), @@ -131,12 +166,33 @@ where } } - fn process_tracked_changes(&mut self) -> Result, Error> { + fn process_tracked_changes( + &mut self, + diff_cache: Option<&mut gix_diff::blob::Platform>, + ) -> Result, Error> { let tracked = match self.tracked.as_mut() { Some(t) => t, None => return Ok(None), }; + let repo = self.src_tree.repo; + let mut storage; + let diff_cache = match diff_cache { + Some(cache) => cache, + None => { + storage = crate::diff::resource_cache( + repo, + // NOTE: we could easily load the index at the source or destination tree, + // but even that isn't perfectly correct as there is only one, used for both sides. + // This is how `git` does it (or at least so it seems). + &*repo.index_or_load_from_head()?, + gix_diff::blob::pipeline::Mode::ToGit, + Default::default(), + )?; + &mut storage + } + }; + let outcome = tracked.emit( |dest, source| match source { Some(source) => { @@ -174,6 +230,7 @@ where &mut self.err, ), }, + diff_cache, &self.src_tree.repo.objects, |push| { self.src_tree diff --git a/gix/tests/config/tree.rs b/gix/tests/config/tree.rs index fd844f8b0f0..9ddb8d0470e 100644 --- a/gix/tests/config/tree.rs +++ b/gix/tests/config/tree.rs @@ -243,6 +243,37 @@ mod diff { Ok(()) } + #[test] + fn driver_binary() -> crate::Result { + assert_eq!( + Diff::DRIVER_BINARY.try_into_binary(Some(bcow("auto")))?, + None, + "this is as good as not setting it, but it's a valid value that would fail if it was just a boolean. It's undocumented though…" + ); + assert!(Diff::DRIVER_BINARY.validate("auto".into()).is_ok()); + + for (actual, expected) in [ + (Some("true"), Some(true)), + (Some("false"), Some(false)), + (None, Some(true)), + ] { + assert_eq!(Diff::DRIVER_BINARY.try_into_binary(actual.map(bcow))?, expected); + if let Some(value) = actual { + assert!(Diff::DRIVER_BINARY.validate(value.into()).is_ok()); + } + } + + assert_eq!( + Diff::DRIVER_BINARY + .try_into_binary(Some(bcow("something"))) + .unwrap_err() + .to_string(), + "The key \"diff..binary=something\" was invalid", + ); + assert!(Diff::DRIVER_BINARY.validate("foo".into()).is_err()); + Ok(()) + } + #[test] fn algorithm() -> crate::Result { for (actual, expected) in [ diff --git a/gix/tests/diff/mod.rs b/gix/tests/diff/mod.rs new file mode 100644 index 00000000000..c89681e7012 --- /dev/null +++ b/gix/tests/diff/mod.rs @@ -0,0 +1,46 @@ +use crate::util::named_repo; +use gix_diff::blob::{Algorithm, Driver}; + +#[test] +fn resource_cache() -> crate::Result { + let repo = named_repo("make_diff_repo.sh")?; + let cache = gix::diff::resource_cache( + &repo, + &*repo.index()?, + gix::diff::blob::pipeline::Mode::ToWorktreeAndBinaryToText, + Default::default(), + )?; + assert_eq!( + cache.filter.drivers(), + &[ + Driver { + name: "all-but-binary".into(), + command: Some("command".into()), + algorithm: Some(Algorithm::Histogram), + binary_to_text_command: Some("textconv".into()), + is_binary: None + }, + Driver { + name: "binary-false".into(), + is_binary: Some(false), + ..Default::default() + }, + Driver { + name: "binary-true".into(), + is_binary: Some(true), + ..Default::default() + } + ] + ); + assert_eq!(cache.options.algorithm, Some(Algorithm::Histogram)); + assert!( + !cache.options.skip_internal_diff_if_external_is_configured, + "pre-set to something that makes sense for most" + ); + assert_eq!( + cache.filter.options.large_file_threshold_bytes, + 512 * 1024 * 1024, + "the default value unless it's configured" + ); + Ok(()) +} diff --git a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz index 0d6b76452b7..d4b13341e89 100644 --- a/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz +++ b/gix/tests/fixtures/generated-archives/make_diff_repo.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dceeedc3ee706632030f8317f2245c327456ec58a6ac690d09099557a5aa23b0 -size 32336 +oid sha256:66f54895c75e2eebaad514224242522c19e2b16b12a48f22a059b438745e0327 +size 32444 diff --git a/gix/tests/fixtures/make_diff_repo.sh b/gix/tests/fixtures/make_diff_repo.sh index 1aac40b43a7..1175cf4ccd9 100755 --- a/gix/tests/fixtures/make_diff_repo.sh +++ b/gix/tests/fixtures/make_diff_repo.sh @@ -3,6 +3,24 @@ set -eu -o pipefail git init -q +cat <>.git/config + +[diff "binary-true"] + binary = true +[diff "binary-false"] + binary = false +[diff ""] + command = "empty is ignored" +[diff] + command = "this is also ignored as sub-section name is missing" + algorithm = histogram +[diff "all-but-binary"] + command = command + textconv = textconv + algorithm = histogram + binary = auto +EOF + git checkout -b main mkdir dir touch a b dir/c d diff --git a/gix/tests/gix.rs b/gix/tests/gix.rs index 4721e0d4eb2..81008faca89 100644 --- a/gix/tests/gix.rs +++ b/gix/tests/gix.rs @@ -4,6 +4,8 @@ use util::*; mod clone; mod commit; mod config; +#[cfg(feature = "blob-diff")] +mod diff; mod head; mod id; mod init; diff --git a/src/plumbing/progress.rs b/src/plumbing/progress.rs index 21b56fa71d8..21b0ab9af9d 100644 --- a/src/plumbing/progress.rs +++ b/src/plumbing/progress.rs @@ -124,10 +124,6 @@ static GIT_CONFIG: &[Record] = &[ config: "core.alternateRefsPrefixes", usage: NotPlanned { reason: "seems like a niche feature, but can be implemented if there is demand" } }, - Record { - config: "core.bigFileThreshold", - usage: Planned { note: Some("unfortunately we can't stream packed files yet, even if not delta-compressed, but respecting the threshold for other operations is definitely a must") } - }, Record { config: "core.compression", usage: Planned { note: Some("Allow to remove similar hardcoded value - passing it through will be some effort") }, From 6f4bbc31411cd3528cc6dd3db54a333ff861ec95 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 2 Dec 2023 14:57:59 +0100 Subject: [PATCH 17/20] feat: add key for `diff.external`. That way it's conceivable that applications correctly run either a configured external diff tool, or one that is configured on a per diff-driver basis, while being allowed to fall back to a built-in implementation as needed. --- gix/src/config/cache/init.rs | 10 ++++++++++ gix/src/config/tree/sections/diff.rs | 5 +++++ gix/tests/gix-init.rs | 5 ++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/gix/src/config/cache/init.rs b/gix/src/config/cache/init.rs index 7f6bc0ab3fc..e29c156e841 100644 --- a/gix/src/config/cache/init.rs +++ b/gix/src/config/cache/init.rs @@ -530,6 +530,16 @@ fn apply_environment_overrides( (env(key), key.name) }], ), + #[cfg(feature = "blob-diff")] + ( + "diff", + None, + git_prefix, + &[{ + let key = &config::tree::Diff::EXTERNAL; + (env(key), key.name) + }], + ), ] { let mut section = env_override .new_section(section_name, subsection_name) diff --git a/gix/src/config/tree/sections/diff.rs b/gix/src/config/tree/sections/diff.rs index df84d9f4c26..c8bf2249326 100644 --- a/gix/src/config/tree/sections/diff.rs +++ b/gix/src/config/tree/sections/diff.rs @@ -32,6 +32,10 @@ impl Diff { /// The `diff..binary` key. pub const DRIVER_BINARY: Binary = Binary::new_with_validate("binary", &config::Tree::DIFF, validate::Binary) .with_subsection_requirement(Some(SubSectionRequirement::Parameter("driver"))); + + /// The `diff.external` key. + pub const EXTERNAL: keys::Program = + keys::Program::new_program("external", &config::Tree::DIFF).with_environment_override("GIT_EXTERNAL_DIFF"); } impl Section for Diff { @@ -48,6 +52,7 @@ impl Section for Diff { &Self::DRIVER_TEXTCONV, &Self::DRIVER_ALGORITHM, &Self::DRIVER_BINARY, + &Self::EXTERNAL, ] } } diff --git a/gix/tests/gix-init.rs b/gix/tests/gix-init.rs index 531f89e9297..f3d0f0ac242 100644 --- a/gix/tests/gix-init.rs +++ b/gix/tests/gix-init.rs @@ -50,7 +50,8 @@ mod with_overrides { .set("GIT_ICASE_PATHSPECS", "pathspecs-icase") .set("GIT_TERMINAL_PROMPT", "42") .set("GIT_SHALLOW_FILE", "shallow-file-env") - .set("GIT_NAMESPACE", "namespace-env"); + .set("GIT_NAMESPACE", "namespace-env") + .set("GIT_EXTERNAL_DIFF", "external-diff-env"); let mut opts = gix::open::Options::isolated() .cli_overrides([ "http.userAgent=agent-from-cli", @@ -234,6 +235,8 @@ mod with_overrides { ("gitoxide.http.verbose", "true"), ("gitoxide.allow.protocolFromUser", "file-allowed"), ("core.useReplaceRefs", "no-replace"), + #[cfg(feature = "blob-diff")] + ("diff.external", "external-diff-env"), ("gitoxide.objects.replaceRefBase", "refs/replace-mine"), ("gitoxide.committer.nameFallback", "committer name"), ("gitoxide.committer.emailFallback", "committer email"), From 406acef4710f3c2c9371621a57e0534b3758f76c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 2 Dec 2023 16:51:05 +0100 Subject: [PATCH 18/20] Allow empty paths to be passed to Stack in debug mode It can handle it, so let's let it be a no-op. --- gix-fs/src/stack.rs | 1 - gix-fs/tests/stack/mod.rs | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/gix-fs/src/stack.rs b/gix-fs/src/stack.rs index d90d662ca34..5d3dfeccd34 100644 --- a/gix-fs/src/stack.rs +++ b/gix-fs/src/stack.rs @@ -63,7 +63,6 @@ impl Stack { relative.is_relative(), "only index paths are handled correctly here, must be relative" ); - debug_assert!(!relative.to_string_lossy().is_empty(), "empty paths are not allowed"); if self.valid_components == 0 { delegate.push_directory(self)?; diff --git a/gix-fs/tests/stack/mod.rs b/gix-fs/tests/stack/mod.rs index 3e8cb7ade03..5e122cdb007 100644 --- a/gix-fs/tests/stack/mod.rs +++ b/gix-fs/tests/stack/mod.rs @@ -141,5 +141,16 @@ fn delegate_calls_are_consistent() -> crate::Result { "the stack is state so keeps thinking it's a directory which is consistent. Git does it differently though." ); + s.make_relative_path_current("".as_ref(), &mut r)?; + assert_eq!( + r, + Record { + push_dir: 9, + dirs: vec![".".into()], + push: 14, + }, + "empty-paths reset the tree effectively" + ); + Ok(()) } From 4743212269c6fab69f6306fba88ee38b669a7dc3 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 2 Dec 2023 15:21:56 +0100 Subject: [PATCH 19/20] feat!: `object::blob::diff::Platform` now performs all necessary conversions. Previously it would just offer the git-ODB version of a blob for diffing, while it will now make it possible to apply all necessary conversion steps for you. This also moves `Event::diff()` to `Change::diff()`, adds `Repository::diff_resource_cache()` and refactors nearly everything about the `objects::blob::diff::Platform`. --- gix/src/diff.rs | 7 +- gix/src/object/blob.rs | 274 ++++++++++++++++++--------- gix/src/object/tree/diff/change.rs | 32 ++-- gix/src/object/tree/diff/for_each.rs | 14 +- gix/src/repository/diff.rs | 45 +++++ gix/src/repository/filter.rs | 4 +- gix/src/repository/mod.rs | 3 + gix/tests/diff/mod.rs | 1 + gix/tests/object/tree/diff.rs | 9 +- 9 files changed, 260 insertions(+), 129 deletions(-) create mode 100644 gix/src/repository/diff.rs diff --git a/gix/src/diff.rs b/gix/src/diff.rs index 89fd7315e80..315ef6cd791 100644 --- a/gix/src/diff.rs +++ b/gix/src/diff.rs @@ -104,12 +104,15 @@ mod utils { /// `repo` is used to obtain the needed configuration values, and `index` is used to potentially read `.gitattributes` /// files from which may affect the diff operation. /// `mode` determines how the diffable files will look like, and also how fast, in average, these conversions are. + /// `attribute_source` controls where `.gitattributes` will be read from, and it's typically adjusted based on the + /// `roots` - if there are no worktree roots, `.gitattributes` are also not usually read from worktrees. /// `roots` provide information about where to get diffable data from, so source and destination can either be sourced from /// a worktree, or from the object database, or both. pub fn resource_cache( repo: &Repository, index: &gix_index::State, mode: gix_diff::blob::pipeline::Mode, + attribute_source: gix_worktree::stack::state::attributes::Source, roots: gix_diff::blob::pipeline::WorktreeRoots, ) -> Result { let diff_algo = repo.config.diff_algorithm()?; @@ -129,9 +132,7 @@ mod utils { // TODO(perf): this could benefit from not having to build an intermediate index, // and traverse the a tree directly. index, - // This is an optimization, as we avoid reading files from the working tree, which also - // might not match the index at all depending on what the user passed. - gix_worktree::stack::state::attributes::Source::IdMapping, + attribute_source, )? .inner, ); diff --git a/gix/src/object/blob.rs b/gix/src/object/blob.rs index ef19c302a62..0c7791e5db7 100644 --- a/gix/src/object/blob.rs +++ b/gix/src/object/blob.rs @@ -3,63 +3,132 @@ use crate::{Blob, ObjectDetached}; /// #[cfg(feature = "blob-diff")] pub mod diff { + use gix_diff::blob::platform::prepare_diff::Operation; + use gix_diff::blob::ResourceKind; use std::ops::Range; - use crate::{bstr::ByteSlice, object::blob::diff::line::Change}; + use crate::object::tree::diff::change::Event; + use crate::{bstr::ByteSlice, object::blob::diff::lines::Change}; /// A platform to keep temporary information to perform line diffs on modified blobs. /// - pub struct Platform<'old, 'new> { - /// The previous version of the blob. - pub old: crate::Object<'old>, - /// The new version of the blob. - pub new: crate::Object<'new>, - /// The algorithm to use when calling [imara_diff::diff()][gix_diff::blob::diff()]. - /// This value is determined by the `diff.algorithm` configuration. - pub algo: gix_diff::blob::Algorithm, + pub struct Platform<'a> { + /// The cache holding diffable data related to our blobs. + pub resource_cache: &'a mut gix_diff::blob::Platform, } /// pub mod init { - /// The error returned by [`Platform::from_ids()`][super::Platform::from_ids()]. - #[derive(Debug, thiserror::Error)] - #[allow(missing_docs)] - pub enum Error { - #[error("Could not find the previous blob or the new blob to diff against")] - FindExisting(#[from] crate::object::find::existing::Error), - #[error("Could not obtain diff algorithm from configuration")] - DiffAlgorithm(#[from] crate::config::diff::algorithm::Error), - } + /// The error returned by [`Platform::from_tree_change()`][super::Platform::from_tree_change()]. + pub type Error = gix_diff::blob::platform::set_resource::Error; } - impl<'old, 'new> Platform<'old, 'new> { - /// Produce a platform for performing various diffs after obtaining the object data of `previous_id` and `new_id`. - /// - /// Note that these objects are treated as raw data and are assumed to be blobs. - pub fn from_ids( - previous_id: &crate::Id<'old>, - new_id: &crate::Id<'new>, - ) -> Result, init::Error> { - match previous_id - .object() - .and_then(|old| new_id.object().map(|new| (old, new))) - { - Ok((old, new)) => { - let algo = match new_id.repo.config.diff_algorithm() { - Ok(algo) => algo, - Err(err) => return Err(err.into()), - }; - Ok(Platform { old, new, algo }) + impl<'a, 'new> Platform<'a> { + /// Produce a platform for performing various diffs after obtaining the data from a single `tree_change`. + pub fn from_tree_change( + tree_change: &crate::object::tree::diff::Change<'_, '_, '_>, + resource_cache: &'a mut gix_diff::blob::Platform, + ) -> Result, init::Error> { + match tree_change.event { + Event::Addition { entry_mode, id } => { + resource_cache.set_resource( + id.repo.object_hash().null(), + entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Deletion { entry_mode, id } => { + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &id.repo.objects, + )?; + resource_cache.set_resource( + id.repo.object_hash().null(), + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Modification { + previous_entry_mode, + previous_id, + entry_mode, + id, + } => { + resource_cache.set_resource( + previous_id.inner, + previous_entry_mode.kind(), + tree_change.location, + ResourceKind::OldOrSource, + &previous_id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; + } + Event::Rewrite { + source_location, + source_entry_mode, + source_id, + entry_mode, + id, + diff: _, + copy: _, + } => { + resource_cache.set_resource( + source_id.inner, + source_entry_mode.kind(), + source_location, + ResourceKind::OldOrSource, + &source_id.repo.objects, + )?; + resource_cache.set_resource( + id.inner, + entry_mode.kind(), + tree_change.location, + ResourceKind::NewOrDestination, + &id.repo.objects, + )?; } - Err(err) => Err(err.into()), } + Ok(Self { resource_cache }) } } /// - pub mod line { + pub mod lines { use crate::bstr::BStr; + /// The error returned by [Platform::lines()](super::Platform::lines()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error + where + E: std::error::Error + Send + Sync + 'static, + { + #[error(transparent)] + ProcessHunk(E), + #[error(transparent)] + PrepareDiff(#[from] gix_diff::blob::platform::prepare_diff::Error), + } + /// A change to a hunk of lines. pub enum Change<'a, 'data> { /// Lines were added. @@ -82,70 +151,91 @@ pub mod diff { } } - impl<'old, 'new> Platform<'old, 'new> { + impl<'a> Platform<'a> { /// Perform a diff on lines between the old and the new version of a blob, passing each hunk of lines to `process_hunk`. - /// The diffing algorithm is determined by the `diff.algorithm` configuration. - /// - /// Note that you can invoke the diff more flexibly as well. + /// The diffing algorithm is determined by the `diff.algorithm` configuration, or individual diff drivers. + /// Note that `process_hunk` is not called if one of the involved resources are binary, but that can be determined + /// by introspecting the outcome. // TODO: more tests (only tested insertion right now) - pub fn lines(&self, mut process_hunk: FnH) -> Result<(), E> + pub fn lines( + &mut self, + mut process_hunk: FnH, + ) -> Result, lines::Error> where - FnH: FnMut(line::Change<'_, '_>) -> Result<(), E>, - E: std::error::Error, + FnH: FnMut(lines::Change<'_, '_>) -> Result<(), E>, + E: std::error::Error + Send + Sync + 'static, { - let input = self.line_tokens(); - let mut err = None; - let mut lines = Vec::new(); - gix_diff::blob::diff(self.algo, &input, |before: Range, after: Range| { - if err.is_some() { - return; + self.resource_cache.options.skip_internal_diff_if_external_is_configured = false; + + let prep = self.resource_cache.prepare_diff()?; + match prep.operation { + Operation::InternalDiff { algorithm } => { + let input = prep.interned_input(); + let mut err = None; + let mut lines = Vec::new(); + + gix_diff::blob::diff(algorithm, &input, |before: Range, after: Range| { + if err.is_some() { + return; + } + lines.clear(); + lines.extend( + input.before[before.start as usize..before.end as usize] + .iter() + .map(|&line| input.interner[line].as_bstr()), + ); + let end_of_before = lines.len(); + lines.extend( + input.after[after.start as usize..after.end as usize] + .iter() + .map(|&line| input.interner[line].as_bstr()), + ); + let hunk_before = &lines[..end_of_before]; + let hunk_after = &lines[end_of_before..]; + if hunk_after.is_empty() { + err = process_hunk(Change::Deletion { lines: hunk_before }).err(); + } else if hunk_before.is_empty() { + err = process_hunk(Change::Addition { lines: hunk_after }).err(); + } else { + err = process_hunk(Change::Modification { + lines_before: hunk_before, + lines_after: hunk_after, + }) + .err(); + } + }); + + if let Some(err) = err { + return Err(lines::Error::ProcessHunk(err)); + } } - lines.clear(); - lines.extend( - input.before[before.start as usize..before.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()), - ); - let end_of_before = lines.len(); - lines.extend( - input.after[after.start as usize..after.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()), - ); - let hunk_before = &lines[..end_of_before]; - let hunk_after = &lines[end_of_before..]; - if hunk_after.is_empty() { - err = process_hunk(Change::Deletion { lines: hunk_before }).err(); - } else if hunk_before.is_empty() { - err = process_hunk(Change::Addition { lines: hunk_after }).err(); - } else { - err = process_hunk(Change::Modification { - lines_before: hunk_before, - lines_after: hunk_after, - }) - .err(); + Operation::ExternalCommand { .. } => { + unreachable!("we disabled that") } - }); - - match err { - Some(err) => Err(err), - None => Ok(()), - } + Operation::SourceOrDestinationIsBinary => {} + }; + Ok(prep) } /// Count the amount of removed and inserted lines efficiently. - pub fn line_counts(&self) -> gix_diff::blob::sink::Counter<()> { - let tokens = self.line_tokens(); - gix_diff::blob::diff(self.algo, &tokens, gix_diff::blob::sink::Counter::default()) - } + /// Note that nothing will happen if one of the inputs is binary, and `None` will be returned. + pub fn line_counts( + &mut self, + ) -> Result>, gix_diff::blob::platform::prepare_diff::Error> { + self.resource_cache.options.skip_internal_diff_if_external_is_configured = false; - /// Return a tokenizer which treats lines as smallest unit for use in a [diff operation][gix_diff::blob::diff()]. - /// - /// The line separator is determined according to normal git rules and filters. - pub fn line_tokens(&self) -> gix_diff::blob::intern::InternedInput<&[u8]> { - // TODO: make use of `core.eol` and/or filters to do line-counting correctly. It's probably - // OK to just know how these objects are saved to know what constitutes a line. - gix_diff::blob::intern::InternedInput::new(self.old.data.as_bytes(), self.new.data.as_bytes()) + let prep = self.resource_cache.prepare_diff()?; + match prep.operation { + Operation::InternalDiff { algorithm } => { + let tokens = prep.interned_input(); + let counter = gix_diff::blob::diff(algorithm, &tokens, gix_diff::blob::sink::Counter::default()); + Ok(Some(counter)) + } + Operation::ExternalCommand { .. } => { + unreachable!("we disabled that") + } + Operation::SourceOrDestinationIsBinary => Ok(None), + } } } } diff --git a/gix/src/object/tree/diff/change.rs b/gix/src/object/tree/diff/change.rs index a95770d6656..4e2bd2eff65 100644 --- a/gix/src/object/tree/diff/change.rs +++ b/gix/src/object/tree/diff/change.rs @@ -68,25 +68,25 @@ pub enum Event<'a, 'old, 'new> { }, } -impl<'a, 'old, 'new> Event<'a, 'old, 'new> { - /// Produce a platform for performing a line-diff, or `None` if this is not a [`Modification`][Event::Modification] - /// or one of the entries to compare is not a blob. - pub fn diff( +impl<'a, 'old, 'new> super::Change<'a, 'old, 'new> { + /// Produce a platform for performing a line-diff no matter whether the underlying [Event] is an addition, modification, + /// deletion or rewrite. + /// Use `resource_cache` to store the diffable data and possibly reuse previously stored data. + /// Afterwards the platform, which holds on to `resource_cache`, can be used to perform ready-made operations on the + /// pre-set resources. + /// + /// ### Warning about Memory Consumption + /// + /// `resource_cache` only grows, so one should call [`gix_diff::blob::Platform::clear_resource_cache`] occasionally. + pub fn diff<'b>( &self, - ) -> Option, crate::object::blob::diff::init::Error>> { - match self { - Event::Modification { - previous_entry_mode, - previous_id, - entry_mode, - id, - } if entry_mode.is_blob() && previous_entry_mode.is_blob() => { - Some(crate::object::blob::diff::Platform::from_ids(previous_id, id)) - } - _ => None, - } + resource_cache: &'b mut gix_diff::blob::Platform, + ) -> Result, crate::object::blob::diff::init::Error> { + crate::object::blob::diff::Platform::from_tree_change(self, resource_cache) } +} +impl<'a, 'old, 'new> Event<'a, 'old, 'new> { /// Return the current mode of this instance. pub fn entry_mode(&self) -> gix_object::tree::EntryMode { match self { diff --git a/gix/src/object/tree/diff/for_each.rs b/gix/src/object/tree/diff/for_each.rs index 85d809fa83b..86cddb528b6 100644 --- a/gix/src/object/tree/diff/for_each.rs +++ b/gix/src/object/tree/diff/for_each.rs @@ -13,11 +13,9 @@ pub enum Error { #[error("The user-provided callback failed")] ForEach(#[source] Box), #[error(transparent)] - ResourceCache(#[from] crate::diff::resource_cache::Error), + ResourceCache(#[from] crate::repository::diff::resource_cache::Error), #[error("Failure during rename tracking")] RenameTracking(#[from] tracker::emit::Error), - #[error("Index for use in attribute stack could not be loaded")] - OpenIndex(#[from] crate::repository::index_or_load_from_head::Error), } /// @@ -180,15 +178,7 @@ where let diff_cache = match diff_cache { Some(cache) => cache, None => { - storage = crate::diff::resource_cache( - repo, - // NOTE: we could easily load the index at the source or destination tree, - // but even that isn't perfectly correct as there is only one, used for both sides. - // This is how `git` does it (or at least so it seems). - &*repo.index_or_load_from_head()?, - gix_diff::blob::pipeline::Mode::ToGit, - Default::default(), - )?; + storage = repo.diff_resource_cache(gix_diff::blob::pipeline::Mode::ToGit, Default::default())?; &mut storage } }; diff --git a/gix/src/repository/diff.rs b/gix/src/repository/diff.rs new file mode 100644 index 00000000000..cb1d070a2f4 --- /dev/null +++ b/gix/src/repository/diff.rs @@ -0,0 +1,45 @@ +use crate::Repository; + +/// +pub mod resource_cache { + /// The error returned by [Repository::diff_resource_cache()](super::Repository::diff_resource_cache()). + #[derive(Debug, thiserror::Error)] + #[allow(missing_docs)] + pub enum Error { + #[error("Could not obtain resource cache for diffing")] + ResourceCache(#[from] crate::diff::resource_cache::Error), + #[error(transparent)] + Index(#[from] crate::repository::index_or_load_from_head::Error), + } +} + +/// Diff-utilities +impl Repository { + /// Create a resource cache for diffable objects, and configured with everything it needs to know to perform diffs + /// faithfully just like `git` would. + /// `mode` controls what version of a resource should be diffed. + /// `worktree_roots` determine if files can be read from the worktree, where each side of the diff operation can + /// be represented by its own worktree root. `.gitattributes` are automatically read from the worktree if at least + /// one worktree is present. + /// + /// Note that attributes will always be obtained from the current `HEAD` index even if the resources being diffed + /// might live in another tree. Further, if one of the `worktree_roots` are set, attributes will also be read from + /// the worktree. Otherwise, it will be skipped and attributes are read from the index tree instead. + pub fn diff_resource_cache( + &self, + mode: gix_diff::blob::pipeline::Mode, + worktree_roots: gix_diff::blob::pipeline::WorktreeRoots, + ) -> Result { + Ok(crate::diff::resource_cache( + self, + &*self.index_or_load_from_head()?, + mode, + if worktree_roots.new_root.is_some() || worktree_roots.old_root.is_some() { + gix_worktree::stack::state::attributes::Source::WorktreeThenIdMapping + } else { + gix_worktree::stack::state::attributes::Source::IdMapping + }, + worktree_roots, + )?) + } +} diff --git a/gix/src/repository/filter.rs b/gix/src/repository/filter.rs index 3aacb1a3d2c..68644ca984e 100644 --- a/gix/src/repository/filter.rs +++ b/gix/src/repository/filter.rs @@ -2,7 +2,7 @@ use crate::{filter, repository::IndexPersistedOrInMemory, Id, Repository}; /// pub mod pipeline { - /// The error returned by [Repository::filter_pipeline()][super::Repository::filter_pipeline()]. + /// The error returned by [Repository::filter_pipeline()](super::Repository::filter_pipeline()). #[derive(Debug, thiserror::Error)] #[allow(missing_docs)] pub enum Error { @@ -24,7 +24,7 @@ pub mod pipeline { impl Repository { /// Configure a pipeline for converting byte buffers to the worktree representation, and byte streams to the git-internal /// representation. Also return the index that was used when initializing the pipeline as it may be useful when calling - /// [convert_to_git()][filter::Pipeline::convert_to_git()]. + /// [convert_to_git()](filter::Pipeline::convert_to_git()). /// Bare repositories will either use `HEAD^{tree}` for accessing all relevant worktree files or the given `tree_if_bare`. /// /// Note that this is considered a primitive as it operates on data directly and will not have permanent effects. diff --git a/gix/src/repository/mod.rs b/gix/src/repository/mod.rs index 6b9e284cc05..28aa5aa8e28 100644 --- a/gix/src/repository/mod.rs +++ b/gix/src/repository/mod.rs @@ -40,6 +40,9 @@ pub mod attributes; mod cache; mod config; /// +#[cfg(feature = "blob-diff")] +pub mod diff; +/// #[cfg(feature = "attributes")] pub mod filter; mod graph; diff --git a/gix/tests/diff/mod.rs b/gix/tests/diff/mod.rs index c89681e7012..d6087a50209 100644 --- a/gix/tests/diff/mod.rs +++ b/gix/tests/diff/mod.rs @@ -8,6 +8,7 @@ fn resource_cache() -> crate::Result { &repo, &*repo.index()?, gix::diff::blob::pipeline::Mode::ToWorktreeAndBinaryToText, + gix_worktree::stack::state::attributes::Source::IdMapping, Default::default(), )?; assert_eq!( diff --git a/gix/tests/object/tree/diff.rs b/gix/tests/object/tree/diff.rs index c3304965b3f..5f9cff57648 100644 --- a/gix/tests/object/tree/diff.rs +++ b/gix/tests/object/tree/diff.rs @@ -2,7 +2,7 @@ use std::convert::Infallible; use gix::{ bstr::BString, - object::{blob::diff::line::Change, tree::diff::change::Event}, + object::{blob::diff::lines::Change, tree::diff::change::Event}, }; use gix_object::bstr::ByteSlice; use gix_object::tree::EntryKind; @@ -14,6 +14,7 @@ fn changes_against_tree_modified() -> crate::Result { let repo = named_repo("make_diff_repo.sh")?; let from = tree_named(&repo, "@^{/c3-modification}~1"); let to = tree_named(&repo, ":/c3-modification"); + let mut cache = repo.diff_resource_cache(gix_diff::blob::pipeline::Mode::ToGit, Default::default())?; from.changes()? .for_each_to_obtain_tree(&to, |change| -> Result<_, Infallible> { assert_eq!(change.location, "", "without configuration the location field is empty"); @@ -34,14 +35,14 @@ fn changes_against_tree_modified() -> crate::Result { } }; - let diff = change.event.diff().expect("changed file").expect("objects available"); - let count = diff.line_counts(); + let mut diff = change.diff(&mut cache).expect("objects available"); + let count = diff.line_counts().expect("no diff error").expect("no binary blobs"); assert_eq!(count.insertions, 1); assert_eq!(count.removals, 0); diff.lines(|hunk| { match hunk { Change::Deletion { .. } => unreachable!("there was no deletion"), - Change::Addition { lines } => assert_eq!(lines, vec!["a1".as_bytes().as_bstr()]), + Change::Addition { lines } => assert_eq!(lines, vec!["a1\n".as_bytes().as_bstr()]), Change::Modification { .. } => unreachable!("there was no modification"), }; Ok::<_, Infallible>(()) From 1706e2394380c35cd98d0e106eb0985ae1912da0 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 15 Nov 2023 17:41:00 +0100 Subject: [PATCH 20/20] adapt to changes in `gix-diff` --- .github/workflows/ci.yml | 2 +- gitoxide-core/src/hours/core.rs | 83 +++++++++--------------- gitoxide-core/src/query/engine/update.rs | 73 ++++++++++++++------- gix-index/src/fs.rs | 5 +- gix-status/src/stack.rs | 1 + gix/Cargo.toml | 2 +- gix/src/object/blob.rs | 2 +- gix/src/repository/worktree.rs | 2 +- gix/tests/object/tree/diff.rs | 8 ++- src/plumbing/progress.rs | 10 +-- 10 files changed, 92 insertions(+), 96 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c49221e4635..c58d5f071ef 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -205,7 +205,7 @@ jobs: name: crates without feature toggles - run: set +x; for feature in progress fs-walkdir-parallel parallel io-pipe crc32 zlib zlib-rust-backend fast-sha1 rustsha1 cache-efficiency-debug; do (cd gix-features && cargo build --features $feature --target ${{ matrix.target }}); done name: features of gix-features - - run: set +x; for name in gix-diff gix-pack; do (cd $name && cargo build --features wasm --target ${{ matrix.target }}); done + - run: set +x; for name in gix-pack; do (cd $name && cargo build --features wasm --target ${{ matrix.target }}); done name: crates with 'wasm' feature - run: cd gix-pack && cargo build --all-features --target ${{ matrix.target }} name: gix-pack with all features (including wasm) diff --git a/gitoxide-core/src/hours/core.rs b/gitoxide-core/src/hours/core.rs index 9728f6b5ecd..de8c01690be 100644 --- a/gitoxide-core/src/hours/core.rs +++ b/gitoxide-core/src/hours/core.rs @@ -8,7 +8,6 @@ use std::{ use gix::bstr::BStr; use itertools::Itertools; -use smallvec::SmallVec; use crate::hours::{ util::{add_lines, remove_lines}, @@ -92,25 +91,16 @@ pub fn spawn_tree_delta_threads<'scope>( move || -> Result<_, anyhow::Error> { let mut out = Vec::new(); let (commits, changes, lines_count) = stats_counters; - let mut attributes = line_stats + let mut cache = line_stats .then(|| -> anyhow::Result<_> { - repo.index_or_load_from_head().map_err(Into::into).and_then(|index| { - repo.attributes( - &index, - gix::worktree::stack::state::attributes::Source::IdMapping, - gix::worktree::stack::state::ignore::Source::IdMapping, - None, - ) - .map_err(Into::into) - .map(|attrs| { - let matches = attrs.selected_attribute_matches(["binary", "text"]); - (attrs, matches) - }) - }) + Ok(repo.diff_resource_cache(gix::diff::blob::pipeline::Mode::ToGit, Default::default())?) }) .transpose()?; for chunk in rx { for (commit_idx, parent_commit, commit) in chunk { + if let Some(cache) = cache.as_mut() { + cache.clear_resource_cache(); + } commits.fetch_add(1, Ordering::Relaxed); if gix::interrupt::is_triggered() { return Ok(out); @@ -155,47 +145,34 @@ pub fn spawn_tree_delta_threads<'scope>( previous_entry_mode, id, previous_id, - } => { - match (previous_entry_mode.is_blob(), entry_mode.is_blob()) { - (false, false) => {} - (false, true) => { - files.added += 1; - add_lines(line_stats, &lines_count, &mut lines, id); - } - (true, false) => { - files.removed += 1; - remove_lines(line_stats, &lines_count, &mut lines, previous_id); - } - (true, true) => { - files.modified += 1; - if let Some((attrs, matches)) = attributes.as_mut() { - let entry = attrs.at_entry(change.location, Some(false))?; - let is_text_file = if entry.matching_attributes(matches) { - let attrs: SmallVec<[_; 2]> = - matches.iter_selected().collect(); - let binary = &attrs[0]; - let text = &attrs[1]; - !binary.assignment.state.is_set() - && !text.assignment.state.is_unset() - } else { - // In the absence of binary or text markers, we assume it's text. - true - }; - - if let Some(Ok(diff)) = - is_text_file.then(|| change.event.diff()).flatten() - { - let mut nl = 0; - let counts = diff.line_counts(); - nl += counts.insertions as usize + counts.removals as usize; - lines.added += counts.insertions as usize; - lines.removed += counts.removals as usize; - lines_count.fetch_add(nl, Ordering::Relaxed); - } + } => match (previous_entry_mode.is_blob(), entry_mode.is_blob()) { + (false, false) => {} + (false, true) => { + files.added += 1; + add_lines(line_stats, &lines_count, &mut lines, id); + } + (true, false) => { + files.removed += 1; + remove_lines(line_stats, &lines_count, &mut lines, previous_id); + } + (true, true) => { + files.modified += 1; + if let Some(cache) = cache.as_mut() { + let mut diff = change.diff(cache).map_err(|err| { + std::io::Error::new(std::io::ErrorKind::Other, err) + })?; + let mut nl = 0; + if let Some(counts) = diff.line_counts().map_err(|err| { + std::io::Error::new(std::io::ErrorKind::Other, err) + })? { + nl += counts.insertions as usize + counts.removals as usize; + lines.added += counts.insertions as usize; + lines.removed += counts.removals as usize; + lines_count.fetch_add(nl, Ordering::Relaxed); } } } - } + }, } Ok::<_, std::io::Error>(Default::default()) })?; diff --git a/gitoxide-core/src/query/engine/update.rs b/gitoxide-core/src/query/engine/update.rs index 1dcf57ace7b..74d3ba42b1b 100644 --- a/gitoxide-core/src/query/engine/update.rs +++ b/gitoxide-core/src/query/engine/update.rs @@ -6,6 +6,7 @@ use std::{ }; use anyhow::{anyhow, bail}; +use gix::diff::blob::platform::prepare_diff::Operation; use gix::objs::find::Error; use gix::{ bstr::{BStr, BString, ByteSlice}, @@ -173,6 +174,9 @@ pub fn update( repo.object_cache_size_if_unset((object_cache_size_mb * 1024 * 1024) / threads); let rx = rx.clone(); move || -> anyhow::Result<()> { + let mut rewrite_cache = + repo.diff_resource_cache(gix::diff::blob::pipeline::Mode::ToGit, Default::default())?; + let mut diff_cache = rewrite_cache.clone(); for (chunk_id, chunk) in rx { let mut out_chunk = Vec::with_capacity(chunk.len()); for Task { @@ -201,10 +205,12 @@ pub fn update( Some(c) => c, None => continue, }; + rewrite_cache.clear_resource_cache(); + diff_cache.clear_resource_cache(); from.changes()? .track_path() .track_rewrites(Some(rewrites)) - .for_each_to_obtain_tree(&to, |change| { + .for_each_to_obtain_tree_with_cache(&to, &mut rewrite_cache, |change| { use gix::object::tree::diff::change::Event::*; change_counter.fetch_add(1, Ordering::SeqCst); match change.event { @@ -232,30 +238,47 @@ pub fn update( ); } (true, true) => { - // TODO: use git attributes here to know if it's a binary file or not. - if let Some(Ok(diff)) = change.event.diff() { - let mut nl = 0; - let tokens = diff.line_tokens(); - let counts = gix::diff::blob::diff( - diff.algo, - &tokens, - gix::diff::blob::sink::Counter::default(), - ); - nl += counts.insertions as usize - + counts.removals as usize; - let lines = LineStats { - added: counts.insertions as usize, - removed: counts.removals as usize, - before: tokens.before.len(), - after: tokens.after.len(), - }; - lines_counter.fetch_add(nl, Ordering::SeqCst); - out.push(FileChange { - relpath: change.location.to_owned(), - mode: FileMode::Modified, - source_relpath: None, - lines: Some(lines), - }); + if let Ok(cache) = + change.diff(&mut diff_cache).map(|p| p.resource_cache) + { + cache + .options + .skip_internal_diff_if_external_is_configured = + false; + if let Ok(prep) = cache.prepare_diff() { + let mut nl = 0; + let tokens = prep.interned_input(); + match prep.operation { + Operation::InternalDiff { algorithm } => { + let counts = gix::diff::blob::diff( + algorithm, + &tokens, + gix::diff::blob::sink::Counter::default( + ), + ); + nl += counts.insertions as usize + + counts.removals as usize; + let lines = LineStats { + added: counts.insertions as usize, + removed: counts.removals as usize, + before: tokens.before.len(), + after: tokens.after.len(), + }; + lines_counter + .fetch_add(nl, Ordering::SeqCst); + out.push(FileChange { + relpath: change.location.to_owned(), + mode: FileMode::Modified, + source_relpath: None, + lines: Some(lines), + }); + } + Operation::ExternalCommand { .. } => { + unreachable!("disabled above") + } + Operation::SourceOrDestinationIsBinary => {} + } + } } } }, diff --git a/gix-index/src/fs.rs b/gix-index/src/fs.rs index 21422f9b804..17abe795cad 100644 --- a/gix-index/src/fs.rs +++ b/gix-index/src/fs.rs @@ -5,7 +5,7 @@ // it's allowed for good measure, in case there are systems that use different types for that. use std::path::Path; -use std::time::{Duration, SystemTime}; +use std::time::SystemTime; /// A structure to partially mirror [`std::fs::Metadata`]. #[cfg(not(windows))] @@ -161,6 +161,7 @@ impl Metadata { } } +#[cfg(not(windows))] fn system_time_from_secs_nanos(secs: u64, nanos: u32) -> SystemTime { - std::time::UNIX_EPOCH + Duration::new(secs, nanos) + std::time::UNIX_EPOCH + std::time::Duration::new(secs, nanos) } diff --git a/gix-status/src/stack.rs b/gix-status/src/stack.rs index 3f3cd8f9797..b97a1c4e139 100644 --- a/gix-status/src/stack.rs +++ b/gix-status/src/stack.rs @@ -34,6 +34,7 @@ impl gix_fs::stack::Delegate for Delegate { Ok(()) } + #[cfg_attr(windows, allow(unused_variables))] fn push(&mut self, is_last_component: bool, stack: &Stack) -> std::io::Result<()> { #[cfg(windows)] { diff --git a/gix/Cargo.toml b/gix/Cargo.toml index 5f566b71a9e..db3dfa225ad 100644 --- a/gix/Cargo.toml +++ b/gix/Cargo.toml @@ -95,7 +95,7 @@ revparse-regex = ["regex", "revision"] ## Make it possible to diff blobs line by line. Note that this feature is integral for implementing tree-diffs as well due to the handling of rename-tracking, ## which relies on line-by-line diffs in some cases. -blob-diff = ["gix-diff/blob"] +blob-diff = ["gix-diff/blob", "attributes"] ## Make it possible to turn a tree into a stream of bytes, which can be decoded to entries and turned into various other formats. worktree-stream = ["gix-worktree-stream", "attributes"] diff --git a/gix/src/object/blob.rs b/gix/src/object/blob.rs index 0c7791e5db7..56644e5b72e 100644 --- a/gix/src/object/blob.rs +++ b/gix/src/object/blob.rs @@ -23,7 +23,7 @@ pub mod diff { pub type Error = gix_diff::blob::platform::set_resource::Error; } - impl<'a, 'new> Platform<'a> { + impl<'a> Platform<'a> { /// Produce a platform for performing various diffs after obtaining the data from a single `tree_change`. pub fn from_tree_change( tree_change: &crate::object::tree::diff::Change<'_, '_, '_>, diff --git a/gix/src/repository/worktree.rs b/gix/src/repository/worktree.rs index 1b48c7c3735..529243896ea 100644 --- a/gix/src/repository/worktree.rs +++ b/gix/src/repository/worktree.rs @@ -79,7 +79,7 @@ impl crate::Repository { let mut cache = self .attributes_only(&index, gix_worktree::stack::state::attributes::Source::IdMapping)? .detach(); - let pipeline = gix_filter::Pipeline::new(repo.command_context()?, crate::filter::Pipeline::options(self)?); + let pipeline = gix_filter::Pipeline::new(self.command_context()?, crate::filter::Pipeline::options(self)?); let objects = self.objects.clone().into_arc().expect("TBD error handling"); let stream = gix_worktree_stream::from_tree( id, diff --git a/gix/tests/object/tree/diff.rs b/gix/tests/object/tree/diff.rs index 5f9cff57648..867b6949ddf 100644 --- a/gix/tests/object/tree/diff.rs +++ b/gix/tests/object/tree/diff.rs @@ -421,6 +421,7 @@ mod track_rewrites { insertions: 1, before: 11, after: 12, + similarity: 0.8888889 }), "by similarity there is a diff" ); @@ -530,6 +531,7 @@ mod track_rewrites { insertions: 3, before: 12, after: 15, + similarity: 0.75 }), "by similarity there is a diff" ); @@ -590,12 +592,12 @@ mod track_rewrites { let out = out.rewrites.expect("tracking enabled"); assert_eq!(stat, None, "similarity can't run"); - assert_eq!(out.num_similarity_checks, 3); + assert_eq!(out.num_similarity_checks, 0); assert_eq!( out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit, 0, "no limit configured" ); - assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 57); + assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 19); Ok(()) } @@ -646,7 +648,7 @@ mod track_rewrites { let out = out.rewrites.expect("tracking enabled"); assert_eq!(out.num_similarity_checks, 0); assert_eq!(out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit, 0); - assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 3); + assert_eq!(out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit, 2); Ok(()) } diff --git a/src/plumbing/progress.rs b/src/plumbing/progress.rs index 21b0ab9af9d..ae12ce8184f 100644 --- a/src/plumbing/progress.rs +++ b/src/plumbing/progress.rs @@ -500,17 +500,9 @@ static GIT_CONFIG: &[Record] = &[ config: "transfer.credentialsInUrl", usage: Planned { note: Some("currently we are likely to expose passwords in errors or in other places, and it's better to by default not do that") } }, - Record { - config: "diff.*.textconv", - usage: Planned { note: None } - }, Record { config: "diff.*.cachetextconv", - usage: Planned { note: None } - }, - Record { - config: "diff.*.command", - usage: Planned { note: None } + usage: NotPlanned {reason: "It seems to slow to do that, and persisting results to save a relatively cheap computation doesn't seem right"} }, ];