From d22965f8d8e67dbc3bb6c7301a12d8b613261bea Mon Sep 17 00:00:00 2001 From: Bart Dubbeldam Date: Sun, 27 Apr 2025 14:11:24 +0200 Subject: [PATCH 1/2] Refactoring of the blame operation logic This commit includes a major overhaul of the blame operation logic. A new `BlameState` struct and a `BlameProcessor` struct were introduced to cleanly encapsulate different parts of the operation. The `BlameState` struct collects information about the blame operation as it is happening, while the `BlameProcessor` struct carries the bulk of the blame operation logic. --- gix-blame/src/error.rs | 2 + gix-blame/src/file/function.rs | 773 ++++++++++++++++++++------------- 2 files changed, 468 insertions(+), 307 deletions(-) diff --git a/gix-blame/src/error.rs b/gix-blame/src/error.rs index 979cc8cd6ef..4d50668837d 100644 --- a/gix-blame/src/error.rs +++ b/gix-blame/src/error.rs @@ -17,6 +17,8 @@ pub enum Error { /// The commit whose tree didn't contain `file_path`. commit_id: gix_hash::ObjectId, }, + #[error("No `BlameState` found, so no state to start blaming from")] + BlameStateNotSet, #[error("Couldn't find commit or tree in the object database")] FindObject(#[from] gix_object::find::Error), #[error("Could not find existing blob or commit")] diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index 05293053231..6f33b7b2fd9 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -15,7 +15,7 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file /// at `suspect:` originated in. /// -/// ## Paramters +/// ## Parameters /// /// * `odb` /// - Access to database objects, also for used for diffing. @@ -26,10 +26,8 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// - Optionally, the commitgraph cache. /// * `file_path` /// - A *slash-separated* worktree-relative path to the file to blame. -/// * `range` -/// - A 1-based inclusive range, in order to mirror `git`’s behaviour. `Some(20..40)` represents -/// 21 lines, spanning from line 20 up to and including line 40. This will be converted to -/// `19..40` internally as the algorithm uses 0-based ranges that are exclusive at the end. +/// * `options` +/// - Options to control the blame operation. /// * `resource_cache` /// - Used for diffing trees. /// @@ -49,7 +47,7 @@ use crate::{BlameEntry, Error, Options, Outcome, Statistics}; /// /// - get the commit /// - walk through its parents -/// - for each parent, do a diff and mark lines that don’t have a suspect yet (this is the term +/// - for each parent, do a diff and mark lines that don't have a suspect yet (this is the term /// used in `libgit2`), but that have been changed in this commit /// /// The algorithm in `libgit2` works by going through parents and keeping a linked list of blame @@ -71,307 +69,9 @@ pub fn file( ) -> Result { let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect); - let mut stats = Statistics::default(); - let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); - let blamed_file_entry_id = find_path_entry_in_commit( - &odb, - &suspect, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )? - .ok_or_else(|| Error::FileMissing { - file_path: file_path.to_owned(), - commit_id: suspect, - })?; - let blamed_file_blob = odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec(); - let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32; - - // Binary or otherwise empty? - if num_lines_in_blamed == 0 { - return Ok(Outcome::default()); - } - - let ranges = options.range.to_zero_based_exclusive(num_lines_in_blamed)?; - let mut hunks_to_blame = Vec::with_capacity(ranges.len()); - - for range in ranges { - hunks_to_blame.push(UnblamedHunk { - range_in_blamed_file: range.clone(), - suspects: [(suspect, range)].into(), - }); - } - - let (mut buf, mut buf2) = (Vec::new(), Vec::new()); - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; - let mut queue: gix_revwalk::PriorityQueue = gix_revwalk::PriorityQueue::new(); - queue.insert(commit_time(commit)?, suspect); - - let mut out = Vec::new(); - let mut diff_state = gix_diff::tree::State::default(); - let mut previous_entry: Option<(ObjectId, ObjectId)> = None; - 'outer: while let Some(suspect) = queue.pop_value() { - stats.commits_traversed += 1; - if hunks_to_blame.is_empty() { - break; - } - - let is_still_suspect = hunks_to_blame.iter().any(|hunk| hunk.has_suspect(&suspect)); - if !is_still_suspect { - // There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with - // the next one. - continue 'outer; - } - - let commit = find_commit(cache.as_ref(), &odb, &suspect, &mut buf)?; - let commit_time = commit_time(commit)?; - - if let Some(since) = options.since { - if commit_time < since.seconds { - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - - continue; - } - } - - let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref(), &mut buf2)?; - - if parent_ids.is_empty() { - if queue.is_empty() { - // I’m not entirely sure if this is correct yet. `suspect`, at this point, is the - // `id` of the last `item` that was yielded by `queue`, so it makes sense to assign - // the remaining lines to it, even though we don’t explicitly check whether that is - // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against - // an empty tree to validate this assumption. - if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - } - // There is more, keep looking. - continue; - } - - let mut entry = previous_entry - .take() - .filter(|(id, _)| *id == suspect) - .map(|(_, entry)| entry); - if entry.is_none() { - entry = find_path_entry_in_commit( - &odb, - &suspect, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )?; - } - - let Some(entry_id) = entry else { - continue; - }; - - // This block asserts that, for every `UnblamedHunk`, all lines in the *Blamed File* are - // identical to the corresponding lines in the *Source File*. - #[cfg(debug_assertions)] - { - let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec(); - let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100); - let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob) - .tokenize() - .map(|token| source_interner.intern(token)) - .collect(); - - let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100); - let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob) - .tokenize() - .map(|token| blamed_interner.intern(token)) - .collect(); - - for hunk in hunks_to_blame.iter() { - if let Some(range_in_suspect) = hunk.get_range(&suspect) { - let range_in_blamed_file = hunk.range_in_blamed_file.clone(); - - for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) { - let source_token = source_lines_as_tokens[source_line_number as usize]; - let blame_token = blamed_lines_as_tokens[blamed_line_number as usize]; - - let source_line = BString::new(source_interner[source_token].into()); - let blamed_line = BString::new(blamed_interner[blame_token].into()); - - assert_eq!(source_line, blamed_line); - } - } - } - } - - for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { - if let Some(parent_entry_id) = find_path_entry_in_commit( - &odb, - parent_id, - file_path, - cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, - )? { - let no_change_in_entry = entry_id == parent_entry_id; - if pid == 0 { - previous_entry = Some((*parent_id, parent_entry_id)); - } - if no_change_in_entry { - pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame); - queue.insert(*parent_commit_time, *parent_id); - continue 'outer; - } - } - } - - let more_than_one_parent = parent_ids.len() > 1; - for (parent_id, parent_commit_time) in parent_ids { - queue.insert(parent_commit_time, parent_id); - let changes_for_file_path = tree_diff_at_file_path( - &odb, - file_path, - suspect, - parent_id, - cache.as_ref(), - &mut stats, - &mut diff_state, - &mut buf, - &mut buf2, - &mut buf3, - )?; - let Some(modification) = changes_for_file_path else { - if more_than_one_parent { - // None of the changes affected the file we’re currently blaming. - // Copy blame to parent. - for unblamed_hunk in &mut hunks_to_blame { - unblamed_hunk.clone_blame(suspect, parent_id); - } - } else { - pass_blame_from_to(suspect, parent_id, &mut hunks_to_blame); - } - continue; - }; - - match modification { - gix_diff::tree::recorder::Change::Addition { .. } => { - if more_than_one_parent { - // Do nothing under the assumption that this always (or almost always) - // implies that the file comes from a different parent, compared to which - // it was modified, not added. - } else if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) { - break 'outer; - } - } - gix_diff::tree::recorder::Change::Deletion { .. } => { - unreachable!("We already found file_path in suspect^{{tree}}, so it can't be deleted") - } - gix_diff::tree::recorder::Change::Modification { previous_oid, oid, .. } => { - let changes = blob_changes( - &odb, - resource_cache, - oid, - previous_oid, - file_path, - options.diff_algorithm, - &mut stats, - )?; - hunks_to_blame = process_changes(hunks_to_blame, changes, suspect, parent_id); - } - } - } - - hunks_to_blame.retain_mut(|unblamed_hunk| { - if unblamed_hunk.suspects.len() == 1 { - if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) { - // At this point, we have copied blame for every hunk to a parent. Hunks - // that have only `suspect` left in `suspects` have not passed blame to any - // parent, and so they can be converted to a `BlameEntry` and moved to - // `out`. - out.push(entry); - return false; - } - } - unblamed_hunk.remove_blame(suspect); - true - }); - - // This block asserts that line ranges for each suspect never overlap. If they did overlap - // this would mean that the same line in a *Source File* would map to more than one line in - // the *Blamed File* and this is not possible. - #[cfg(debug_assertions)] - { - let ranges = hunks_to_blame.iter().fold( - std::collections::BTreeMap::>>::new(), - |mut acc, hunk| { - for (suspect, range) in hunk.suspects.clone() { - acc.entry(suspect).or_default().push(range); - } - - acc - }, - ); - - for (_, mut ranges) in ranges { - ranges.sort_by(|a, b| a.start.cmp(&b.start)); - - for window in ranges.windows(2) { - if let [a, b] = window { - assert!(a.end <= b.start, "#{hunks_to_blame:#?}"); - } - } - } - } - } - - debug_assert_eq!( - hunks_to_blame, - vec![], - "only if there is no portion of the file left we have completed the blame" - ); - - // I don’t know yet whether it would make sense to use a data structure instead that preserves - // order on insertion. - out.sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); - Ok(Outcome { - entries: coalesce_blame_entries(out), - blob: blamed_file_blob, - statistics: stats, - }) -} - -/// Pass ownership of each unblamed hunk of `from` to `to`. -/// -/// This happens when `from` didn't actually change anything in the blamed file. -fn pass_blame_from_to(from: ObjectId, to: ObjectId, hunks_to_blame: &mut Vec) { - for unblamed_hunk in hunks_to_blame { - unblamed_hunk.pass_blame(from, to); - } -} - -/// Convert each of the unblamed hunk in `hunks_to_blame` into a [`BlameEntry`], consuming them in the process. -/// -/// Return `true` if we are done because `hunks_to_blame` is empty. -fn unblamed_to_out_is_done( - hunks_to_blame: &mut Vec, - out: &mut Vec, - suspect: ObjectId, -) -> bool { - let mut without_suspect = Vec::new(); - out.extend(hunks_to_blame.drain(..).filter_map(|hunk| { - BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| { - without_suspect.push(hunk); - None - }) - })); - *hunks_to_blame = without_suspect; - hunks_to_blame.is_empty() + let mut processor = BlameProcessor::new(&odb, cache, resource_cache, file_path, options)?; + processor.set_blame_state(suspect)?; + processor.process_blame() } /// This function merges adjacent blame entries. It merges entries that are adjacent both in the @@ -699,3 +399,462 @@ fn collect_parents( pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource { gix_diff::blob::sources::byte_lines_with_terminator(data) } + +/// State maintained during the blame operation +struct BlameState { + /// The current state of unblamed hunks that still need processing + hunks_to_blame: Vec, + /// The final blame entries that have been processed + out: Vec, + /// The content of the file being blamed + blamed_file_blob: Vec, + /// Statistics gathered during the blame operation + stats: Statistics, + /// Priority queue for processing commits in chronological order + queue: gix_revwalk::PriorityQueue, + /// Cache for the previous entry to avoid redundant lookups + previous_entry: Option<(ObjectId, ObjectId)>, +} + +struct BlameProcessor<'a, T: gix_object::Find + gix_object::FindHeader> { + odb: &'a T, + commit_graph_cache: Option, + resource_cache: &'a mut gix_diff::blob::Platform, + file_path: &'a BStr, + options: Options, + blame_state: Option, +} + +impl<'a, T: gix_object::Find + gix_object::FindHeader> BlameProcessor<'a, T> { + fn new( + odb: &'a T, + commit_graph_cache: Option, + resource_cache: &'a mut gix_diff::blob::Platform, + file_path: &'a BStr, + options: Options, + ) -> Result { + Ok(Self { + odb, + commit_graph_cache, + resource_cache, + file_path, + blame_state: None, + options, + }) + } + + fn set_blame_state(&mut self, suspect: ObjectId) -> Result<(), Error>{ + let blame_state = self.create_blame_state(suspect)?; + self.blame_state = Some(blame_state); + Ok(()) + } + + /// Set a new BlameState for the given suspect commit + fn create_blame_state(&self, suspect: ObjectId) -> Result { + let (mut buf, mut buf2) = (Vec::new(), Vec::new()); + let mut stats = Statistics::default(); + let blamed_file_entry_id = find_path_entry_in_commit( + self.odb, + &suspect, + self.file_path, + self.commit_graph_cache.as_ref(), + &mut buf, + &mut buf2, + &mut stats, + )? + .ok_or_else(|| Error::FileMissing { + file_path: self.file_path.to_owned(), + commit_id: suspect, + })?; + + let blamed_file_blob = self.odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec(); + let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32; + + // Binary or otherwise empty? + if num_lines_in_blamed == 0 { + // Create a [`BlameState`] with an empty range. + // This way we won't have `UnblamedHunk` and return early in `process_blame()`. + let empty_range = 0..0; + return Ok(BlameState::new(blamed_file_blob, vec![empty_range], suspect, 0, Some(stats))); + } + + let ranges_in_blamed_file = self + .options + .range + .to_zero_based_exclusive(num_lines_in_blamed) + .unwrap_or_default(); + + let commit = find_commit(self.commit_graph_cache.as_ref(), self.odb, &suspect, &mut buf)?; + let initial_commit_time = commit_time(commit)?; + + Ok(BlameState::new( + blamed_file_blob, + ranges_in_blamed_file, + suspect, + initial_commit_time, + Some(stats), + )) + } + + fn process_blame(self) -> Result { + let mut state = match self.blame_state{ + Some(state) => state, + None => { + return Err(Error::BlameStateNotSet); + } + }; + + let mut diff_state = gix_diff::tree::State::default(); + let (mut buf, mut buf2, mut buf3) = (Vec::new(), Vec::new(), Vec::new()); + + 'outer: while let Some(suspect) = state.queue.pop_value() { + state.stats.commits_traversed += 1; + if !state.has_unblamed_hunks() { + break; + } + + let is_still_suspect = state.hunks_to_blame.iter().any(|hunk| hunk.has_suspect(&suspect)); + if !is_still_suspect { + // There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with + // the next one. + continue 'outer; + } + + let commit = find_commit(self.commit_graph_cache.as_ref(), self.odb, &suspect, &mut buf)?; + let commit_time = commit_time(commit)?; + + if let Some(since) = self.options.since { + if commit_time < since.seconds { + if state.has_unblamed_hunks() && state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + continue; + } + } + + let parent_ids: ParentIds = collect_parents(commit, self.odb, self.commit_graph_cache.as_ref(), &mut buf2)?; + + if parent_ids.is_empty() { + if state.queue.is_empty() { + // TODO I'm not entirely sure if this is correct yet. `suspect`, at this point, is the + // `id` of the last `item` that was yielded by `queue`, so it makes sense to assign + // the remaining lines to it, even though we don't explicitly check whether that is + // true here. We could perhaps use diff-tree-to-tree to compare `suspect` against + // an empty tree to validate this assumption. + if state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + } + // There is more, keep looking. + continue; + } + + let mut entry = state + .previous_entry + .take() + .filter(|(id, _)| *id == suspect) + .map(|(_, entry)| entry); + if entry.is_none() { + entry = find_path_entry_in_commit( + self.odb, + &suspect, + self.file_path, + self.commit_graph_cache.as_ref(), + &mut buf, + &mut buf2, + &mut state.stats, + )?; + } + + let Some(entry_id) = entry else { + continue; + }; + + #[cfg(debug_assertions)] + state.assert_hunk_and_blame_ranges_identical(self.odb, entry_id, &mut buf)?; + + if state.check_for_unchanged_parents( + self.odb, + self.commit_graph_cache.as_ref(), + self.file_path, + entry_id, + suspect, + &parent_ids, + &mut buf, + &mut buf2, + )? { + continue 'outer; + } + + let more_than_one_parent = parent_ids.len() > 1; + for (parent_id, parent_commit_time) in parent_ids { + state.queue.insert(parent_commit_time, parent_id); + let changes_for_file_path = tree_diff_at_file_path( + self.odb, + self.file_path, + suspect, + parent_id, + self.commit_graph_cache.as_ref(), + &mut state.stats, + &mut diff_state, + &mut buf, + &mut buf2, + &mut buf3, + )?; + + let Some(modification) = changes_for_file_path else { + if more_than_one_parent { + // None of the changes affected the file we're currently blaming. + // Copy blame to parent + state.clone_blame(suspect, parent_id); + } else { + state.pass_blame(suspect, parent_id); + } + continue; + }; + + match modification { + gix_diff::tree::recorder::Change::Addition { .. } => { + if more_than_one_parent { + // Do nothing under the assumption that this always (or almost always) + // implies that the file comes from a different parent, compared to which + // it was modified, not added. + } else if state.unblamed_to_out_is_done(suspect) { + break 'outer; + } + } + gix_diff::tree::recorder::Change::Deletion { .. } => { + unreachable!("We already found file_path in suspect^{{tree}}, so it can't be deleted") + } + gix_diff::tree::recorder::Change::Modification { previous_oid, oid, .. } => { + let changes = blob_changes( + self.odb, + self.resource_cache, + oid, + previous_oid, + self.file_path, + self.options.diff_algorithm, + &mut state.stats, + )?; + state.hunks_to_blame = process_changes(state.hunks_to_blame, changes, suspect, parent_id); + } + } + } + + state.process_completed_hunks(suspect); + } + + #[cfg(debug_assertions)] + state.assert_no_overlapping_ranges(); + + Ok(state.finalize()) + } +} + +impl BlameState { + /// Create a new BlameState with initial values + fn new( + blamed_file_blob: Vec, + initial_ranges: Vec>, + initial_suspect: ObjectId, + initial_commit_time: CommitTime, + statistics: Option, + ) -> Self { + let mut queue = gix_revwalk::PriorityQueue::new(); + queue.insert(initial_commit_time, initial_suspect); + + let mut hunks_to_blame = Vec::with_capacity(initial_ranges.len()); + + for range in initial_ranges { + hunks_to_blame.push(UnblamedHunk { + range_in_blamed_file: range.clone(), + suspects: [(initial_suspect, range)].into(), + }); + } + + Self { + hunks_to_blame, + out: Vec::new(), + blamed_file_blob, + stats: statistics.unwrap_or_default(), + queue, + previous_entry: None, + } + } + + /// Check if there are any remaining hunks to process + fn has_unblamed_hunks(&self) -> bool { + !self.hunks_to_blame.is_empty() + } + + /// Convert each of the unblamed hunk in `hunks_to_blame` into a [`BlameEntry`], consuming them in the process. + /// + /// Return `true` if we are done because `hunks_to_blame` is empty. + fn unblamed_to_out_is_done(&mut self, suspect: ObjectId) -> bool { + let mut without_suspect = Vec::new(); + self.out.extend(self.hunks_to_blame.drain(..).filter_map(|hunk| { + BlameEntry::from_unblamed_hunk(&hunk, suspect).or_else(|| { + without_suspect.push(hunk); + None + }) + })); + self.hunks_to_blame = without_suspect; + self.hunks_to_blame.is_empty() + } + + /// Process completed hunks for a suspect, moving them to the output + fn process_completed_hunks(&mut self, suspect: ObjectId) { + self.hunks_to_blame.retain_mut(|unblamed_hunk| { + if unblamed_hunk.suspects.len() == 1 { + if let Some(entry) = BlameEntry::from_unblamed_hunk(unblamed_hunk, suspect) { + // At this point, we have copied blame for every hunk to a parent. Hunks + // that have only `suspect` left in `suspects` have not passed blame to any + // parent, and so they can be converted to a `BlameEntry` and moved to + // `out`. + self.out.push(entry); + return false; + } + } + unblamed_hunk.remove_blame(suspect); + true + }); + } + + /// Pass blame from one commit to another for all hunks + fn pass_blame(&mut self, from: ObjectId, to: ObjectId) { + for unblamed_hunk in &mut self.hunks_to_blame { + unblamed_hunk.pass_blame(from, to); + } + } + + /// Clone blame from one commit to another for all hunks + fn clone_blame(&mut self, from: ObjectId, to: ObjectId) { + for unblamed_hunk in &mut self.hunks_to_blame { + unblamed_hunk.clone_blame(from, to); + } + } + + /// Finalize the blame state and return the outcome + fn finalize(mut self) -> Outcome { + debug_assert_eq!( + self.hunks_to_blame, + vec![], + "only if there is no portion of the file left we have completed the blame" + ); + + // TODO I don't know yet whether it would make sense to use a data structure instead that preserves + // order on insertion. + self.out + .sort_by(|a, b| a.start_in_blamed_file.cmp(&b.start_in_blamed_file)); + Outcome { + entries: coalesce_blame_entries(self.out), + blob: self.blamed_file_blob, + statistics: self.stats, + } + } + + /// Check if any parent has the exact same file content and if so, pass blame to that parent. + /// This is an optimization to avoid unnecessary diffing. + /// Returns true if an unchanged parent was found and handled. + #[allow(clippy::too_many_arguments)] + fn check_for_unchanged_parents( + &mut self, + odb: &impl gix_object::Find, + commit_graph_cache: Option<&gix_commitgraph::Graph>, + file_path: &BStr, + entry_id: ObjectId, + suspect: ObjectId, + parent_ids: &[(ObjectId, i64)], + buf: &mut Vec, + buf2: &mut Vec, + ) -> Result { + for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() { + if let Some(parent_entry_id) = find_path_entry_in_commit( + odb, + parent_id, + file_path, + commit_graph_cache, + buf, + buf2, + &mut self.stats, + )? { + let no_change_in_entry = entry_id == parent_entry_id; + if pid == 0 { + self.previous_entry = Some((*parent_id, parent_entry_id)); + } + if no_change_in_entry { + self.pass_blame(suspect, *parent_id); + self.queue.insert(*parent_commit_time, *parent_id); + return Ok(true); + } + } + } + Ok(false) + } + + /// Assert that line ranges for each suspect never overlap + /// This block asserts that line ranges for each suspect never overlap. If they did overlap + // this would mean that the same line in a *Source File* would map to more than one line in + // the *Blamed File* and this is not possible. + #[cfg(debug_assertions)] + fn assert_no_overlapping_ranges(&self) { + let ranges = self.hunks_to_blame.iter().fold( + std::collections::BTreeMap::>>::new(), + |mut acc, hunk| { + for (suspect, range) in hunk.suspects.clone() { + acc.entry(suspect).or_default().push(range); + } + acc + }, + ); + + for (_, mut ranges) in ranges { + ranges.sort_by(|a, b| a.start.cmp(&b.start)); + for window in ranges.windows(2) { + if let [a, b] = window { + assert!(a.end <= b.start, "#{:?}", self.hunks_to_blame); + } + } + } + } + + /// Assert that for every UnblamedHunk, all lines in the Blamed File are identical + /// to the corresponding lines in the Source File. + #[cfg(debug_assertions)] + fn assert_hunk_and_blame_ranges_identical( + &self, + odb: &impl gix_object::Find, + entry_id: ObjectId, + buf: &mut Vec, + ) -> Result<(), Error> { + let source_blob = odb.find_blob(&entry_id, buf)?.data.to_vec(); + let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100); + let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob) + .tokenize() + .map(|token| source_interner.intern(token)) + .collect(); + + let mut blamed_interner = gix_diff::blob::intern::Interner::new(self.blamed_file_blob.len() / 100); + let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&self.blamed_file_blob) + .tokenize() + .map(|token| blamed_interner.intern(token)) + .collect(); + + for hunk in self.hunks_to_blame.iter() { + if let Some(range_in_suspect) = hunk.get_range(&entry_id) { + let range_in_blamed_file = hunk.range_in_blamed_file.clone(); + + for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) { + let source_token = source_lines_as_tokens[source_line_number as usize]; + let blame_token = blamed_lines_as_tokens[blamed_line_number as usize]; + + let source_line = BString::new(source_interner[source_token].into()); + let blamed_line = BString::new(blamed_interner[blame_token].into()); + + assert_eq!(source_line, blamed_line); + } + } + } + Ok(()) + } +} From cba93ab4ae73c8c6db3ddb04086933b3c688e6ba Mon Sep 17 00:00:00 2001 From: Bart Dubbeldam Date: Sun, 27 Apr 2025 19:45:00 +0200 Subject: [PATCH 2/2] feat: Implement blame continuation from an existing checkpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In this commit, we introduce the ability to execute a blame operation from a previously generated checkpoint. This functionality makes the computation of incremental blame operations more efficient by reusing already computed information. The refactoring performed in commit id d22965f8 allowed effortless integration of this feature into the existing blame processing algorithm. The fundamental distinction between a regular blame and a blame operation from a checkpoint is that in the latter, we can pre-fill the `BlameState` with some `BlameEntry` instances, thereby reducing the number of `UnblamedHunk` instances we need to process. This update’s algorithm for incorporating the detected modifications since the checkpoint into the blame entries is encapsulated in the `update_checkpoint_blames_with_changes` method. The newly introduced `BlameCheckpoint` type is a public type that users can utilize to establish a checkpoint state for their blame operations. --- gix-blame/src/file/checkpoint.rs | 228 +++++++++++++++++ gix-blame/src/file/function.rs | 172 +++++++++++-- gix-blame/src/file/mod.rs | 1 + gix-blame/src/file/tests.rs | 259 ++++++++++++++++++++ gix-blame/src/lib.rs | 4 +- gix-blame/src/types.rs | 17 ++ gix-blame/tests/blame.rs | 74 +++++- gix-blame/tests/fixtures/make_blame_repo.sh | 26 +- 8 files changed, 757 insertions(+), 24 deletions(-) create mode 100644 gix-blame/src/file/checkpoint.rs diff --git a/gix-blame/src/file/checkpoint.rs b/gix-blame/src/file/checkpoint.rs new file mode 100644 index 00000000000..2a55434d432 --- /dev/null +++ b/gix-blame/src/file/checkpoint.rs @@ -0,0 +1,228 @@ +use crate::types::{Change, UnblamedHunk}; +use crate::{BlameCheckpoint, BlameEntry}; +use gix_hash::ObjectId; +use std::num::NonZeroU32; +use std::ops::Range; + +/// Updates blame entries from a checkpoint with newly detected changes. +/// +/// # Arguments +/// * `checkpoint` - Previous checkpoint with blame entries +/// * `changes` - Changes since the checkpoint +/// * `suspect` - Commit ID being investigated +/// +/// # Returns +/// Updated blame entries and new hunks to blame +pub(crate) fn update_checkpoint_blames_with_changes( + checkpoint: BlameCheckpoint, + changes: Vec, + suspect: ObjectId, +) -> (Vec, Vec) { + fn blame_fully_contained_by_change( + blame_lines: &BlameLines, + blame: &BlameEntry, + change_lines: &ChangeLines, + change: &Change, + ) -> bool { + blame_lines.get_remaining(blame) < change_lines.get_remaining(change) + } + + let mut updated_blames = Vec::new(); + let mut new_hunks_to_blame = Vec::new(); + + let mut blame_iter = checkpoint.entries.into_iter().peekable(); + + // This nested loop iterates through changes and blame entries in parallel, tracking how many + // lines have been processed in each. For each change type: + // - Unchanged: Keep the original blame but adjust line numbers + // - Deleted: Remove blame entries for deleted lines + // - Added/Replaced: Create new hunks to be blamed later + // The tracking ensures we correctly handle partial overlaps between changes and blame entries. + 'change: for change in changes { + let mut change_assigned = ChangeLines::default(); + while let Some(blame) = blame_iter.peek_mut() { + let mut blame_assigned = BlameLines::default(); + + // For each of the three cases we have to check if the blame is fully contained by the change. + // If so we can update the blame with the remaining length of the blame. + // If not we have to update the blame with the remaining length of the change. + match change { + Change::Unchanged(ref range) => { + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + updated_blames.push(BlameEntry { + start_in_blamed_file: range.start + change_assigned.assigned.get_assigned(), + start_in_source_file: blame.start_in_source_file, + len: blame.len, + commit_id: blame.commit_id, + }); + + change_assigned.assigned.add_assigned(blame.len.get()); + blame_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + updated_blames.push(BlameEntry { + start_in_blamed_file: range.start + change_assigned.assigned.get_assigned(), + start_in_source_file: blame.start_in_source_file, + len: NonZeroU32::new(change_assigned.get_remaining(&change)).unwrap(), + commit_id: blame.commit_id, + }); + + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + Change::Deleted(_start_deletion, _lines_deleted) => { + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + blame_assigned.assigned.add_assigned(blame.len.get()); + change_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + Change::AddedOrReplaced(ref range, lines_deleted) => { + let new_unblamed_hunk = |range: &Range, suspect: ObjectId| UnblamedHunk { + range_in_blamed_file: range.clone(), + suspects: [(suspect, range.clone())].into(), + }; + match blame_fully_contained_by_change(&blame_assigned, blame, &change_assigned, &change) { + true => { + if lines_deleted == 0 { + new_hunks_to_blame.push(new_unblamed_hunk(range, suspect)); + } + + change_assigned.assigned.add_assigned(blame.len.get()); + blame_assigned.assigned.add_assigned(blame.len.get()); + } + false => { + new_hunks_to_blame.push(new_unblamed_hunk(range, suspect)); + + blame_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + change_assigned + .assigned + .add_assigned(change_assigned.get_remaining(&change)); + } + } + } + } + + // Check if the blame or the change is fully assigned. + // If the blame is fully assigned we can continue with the next blame. + // If the change is fully assigned we can continue with the next change. + // Since we have a mutable reference to the blame we can update it and reset the assigned blame lines. + // If both are fully assigned we can continue with the next blame and change. + match ( + blame_assigned.has_remaining(blame), + change_assigned.has_remaining(&change), + ) { + (true, true) => { + // Both have remaining + blame.update_blame(&blame_assigned.assigned); + } + (true, false) => { + // Change is fully assigned + blame.update_blame(&blame_assigned.assigned); + continue 'change; + } + (false, true) => { + // Blame is fully assigned + blame_iter.next(); + } + (false, false) => { + // Both are fully assigned + blame_iter.next(); + continue 'change; + } + } + } + } + (updated_blames, new_hunks_to_blame) +} + +impl BlameEntry { + /// Updates the blame entry's line numbers by the given offset. + /// + /// This is used when processing changes to adjust the line numbers in the blame entry + /// to account for lines that have already been processed. It updates: + /// * The starting line in the blamed file + /// * The starting line in the source file + /// * The length of the blame entry + pub(crate) fn update_blame(&mut self, offset: &LinesAssigned) { + self.start_in_blamed_file += offset.get_assigned(); + self.start_in_source_file += offset.get_assigned(); + self.len = NonZeroU32::new(u32::from(self.len) - offset.get_assigned()).unwrap(); + } +} + +/// Tracks the number of lines processed during blame updates +#[derive(Debug, Default)] +pub(crate) struct LinesAssigned { + lines_assigned: u32, +} + +impl LinesAssigned { + /// Add lines to the count + fn add_assigned(&mut self, lines: u32) { + self.lines_assigned += lines; + } + + /// Get current count + fn get_assigned(&self) -> u32 { + self.lines_assigned + } +} + +/// Tracks line assignments for blame entries +#[derive(Debug, Default)] +struct BlameLines { + assigned: LinesAssigned, +} + +impl BlameLines { + /// Calculate remaining lines in a blame entry + fn get_remaining(&self, blame: &BlameEntry) -> u32 { + blame.len.get() - self.assigned.get_assigned() + } + + /// Check if any lines remain + fn has_remaining(&self, blame: &BlameEntry) -> bool { + self.get_remaining(blame) > 0 + } +} + +/// Tracks line assignments for changes +#[derive(Debug, Default)] +struct ChangeLines { + assigned: LinesAssigned, +} + +impl ChangeLines { + /// Calculate remaining lines in a change + fn get_remaining(&self, change: &Change) -> u32 { + match &change { + Change::Unchanged(range) => range.len() as u32 - self.assigned.get_assigned(), + Change::AddedOrReplaced(_, deleted_in_before) => *deleted_in_before - self.assigned.get_assigned(), + Change::Deleted(_, deleted_in_before) => *deleted_in_before - self.assigned.get_assigned(), + } + } + + /// Check if any lines remain + fn has_remaining(&self, change: &Change) -> bool { + self.get_remaining(change) > 0 + } +} diff --git a/gix-blame/src/file/function.rs b/gix-blame/src/file/function.rs index 6f33b7b2fd9..6e533a0877a 100644 --- a/gix-blame/src/file/function.rs +++ b/gix-blame/src/file/function.rs @@ -10,7 +10,8 @@ use gix_traverse::commit::find as find_commit; use smallvec::SmallVec; use super::{process_changes, Change, UnblamedHunk}; -use crate::{BlameEntry, Error, Options, Outcome, Statistics}; +use crate::file::checkpoint::update_checkpoint_blames_with_changes; +use crate::{BlameCheckpoint, BlameEntry, Error, Options, Outcome, Statistics}; /// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file /// at `suspect:` originated in. @@ -74,6 +75,40 @@ pub fn file( processor.process_blame() } +/// Process blame information starting from a checkpoint. +/// +/// Uses the same core blame algorithm as `file()` but continues from a previously provided checkpoint, +/// allowing for incremental blame operations by reusing previously computed information. +/// +/// # Parameters +/// +/// * `odb` - Access to database objects, used for diffing and object lookup +/// * `suspect` - The commit to start blame processing from +/// * `cache` - Optional commitgraph cache for performance +/// * `blame_checkpoint` - Previous blame information to start from +/// * `resource_cache` - Cache used for diffing trees +/// * `file_path` - Path to the file being blamed (slash-separated, worktree-relative) +/// * `options` - Options controlling the blame operation +/// +/// # Returns +/// +/// Returns an `Outcome` containing the blame entries and statistics, or an error if the operation fails. +pub fn file_checkpoint( + odb: impl gix_object::Find + gix_object::FindHeader, + suspect: ObjectId, + cache: Option, + blame_checkpoint: BlameCheckpoint, + resource_cache: &mut gix_diff::blob::Platform, + file_path: &BStr, + options: Options, +) -> Result { + let _span = gix_trace::coarse!("gix_blame::file_checkpoint()", ?file_path, ?suspect); + + let mut processor = BlameProcessor::new(&odb, cache, resource_cache, file_path, options)?; + processor.set_blame_state_from_checkpoint(blame_checkpoint, suspect)?; + processor.process_blame() +} + /// This function merges adjacent blame entries. It merges entries that are adjacent both in the /// blamed file and in the source file that introduced them. This follows `git`’s /// behaviour. `libgit2`, as of 2024-09-19, only checks whether two entries are adjacent in the @@ -362,7 +397,7 @@ fn commit_time(commit: gix_traverse::commit::Either<'_, '_>) -> Result; +type ParentIds = SmallVec<[(ObjectId, i64); 2]>; fn collect_parents( commit: gix_traverse::commit::Either<'_, '_>, @@ -401,6 +436,10 @@ pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource } /// State maintained during the blame operation +/// +/// Tracks both the current processing state and final results, including unblamed hunks, +/// completed entries, file content, and statistics. Can be created either from scratch +/// or from a checkpoint. struct BlameState { /// The current state of unblamed hunks that still need processing hunks_to_blame: Vec, @@ -416,6 +455,10 @@ struct BlameState { previous_entry: Option<(ObjectId, ObjectId)>, } +/// Processor for handling blame operations +/// +/// Encapsulates the logic for processing blame operations, whether starting +/// from scratch or continuing from a checkpoint. struct BlameProcessor<'a, T: gix_object::Find + gix_object::FindHeader> { odb: &'a T, commit_graph_cache: Option, @@ -426,6 +469,7 @@ struct BlameProcessor<'a, T: gix_object::Find + gix_object::FindHeader> { } impl<'a, T: gix_object::Find + gix_object::FindHeader> BlameProcessor<'a, T> { + /// Creates a new blame processor with the given parameters fn new( odb: &'a T, commit_graph_cache: Option, @@ -443,39 +487,114 @@ impl<'a, T: gix_object::Find + gix_object::FindHeader> BlameProcessor<'a, T> { }) } - fn set_blame_state(&mut self, suspect: ObjectId) -> Result<(), Error>{ + /// Sets up the initial blame state for processing from scratch + fn set_blame_state(&mut self, suspect: ObjectId) -> Result<(), Error> { let blame_state = self.create_blame_state(suspect)?; self.blame_state = Some(blame_state); Ok(()) } - /// Set a new BlameState for the given suspect commit - fn create_blame_state(&self, suspect: ObjectId) -> Result { - let (mut buf, mut buf2) = (Vec::new(), Vec::new()); - let mut stats = Statistics::default(); - let blamed_file_entry_id = find_path_entry_in_commit( + /// Sets up the initial blame state using information from a checkpoint + fn set_blame_state_from_checkpoint( + &mut self, + blame_checkpoint: BlameCheckpoint, + suspect: ObjectId, + ) -> Result<(), Error> { + let blame_state = self.create_blame_state_from_checkpoint(blame_checkpoint, suspect)?; + self.blame_state = Some(blame_state); + Ok(()) + } + + /// Helper function to find a file entry in a commit and return its blob data + fn find_file_entry_and_blob( + &self, + commit_id: &ObjectId, + stats: &mut Statistics, + buf: &mut Vec, + buf2: &mut Vec, + ) -> Result<(ObjectId, Vec), Error> { + let entry_id = find_path_entry_in_commit( self.odb, - &suspect, + commit_id, self.file_path, self.commit_graph_cache.as_ref(), - &mut buf, - &mut buf2, - &mut stats, + buf, + buf2, + stats, )? .ok_or_else(|| Error::FileMissing { file_path: self.file_path.to_owned(), - commit_id: suspect, + commit_id: *commit_id, })?; - let blamed_file_blob = self.odb.find_blob(&blamed_file_entry_id, &mut buf)?.data.to_vec(); + let blob_data = self.odb.find_blob(&entry_id, buf)?.data.to_vec(); + Ok((entry_id, blob_data)) + } + + /// Creates the initial blame state from a checkpoint + /// + /// Sets up blame state by: + /// 1. Getting file contents from checkpoint and suspect commits + /// 2. Computing and applying changes + /// 3. Setting up remaining hunks for processing + fn create_blame_state_from_checkpoint( + &mut self, + blame_checkpoint: BlameCheckpoint, + suspect: ObjectId, + ) -> Result { + let mut stats = Statistics::default(); + let (mut buf, mut buf2) = (Vec::new(), Vec::new()); + + let (checkpoint_file_entry_id, _) = + self.find_file_entry_and_blob(&blame_checkpoint.checkpoint_commit_id, &mut stats, &mut buf, &mut buf2)?; + + let (blamed_file_entry_id, blamed_file_blob) = + self.find_file_entry_and_blob(&suspect, &mut stats, &mut buf, &mut buf2)?; + + let changes_with_checkpoint = blob_changes( + self.odb, + self.resource_cache, + checkpoint_file_entry_id, + blamed_file_entry_id, + self.file_path, + self.options.diff_algorithm, + &mut stats, + )?; + + let mut blame_state = BlameState::empty(blamed_file_blob, suspect, stats); + + // If there are no changes, we update the blame_state with the checkpoint entries. + // Because the range stays 0..0 we are returning early during processing + if changes_with_checkpoint + .iter() + .all(|change| matches!(change, Change::Unchanged(_))) + { + blame_state.set_out(blame_checkpoint.entries); + return Ok(blame_state); + } + + let (blame_entries, hunks_to_blame) = + update_checkpoint_blames_with_changes(blame_checkpoint, changes_with_checkpoint, suspect); + + blame_state.set_out(blame_entries); + if !hunks_to_blame.is_empty() { + blame_state.set_hunks(hunks_to_blame); + } + Ok(blame_state) + } + + /// Set a new BlameState for the given suspect commit + fn create_blame_state(&self, suspect: ObjectId) -> Result { + let mut stats = Statistics::default(); + let (mut buf, mut buf2) = (Vec::new(), Vec::new()); + + let (_, blamed_file_blob) = self.find_file_entry_and_blob(&suspect, &mut stats, &mut buf, &mut buf2)?; + let num_lines_in_blamed = tokens_for_diffing(&blamed_file_blob).tokenize().count() as u32; // Binary or otherwise empty? if num_lines_in_blamed == 0 { - // Create a [`BlameState`] with an empty range. - // This way we won't have `UnblamedHunk` and return early in `process_blame()`. - let empty_range = 0..0; - return Ok(BlameState::new(blamed_file_blob, vec![empty_range], suspect, 0, Some(stats))); + return Ok(BlameState::empty(blamed_file_blob, suspect, stats)); } let ranges_in_blamed_file = self @@ -497,7 +616,7 @@ impl<'a, T: gix_object::Find + gix_object::FindHeader> BlameProcessor<'a, T> { } fn process_blame(self) -> Result { - let mut state = match self.blame_state{ + let mut state = match self.blame_state { Some(state) => state, None => { return Err(Error::BlameStateNotSet); @@ -682,6 +801,13 @@ impl BlameState { } } + /// Create a BlameState with an empty range, used when there are no lines to blame + /// or when starting from a checkpoint + fn empty(blamed_file_blob: Vec, suspect: ObjectId, stats: Statistics) -> Self { + let empty_range = 0..0; + Self::new(blamed_file_blob, vec![empty_range], suspect, 0, Some(stats)) + } + /// Check if there are any remaining hunks to process fn has_unblamed_hunks(&self) -> bool { !self.hunks_to_blame.is_empty() @@ -702,6 +828,14 @@ impl BlameState { self.hunks_to_blame.is_empty() } + fn set_out(&mut self, out: Vec) { + self.out = out; + } + + fn set_hunks(&mut self, unblamed_hunks: Vec) { + self.hunks_to_blame = unblamed_hunks; + } + /// Process completed hunks for a suspect, moving them to the output fn process_completed_hunks(&mut self, suspect: ObjectId) { self.hunks_to_blame.retain_mut(|unblamed_hunk| { diff --git a/gix-blame/src/file/mod.rs b/gix-blame/src/file/mod.rs index bbca9681837..9528da2cfa2 100644 --- a/gix-blame/src/file/mod.rs +++ b/gix-blame/src/file/mod.rs @@ -6,6 +6,7 @@ use gix_hash::ObjectId; use crate::types::{BlameEntry, Change, Either, LineRange, Offset, UnblamedHunk}; +pub(crate) mod checkpoint; pub(super) mod function; /// Compare a section from the *Blamed File* (`hunk`) with a change from a diff and see if there diff --git a/gix-blame/src/file/tests.rs b/gix-blame/src/file/tests.rs index 02e7d89ebd1..4db06f18964 100644 --- a/gix-blame/src/file/tests.rs +++ b/gix-blame/src/file/tests.rs @@ -29,6 +29,265 @@ fn one_sha() -> ObjectId { ObjectId::from_str("1111111111111111111111111111111111111111").unwrap() } +fn two_sha() -> ObjectId { + use std::str::FromStr; + + ObjectId::from_str("2222222222222222222222222222222222222222").unwrap() +} + +mod blame_cache_to_hunks { + use super::*; + use crate::file::{checkpoint::update_checkpoint_blames_with_changes, BlameEntry, Change, UnblamedHunk}; + use crate::BlameCheckpoint; + + fn single_blame_entry() -> BlameCheckpoint { + BlameCheckpoint { + entries: vec![BlameEntry::new(0..5, 0..5, zero_sha())], + checkpoint_commit_id: zero_sha(), + } + } + + fn multiple_blame_entries() -> BlameCheckpoint { + BlameCheckpoint { + entries: vec![ + BlameEntry::new(0..5, 0..5, zero_sha()), + BlameEntry::new(5..10, 0..5, one_sha()), + ], + checkpoint_commit_id: zero_sha(), + } + } + + #[test] + fn no_changes() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..5)]; + + let expected_blame = single_blame_entry().entries; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_full_blame() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Deleted(0, 5)]; + let expected_blame = vec![]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_partial_first_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Deleted(0, 3), Change::Unchanged(0..2)]; + let expected_blame = vec![BlameEntry::new(0..2, 3..5, zero_sha())]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_partial_first_unchanged() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..2), Change::Deleted(0, 3)]; + let expected_blame = vec![BlameEntry::new(0..2, 0..2, zero_sha())]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn deleted_line_spanning() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..1), Change::Deleted(1, 3), Change::Unchanged(1..2)]; + let expected_blame = vec![ + BlameEntry::new(0..1, 0..1, zero_sha()), + BlameEntry::new(1..2, 4..5, zero_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn add_or_replace_full_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::AddedOrReplaced(0..5, 5)]; + let expected_blame = vec![]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_first_partial_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::AddedOrReplaced(0..5, 3), Change::Unchanged(5..7)]; + let expected_blame = vec![BlameEntry::new(5..7, 3..5, zero_sha())]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_first_unchanged_partial_blame_delete() { + let checkpoint = single_blame_entry(); + let changes = vec![Change::Unchanged(0..3), Change::AddedOrReplaced(0..5, 2)]; + let expected_blame = vec![BlameEntry::new(0..3, 0..3, zero_sha())]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_unchanged_replace_unchanged() { + let checkpoint = single_blame_entry(); + let changes = vec![ + Change::Unchanged(0..2), + Change::AddedOrReplaced(0..5, 1), + Change::Unchanged(7..9), + ]; + let expected_blame = vec![ + BlameEntry::new(0..2, 0..2, zero_sha()), + BlameEntry::new(7..9, 3..5, zero_sha()), + ]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn add_or_replace_no_deletion() { + let checkpoint = single_blame_entry(); + let changes = vec![ + Change::Unchanged(0..2), + Change::AddedOrReplaced(0..5, 0), + Change::Unchanged(6..9), + ]; + let expected_blame = vec![ + BlameEntry::new(0..2, 0..2, zero_sha()), + BlameEntry::new(6..9, 2..5, zero_sha()), + ]; + let expected_unblamed_hunks = vec![UnblamedHunk { + range_in_blamed_file: 0..5, + suspects: [(two_sha(), 0..5)].into(), + }]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } + + #[test] + fn multiple_blames_no_change() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..10)]; + let expected_blame = multiple_blame_entries().entries; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_multiple_lines_first_unchanged() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..6), Change::Deleted(6, 4)]; + let expected_blame = vec![ + BlameEntry::new(0..5, 0..5, zero_sha()), + BlameEntry::new(5..6, 0..1, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_multiple_lines_first_delete() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Deleted(0, 4), Change::Unchanged(0..6)]; + let expected_blame = vec![ + BlameEntry::new(0..1, 4..5, zero_sha()), + BlameEntry::new(1..6, 0..5, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_change_spans_delete_spanning() { + let checkpoint = multiple_blame_entries(); + let changes = vec![Change::Unchanged(0..4), Change::Deleted(4, 4), Change::Unchanged(4..6)]; + let expected_blame = vec![ + BlameEntry::new(0..4, 0..4, zero_sha()), + BlameEntry::new(4..6, 3..5, one_sha()), + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, one_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert!(new_unblamed_hunks.is_empty()); + } + + #[test] + fn multiple_blames_add_or_replace_blame_delete_spanning() { + let checkpoint = multiple_blame_entries(); + let changes = vec![ + Change::Unchanged(0..4), + Change::AddedOrReplaced(4..9, 3), + Change::Unchanged(9..10), + Change::AddedOrReplaced(10..12, 2), + ]; + let expected_blame = vec![ + BlameEntry::new(0..4, 0..4, zero_sha()), + BlameEntry::new(9..10, 2..3, one_sha()), + ]; + let expected_unblamed_hunks = vec![ + UnblamedHunk { + range_in_blamed_file: 4..9, + suspects: [(two_sha(), 4..9)].into(), + }, + UnblamedHunk { + range_in_blamed_file: 10..12, + suspects: [(two_sha(), 10..12)].into(), + }, + ]; + let (updated_blame_entries, new_unblamed_hunks) = + update_checkpoint_blames_with_changes(checkpoint, changes, two_sha()); + assert_eq!(updated_blame_entries, expected_blame); + assert_eq!(new_unblamed_hunks, expected_unblamed_hunks); + } +} + mod process_change { use super::*; use crate::file::{process_change, Change, Offset, UnblamedHunk}; diff --git a/gix-blame/src/lib.rs b/gix-blame/src/lib.rs index aca4299d41c..399c7e1e619 100644 --- a/gix-blame/src/lib.rs +++ b/gix-blame/src/lib.rs @@ -17,7 +17,7 @@ mod error; pub use error::Error; mod types; -pub use types::{BlameEntry, BlameRanges, Options, Outcome, Statistics}; +pub use types::{BlameCheckpoint, BlameEntry, BlameRanges, Options, Outcome, Statistics}; mod file; -pub use file::function::file; +pub use file::function::{file, file_checkpoint}; diff --git a/gix-blame/src/types.rs b/gix-blame/src/types.rs index d60b723e598..184eb5bb523 100644 --- a/gix-blame/src/types.rs +++ b/gix-blame/src/types.rs @@ -312,6 +312,23 @@ impl BlameEntry { } } +/// A checkpoint for resuming blame operations. +/// +/// Contains previously computed blame entries that can be used with `file_checkpoint()` to: +/// - Resume processing from a previous point +/// - Enable incremental updates when history changes +/// - Speed up subsequent operations through caching +/// +/// When used, entries are validated against changes since checkpoint creation +/// and serve as the starting point for remaining blame computation. +#[derive(Debug, PartialEq)] +pub struct BlameCheckpoint { + /// The entries of the cache. + pub entries: Vec, + /// The commit that was blamed to produce these entries. + pub checkpoint_commit_id: ObjectId, +} + pub(crate) trait LineRange { fn shift_by(&self, offset: Offset) -> Self; } diff --git a/gix-blame/tests/blame.rs b/gix-blame/tests/blame.rs index 99e5105f6df..8f14b600ff5 100644 --- a/gix-blame/tests/blame.rs +++ b/gix-blame/tests/blame.rs @@ -1,6 +1,7 @@ use std::path::PathBuf; -use gix_blame::BlameRanges; +use gix_blame::{BlameCheckpoint, BlameRanges}; +use gix_ref::file::ReferenceExt; use gix_hash::ObjectId; use gix_object::bstr; @@ -417,6 +418,77 @@ mod blame_ranges { } } +#[test] +fn blame_with_checkpoint() -> gix_testtools::Result<()> { + let Fixture { + odb, + mut resource_cache, + suspect, + } = Fixture::new()?; + + // Find the checkpoint commit using the tag + let store = gix_ref::file::Store::at( + fixture_path().join(".git"), + gix_ref::store::init::Options { + write_reflog: gix_ref::store::WriteReflog::Disable, + ..Default::default() + }, + ); + let mut checkpoint_ref = gix_ref::file::Store::find(&store, "refs/tags/checkpoint-commit")?; + let checkpoint_commit = checkpoint_ref.peel_to_id_in_place(&store, &odb)?; + assert_ne!( + checkpoint_commit, + suspect, + "should not be the same commit as the suspect"); + + // First run blame on the checkpoint commit + let checkpoint_blame = gix_blame::file( + &odb, + checkpoint_commit, + None, + &mut resource_cache, + "checkpoint-test.txt".into(), + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + range: BlameRanges::default(), + since: None, + }, + )?; + + // Create checkpoint + let checkpoint = BlameCheckpoint { + entries: checkpoint_blame.entries.clone(), + checkpoint_commit_id: checkpoint_commit, + }; + + // Run blame from checkpoint up to suspect commit + let blame_from_checkpoint = gix_blame::file_checkpoint( + &odb, + suspect, + None, + checkpoint, + &mut resource_cache, + "checkpoint-test.txt".into(), + gix_blame::Options { + diff_algorithm: gix_diff::blob::Algorithm::Histogram, + range: BlameRanges::default(), + since: None, + }, + )?; + + // Compare with git's blame output + let git_dir = fixture_path().join(".git"); + let baseline = Baseline::collect(git_dir.join("checkpoint-test.baseline"))?; + assert_eq!(blame_from_checkpoint.entries, baseline); + + // Also verify the checkpoint state matches git's blame at that commit + let checkpoint_baseline = Baseline::collect(git_dir.join("checkpoint-test-initial.baseline"))?; + assert_eq!(checkpoint_blame.entries, checkpoint_baseline); + assert_ne!(blame_from_checkpoint.entries, checkpoint_blame.entries); + + Ok(()) +} + fn fixture_path() -> PathBuf { gix_testtools::scripted_fixture_read_only("make_blame_repo.sh").unwrap() } diff --git a/gix-blame/tests/fixtures/make_blame_repo.sh b/gix-blame/tests/fixtures/make_blame_repo.sh index fe5ca4f1af0..5afcf46d1e0 100755 --- a/gix-blame/tests/fixtures/make_blame_repo.sh +++ b/gix-blame/tests/fixtures/make_blame_repo.sh @@ -6,6 +6,10 @@ git config --local diff.algorithm histogram git config merge.ff false +# Set up committer info with explicit dates +export GIT_COMMITTER_DATE="2024-01-01 12:00:00 +0000" +export GIT_AUTHOR_DATE="2024-01-01 12:00:00 +0000" + git checkout -q -b main echo "line 1" >> simple.txt @@ -53,6 +57,7 @@ echo -e "line 1\nline in between\nline 2" > coalesce-adjacent-hunks.txt git add same-line-changed-twice.txt git add coalesce-adjacent-hunks.txt git commit -q -m c2.4 +git tag c2.4 # Add tag for the commit echo "line 3" >> simple.txt git add simple.txt @@ -128,9 +133,24 @@ cp empty-lines-histogram.txt empty-lines-myers.txt git add empty-lines-histogram.txt empty-lines-myers.txt git commit -q -m c5.4 +# Add test case for checkpoint functionality +echo -e "line 1\nline 2\nline 3" > checkpoint-test.txt +git add checkpoint-test.txt +git commit -q -m c5.5 +git tag checkpoint-commit # Tag this commit for easy reference + +echo -e "line 1\nline 2 modified\nline 3" > checkpoint-test.txt +git add checkpoint-test.txt +git commit -q -m c5.6 + +# Generate baseline for both states +git blame --porcelain checkpoint-test.txt > .git/checkpoint-test.baseline +git checkout checkpoint-commit +git blame --porcelain checkpoint-test.txt > .git/checkpoint-test-initial.baseline + # The commit history created by the commits above this line is linear, it only # contains commits that have exactly one parent. -# Below this line, there’s also commits that have more than one parent. +# Below this line, there's also commits that have more than one parent. echo -e "line 1 original\nline 2\n line 3" > resolved-conflict.txt git add resolved-conflict.txt @@ -200,7 +220,7 @@ git merge branch-that-has-one-of-the-changes || true # This is to verify that, even though commits `c15`, `c15.1` and `merge` are # not chronologically ordered (`c15.1` has a `CommitDate` that is before -# `c15`’s `CommitDate`), the resulting blame is still correct. +# `c15`'s `CommitDate`), the resulting blame is still correct. # # ---c15------c15.2 # \ \ @@ -253,3 +273,5 @@ git blame --porcelain empty-lines-histogram.txt > .git/empty-lines-histogram.bas git config --local diff.algorithm myers git blame --porcelain empty-lines-myers.txt > .git/empty-lines-myers.baseline + +