Skip to content

Commit

Permalink
Faster diff (for some diffs) by skipping pygit2
Browse files Browse the repository at this point in the history
Background:
===========

pygit2 Tree.diff_to_tree is very slow for some large trees.
This seems particularly bad on a repo containing a large table dataset
(many rows, but few columns / small objects)

e.g. with this 43M-feature repo:

```
$ git ls-tree -r c62a4290d7d1d4039ff06afcbe364bac6abf0e69 | wc -l
43482704
```

A small diff (a couple thousand features changed) takes 25s for `kart
diff` to bootstrap. cProfile shows the time all spent in:
```
25.534 {method 'diff_to_tree' of '_pygit2.Tree' objects}
```

I was unable to find anything obvious in pygit2 or libgit2 source code
pertaining to this. e.g. the flags we're passing appear to be the right
ones.

Since `git diff-tree` can produce this diff within ~0.1s I just switched
to using that as a subprocess. The results are identical but much faster
(at least in the above case)

Performance of larger diffs / smaller repos
===========================================

The difference is less stark with other repos.

If I switch to a full diff (`[EMPTY]...HEAD`) I find no noticeable
difference in performance between the pygit2 and git implementations.

If I switch to a repo with less features but larger objects, I find a
much smaller difference than the above.

So to reiterate the win appears to be largest when dealing with repos
with:

* large number of features
* each feature is small
* final diff covers a small fraction of the repo (e.g. 1% or less)

e.g. on [NZ Property
Titles](https://data.linz.govt.nz/layer/50804-nz-property-titles/repository/)
I found a ~25% improvement when asking for a 14477-feature diff (0.6% of
total features)

before:
```
kart diff 'HEAD^^^...HEAD' -o json-lines --no-sort-keys --output
/dev/null
1.89s user 0.21s system 99% cpu 2.111 total
```

After:
```
kart diff 'HEAD^^^...HEAD' -o json-lines --no-sort-keys --output
/dev/null
1.38s user 0.13s system 98% cpu 1.525 total
```
  • Loading branch information
craigds committed Feb 4, 2025
1 parent 20387ca commit 5b5598a
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 10 deletions.
62 changes: 53 additions & 9 deletions kart/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@
import logging
import sys
from collections.abc import Iterable
from typing import Callable, Optional, Protocol, TYPE_CHECKING, ClassVar
from typing import Callable, Optional, Protocol, TYPE_CHECKING, ClassVar, Iterator

import click
import pygit2

from kart import subprocess_util
from kart.core import all_blobs_in_tree, all_blobs_with_paths_in_tree
from kart.diff_format import DiffFormat
from kart.diff_structs import DatasetDiff, DeltaDiff, Delta
from kart.exceptions import InvalidOperation, UNSUPPORTED_VERSION, PATCH_DOES_NOT_APPLY
from kart.key_filters import DatasetKeyFilter, MetaKeyFilter, UserStringKeyFilter
from kart.meta_items import MetaItemFileType, MetaItemVisibility
from kart.raw_diff_delta import RawDiffDelta
from kart.utils import chunk


class BaseDatasetMetaClass(type):
Expand Down Expand Up @@ -189,12 +191,55 @@ def diff_to_working_copy(
# Subclasses to override.
pass

def get_deltas_from_git_diff_for_subtree(
self,
other: Optional["BaseDataset"],
subtree_name: str,
*,
reverse: bool = False,
path_transform: Callable[[str], str] | None = None,
) -> Iterator[RawDiffDelta]:
self_subtree = self.get_subtree(subtree_name)
other_subtree = other.get_subtree(subtree_name) if other else self._empty_tree

if reverse:
self_subtree, other_subtree = other_subtree, self_subtree

# TODO: iterate over 0-terminated 'lines' of output rather than using check_output. Efficiently. How?
output = subprocess_util.check_output(
[
"git",
"diff-tree",
"-r",
"-z",
"--name-status",
str(self_subtree.id),
str(other_subtree.id),
]
)
if not output:
# empty diff
return

pieces = output.split(b"\0")
# diff-tree adds an extra \0 on the end of the output; trim it off
assert len(pieces) % 2 == 1
pieces.pop()

# Split on null bytes and decode each part as utf-8, filtering out empty strings
for status_char, path in chunk(pieces, 2, strict=True):
status_char = status_char.decode("utf8")
path = path.decode("utf8")
if path_transform:
path = path_transform(path)
yield RawDiffDelta.from_git_name_status(status_char, path)

def get_raw_diff_for_subtree(
self,
other: Optional["BaseDataset"],
subtree_name: str,
reverse: bool = False,
):
) -> pygit2.Diff:
"""
Get a pygit2.Diff of the diff between some subtree of this dataset, and the same subtree of another dataset
(typically actually the "same" dataset, but at a different revision).
Expand Down Expand Up @@ -255,12 +300,11 @@ def diff_subtree(
other, key_encoder_method, key_filter, reverse=reverse
)
else:
raw_diff = self.get_raw_diff_for_subtree(
other, subtree_name, reverse=reverse
)
# NOTE - we could potentially call diff.find_similar() to detect renames here
deltas = self.wrap_deltas_from_raw_diff(
raw_diff, lambda path: f"{subtree_name}/{path}"
deltas = self.get_deltas_from_git_diff_for_subtree(
other,
subtree_name,
reverse=reverse,
path_transform=lambda path: f"{subtree_name}/{path}",
)

def _no_dataset_error(method_name):
Expand Down Expand Up @@ -365,7 +409,7 @@ def wrap_deltas_from_raw_diff(
self,
raw_diff: pygit2.Diff,
path_transform: Callable[[str], str],
):
) -> Iterator[RawDiffDelta]:
for delta in raw_diff.deltas:
old_path = path_transform(delta.old_file.path) if delta.old_file else None
new_path = path_transform(delta.new_file.path) if delta.new_file else None
Expand Down
17 changes: 17 additions & 0 deletions kart/raw_diff_delta.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ class RawDiffDelta:

__slots__ = ["status", "status_char", "old_path", "new_path"]

_GIT_STATUS_TO_PYGIT2 = {
"A": pygit2.GIT_DELTA_ADDED,
"D": pygit2.GIT_DELTA_DELETED,
"M": pygit2.GIT_DELTA_MODIFIED,
}

def __init__(self, status, status_char, old_path, new_path):
self.status = status
self.status_char = status_char
Expand All @@ -32,3 +38,14 @@ def of(cls, old_path, new_path, reverse=False):
return RawDiffDelta(pygit2.GIT_DELTA_DELETED, "D", old_path, new_path)
else:
return RawDiffDelta(pygit2.GIT_DELTA_MODIFIED, "M", old_path, new_path)

@classmethod
def from_git_name_status(cls, status_char, path):
status = cls._GIT_STATUS_TO_PYGIT2[status_char]
old_path = None
new_path = None
if status_char in "AM":
new_path = path
if status_char in "MD":
old_path = path
return RawDiffDelta(status, status_char, old_path, new_path)
5 changes: 4 additions & 1 deletion kart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@ def wrapper(*args, **kwargs):
return decorator


def chunk(iterable, size):
def chunk(iterable, size, strict=False):
"""Generator. Yield successive chunks from iterable of length <size>."""
# TODO: replace this chunk() function with itertools.batched() (Python 3.12+)
it = iter(iterable)
while True:
chunk = tuple(itertools.islice(it, size))
if not chunk:
return
if strict and len(chunk) != size:
raise ValueError("chunk(): incomplete batch")
yield chunk


Expand Down

0 comments on commit 5b5598a

Please sign in to comment.