Merge pull request #1032 from koordinates/faster-diff-5
diff: Faster startup & much-reduced memory consumption
craigds authored Jan 9, 2025
2 parents 115c585 + 8c8dc4f commit 9350690
Showing 8 changed files with 152 additions and 11 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -4,9 +4,9 @@ Please note that compatibility for 0.x releases (software or repositories) isn't

_When adding new entries to the changelog, please include issue/PR numbers wherever possible._


## Unreleased

- diff: Much faster startup and much lower memory usage when using `--no-sort-keys` [#1032](https://github.com/koordinates/kart/pull/1032)
- diff: Fix an error when diffing a commit against the working copy, where a dataset has been deleted since the commit. [#1033](https://github.com/koordinates/kart/issues/1033)

## 0.16.0
4 changes: 3 additions & 1 deletion kart/base_dataset.py
@@ -139,7 +139,9 @@ def diff(
"""
ds_diff = DatasetDiff()
meta_filter = ds_filter.get("meta", ds_filter.child_type())
ds_diff["meta"] = self.diff_meta(other, meta_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"meta", self.diff_meta(other, meta_filter, reverse=reverse)
)
return ds_diff

def diff_meta(
2 changes: 1 addition & 1 deletion kart/base_diff_writer.py
@@ -456,7 +456,7 @@ def get_file_diff(self):
def iter_deltadiff_items(self, deltas):
if self.sort_keys:
return deltas.sorted_items()
-return deltas.items()
+return deltas.iter_items()

def filtered_dataset_deltas(self, ds_path, ds_diff):
"""
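The change to iter_deltadiff_items() above is the consumer side of the streaming path: with `--no-sort-keys`, the writer asks the DeltaDiff for `iter_items()` (a streaming view) instead of `items()` (which materialises everything so it can be sorted). A minimal, self-contained sketch of that trade-off — `fake_delta_stream` and the standalone `iter_deltadiff_items` below are illustrative stand-ins, not Kart's implementation:

from typing import Iterator, Tuple

def fake_delta_stream(n: int) -> Iterator[Tuple[str, str]]:
    # Stand-in for the (key, delta) pairs that a raw diff would yield lazily.
    for i in range(n):
        yield (f"feature:{i}", f"delta-{i}")

def iter_deltadiff_items(deltas: Iterator[Tuple[str, str]], sort_keys: bool):
    if sort_keys:
        # Sorting needs every key up front, so the whole stream gets buffered.
        return sorted(deltas, key=lambda kv: kv[0])
    # The unsorted path just passes the iterator through, one delta at a time.
    return deltas

stream = iter_deltadiff_items(fake_delta_stream(10**6), sort_keys=False)
print(next(iter(stream)))  # ('feature:0', 'delta-0') - available immediately, nothing buffered

The point is that with sort_keys=False the first delta can be written to the output before the millionth one has even been generated.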
101 changes: 98 additions & 3 deletions kart/diff_structs.py
@@ -1,6 +1,6 @@
from collections import UserDict
from dataclasses import dataclass
-from typing import Any
+from typing import Any, Iterator

from .exceptions import InvalidOperation

@@ -325,6 +325,10 @@ def recursive_in(self, keys):
child = self.get(key)
return child.recursive_in(keys) if child is not None else False

def set_if_nonempty(self, key, value):
if value:
self[key] = value

def create_empty_child(self, key):
child = self.child_type()
self[key] = child
@@ -335,8 +339,7 @@ def prune(self, recurse=True):
Deletes any empty RichDicts that are children of self.
If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
"""
-items = list(self.items())
-for key, value in items:
+for key, value in list(self.items()):
if key == "data_changes" and value == False and len(self) == 1:
del self[key]
if not isinstance(value, RichDict):
@@ -430,25 +433,117 @@ def __json__(self):
return {k: v for k, v in self.items()}


class InvalidatedDeltaDiff(Exception):
pass


class DeltaDiff(Diff):
"""
A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
Since Deltas know the keys at which they should be stored, a DeltaDiff makes sure to store Deltas at these keys.
=== Using DeltaDiff with an iterator of Deltas ===
It is possible to pass in an iterator of Deltas (e.g. a generator) to the DeltaDiff constructor,
in which case the Deltas are NOT immediately stored in the DeltaDiff.
This is useful because there may be a lot of Deltas, and we don't want to store them in memory.
The only correct way to consume a DeltaDiff populated by a generator is to call `iter_items()`,
which will consume the iterator as it yields Deltas.
Calling that method will invalidate the DeltaDiff, so it cannot be used again (doing so will raise an exception).
Calling any other dict-like method (keys(), items(), len(), etc.) will consume the iterator and store its
contents in memory, which may be expensive.
"""

child_type = Delta

def __init__(self, initial_contents=()):
# An iterator over keys and Delta objects, which is consumed lazily
self._lazy_initial_contents: Iterator[tuple[str, Delta]] | None = None
if isinstance(initial_contents, (dict, UserDict)):
super().__init__(initial_contents)
else:
if isinstance(initial_contents, Iterator):
self._lazy_initial_contents = (
(delta.key, delta) for delta in initial_contents
)
initial_contents = ()
super().__init__((delta.key, delta) for delta in initial_contents)

def __getitem__(self, key):
if key in self.data:
return self.data[key]
self._evaluate_lazy_initial_contents()
return self.data[key]

def __setitem__(self, key, delta):
if key != delta.key:
raise ValueError("Delta must be added at the appropriate key")
super().__setitem__(key, delta)

def _evaluate_lazy_initial_contents(self):
if self._lazy_initial_contents is None:
return
# Consume the generator to populate the DeltaDiff.
self.update(self._lazy_initial_contents)
self._lazy_initial_contents = None

def __bool__(self):
result = bool(self.data)
if (not result) and self._lazy_initial_contents:
# If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check truthiness.
try:
k, v = next(self._lazy_initial_contents)
except StopIteration:
return False
else:
# remember this result
self.data[k] = v
return True
return result

def __len__(self):
self._evaluate_lazy_initial_contents()
return super().__len__()

def items(self):
self._evaluate_lazy_initial_contents()
return super().items()

def iter_items(self):
"""
Iterates over the items in the DeltaDiff, including any lazy initial contents.
This method consumes the iterator without storing its contents.
It's not safe to call this method and then consume the DeltaDiff again.
"""
yield from self.data.items()
if self._lazy_initial_contents:
for k, v in self._lazy_initial_contents:
yield (k, v)

# Invalidate this DeltaDiff; it's not safe to consume it again after this.
# `data` is the underlying contents of UserDict, which we inherit from.
# So overriding it to a non-dict will cause all dict methods to raise exceptions.
# > TypeError: argument of type 'InvalidatedDeltaDiff' is not iterable
self.data = InvalidatedDeltaDiff(
"DeltaDiff can't be used after iter_items() has been called"
)

def keys(self):
"""
Overrides the dict.keys() method to ensure we consume any lazy initial contents first
"""
self._evaluate_lazy_initial_contents()
return super().keys()

def values(self):
"""
Overrides the dict.values() method to ensure we consume any lazy initial contents first
"""
self._evaluate_lazy_initial_contents()
return super().values()

def add_delta(self, delta):
"""Add the given delta at the appropriate key."""
super().__setitem__(delta.key, delta)
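To summarise the additions to DeltaDiff above: the class can now be constructed around an iterator without consuming it; `iter_items()` streams those deltas exactly once and then deliberately breaks the dict so accidental reuse fails loudly, while any ordinary dict method (`items()`, `keys()`, `len()`) falls back to materialising everything. Below is a condensed sketch of the same pattern — a simplified stand-in for illustration only (it drops the `__bool__` peek-ahead and the key checking of the real class):

from collections import UserDict
from collections.abc import Iterator

class LazyDiff(UserDict):
    # Dict-like container that can be populated lazily from an iterator of (key, value) pairs.

    def __init__(self, initial_contents=()):
        self._lazy = None
        if isinstance(initial_contents, Iterator):
            super().__init__()
            self._lazy = initial_contents  # not consumed yet
        else:
            super().__init__(initial_contents)

    def _evaluate_lazy(self):
        # Fall-back path: pull everything into memory (what items()/keys()/len() need).
        if self._lazy is not None:
            self.update(self._lazy)
            self._lazy = None

    def items(self):
        self._evaluate_lazy()
        return super().items()

    def __len__(self):
        self._evaluate_lazy()
        return super().__len__()

    def iter_items(self):
        # Streaming path: yield without storing, then poison the dict so reuse raises.
        yield from self.data.items()
        if self._lazy is not None:
            yield from self._lazy
        self.data = None  # any later dict access now fails loudly

diff = LazyDiff((f"k{i}", i) for i in range(1_000_000))
first = next(diff.iter_items())  # ('k0', 0) - produced without buffering the rest

Calling diff.items() after iter_items() would then raise, because self.data has been replaced; the real class replaces it with an InvalidatedDeltaDiff exception instance so the resulting error message explains what happened.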
5 changes: 3 additions & 2 deletions kart/diff_util.py
@@ -154,8 +154,9 @@ def get_dataset_diff(
ds_diff = DatasetDiff.concatenated(
base_target_diff, target_wc_diff, overwrite_original=True
)
-# Get rid of parts of the diff-structure that are "empty":
-ds_diff.prune()
+if include_wc_diff:
+    # Get rid of parts of the diff-structure that are "empty":
+    ds_diff.prune()
return ds_diff


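A note on why prune() became conditional here: prune() walks list(self.items()), which on a lazily-populated DeltaDiff would pull every delta into memory (its items() override materialises the iterator, per the diff_structs.py hunk above) — presumably why pruning is now limited to the working-copy case. The new set_if_nonempty() helper covers the common case by never storing empty children in the first place. A tiny illustrative sketch, using a plain dict subclass rather than Kart's RichDict hierarchy:

class DiffNode(dict):
    def set_if_nonempty(self, key, value):
        # Only store truthy children, so there is nothing empty to prune later.
        if value:
            self[key] = value

ds_diff = DiffNode()
ds_diff.set_if_nonempty("meta", {})                  # empty - not stored
ds_diff.set_if_nonempty("feature", {"f1": "delta"})  # non-empty - stored
assert list(ds_diff) == ["feature"]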
5 changes: 3 additions & 2 deletions kart/tabular/rich_table_dataset.py
@@ -134,8 +134,9 @@ def diff(

# Else do a full diff.
else:
ds_diff["feature"] = DeltaDiff(
self.diff_feature(other, feature_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"feature",
DeltaDiff(self.diff_feature(other, feature_filter, reverse=reverse)),
)
return ds_diff

4 changes: 3 additions & 1 deletion kart/tile/tile_dataset.py
@@ -306,7 +306,9 @@ def diff(

# Else do a full diff.
else:
ds_diff["tile"] = self.diff_tile(other, tile_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"tile", self.diff_tile(other, tile_filter, reverse=reverse)
)

return ds_diff

40 changes: 40 additions & 0 deletions tests/test_diff.py
@@ -11,6 +11,7 @@

import kart
from kart.tabular.v3 import TableV3
from kart.base_dataset import BaseDataset
from kart.diff_format import DiffFormat
from kart.diff_structs import Delta, DeltaDiff
from kart.html_diff_writer import HtmlDiffWriter
@@ -319,6 +320,45 @@ def slow_down_the_diff(self, ds_path, ds_diff, diff_format=DiffFormat.FULL):
]


def test_diff_doesnt_evaluate_all_deltas_up_front_if_you_dont_sort_keys(
data_archive_readonly, monkeypatch, cli_runner
):
# Test that we can start outputting features before we have instantiated all the feature deltas.
# Otherwise, diffs containing millions of deltas will be slow to start, and will use a lot of memory
# buffering the deltas up front.
# We explicitly avoid doing that when the user has asked for a `--no-sort-keys` diff.
with data_archive_readonly("points") as repo_path:
orig_delta_as_json = JsonLinesDiffWriter.delta_as_json
features_written = 0

def delta_as_json(self, *args, **kwargs):
nonlocal features_written
features_written += 1
return orig_delta_as_json(self, *args, **kwargs)

monkeypatch.setattr(JsonLinesDiffWriter, "delta_as_json", delta_as_json)
orig_wrap_deltas_from_raw_diff = BaseDataset.wrap_deltas_from_raw_diff

def wrap_deltas_from_raw_diff(self, *args, **kwargs):
yield from orig_wrap_deltas_from_raw_diff(self, *args, **kwargs)
if not features_written:
pytest.fail(
"All deltas shouldn't be evaluated until some features are written"
)

monkeypatch.setattr(
BaseDataset, "wrap_deltas_from_raw_diff", wrap_deltas_from_raw_diff
)
r = cli_runner.invoke(
    [
        "diff",
        "--output-format=json-lines",
        "--no-sort-keys",
        "[EMPTY]...",
    ]
)
assert r.exit_code == 0, r.stderr


@pytest.mark.parametrize("output_format", DIFF_OUTPUT_FORMATS)
def test_diff_reprojection(output_format, data_working_copy, cli_runner):
"""diff the working copy against HEAD"""