Merge pull request #1032 from koordinates/faster-diff-5
diff: Faster startup & much-reduced memory consumption
craigds authored Jan 9, 2025
2 parents 115c585 + 8c8dc4f commit 9350690
Showing 8 changed files with 152 additions and 11 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -4,9 +4,9 @@ Please note that compatibility for 0.x releases (software or repositories) isn't

_When adding new entries to the changelog, please include issue/PR numbers wherever possible._


## Unreleased

- diff: Much faster startup and much lower memory usage when using `--no-sort-keys` [#1032](https://github.com/koordinates/kart/pull/1032)
- diff: Fix an error when diffing a commit against the working copy, where a dataset has been deleted since the commit. [#1033](https://github.com/koordinates/kart/issues/1033)

## 0.16.0
4 changes: 3 additions & 1 deletion kart/base_dataset.py
@@ -139,7 +139,9 @@ def diff(
"""
ds_diff = DatasetDiff()
meta_filter = ds_filter.get("meta", ds_filter.child_type())
ds_diff["meta"] = self.diff_meta(other, meta_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"meta", self.diff_meta(other, meta_filter, reverse=reverse)
)
return ds_diff

def diff_meta(
2 changes: 1 addition & 1 deletion kart/base_diff_writer.py
@@ -456,7 +456,7 @@ def get_file_diff(self):
def iter_deltadiff_items(self, deltas):
if self.sort_keys:
return deltas.sorted_items()
-return deltas.items()
+return deltas.iter_items()

def filtered_dataset_deltas(self, ds_path, ds_diff):
"""
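The change to iter_deltadiff_items() above is the consumer side of the streaming path: with `--no-sort-keys`, the writer asks the DeltaDiff for `iter_items()` (a streaming view) instead of `items()` (which materialises everything so it can be sorted). A minimal, self-contained sketch of that trade-off — `fake_delta_stream` and the standalone `iter_deltadiff_items` below are illustrative stand-ins, not Kart's implementation:

from typing import Iterator, Tuple

def fake_delta_stream(n: int) -> Iterator[Tuple[str, str]]:
    # Stand-in for the (key, delta) pairs that a raw diff would yield lazily.
    for i in range(n):
        yield (f"feature:{i}", f"delta-{i}")

def iter_deltadiff_items(deltas: Iterator[Tuple[str, str]], sort_keys: bool):
    if sort_keys:
        # Sorting needs every key up front, so the whole stream gets buffered.
        return sorted(deltas, key=lambda kv: kv[0])
    # The unsorted path just passes the iterator through, one delta at a time.
    return deltas

stream = iter_deltadiff_items(fake_delta_stream(10**6), sort_keys=False)
print(next(iter(stream)))  # ('feature:0', 'delta-0') - available immediately, nothing buffered

The point is that with sort_keys=False the first delta can be written to the output before the millionth one has even been generated.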
101 changes: 98 additions & 3 deletions kart/diff_structs.py
@@ -1,6 +1,6 @@
from collections import UserDict
from dataclasses import dataclass
-from typing import Any
+from typing import Any, Iterator

from .exceptions import InvalidOperation

@@ -325,6 +325,10 @@ def recursive_in(self, keys):
child = self.get(key)
return child.recursive_in(keys) if child is not None else False

def set_if_nonempty(self, key, value):
if value:
self[key] = value

def create_empty_child(self, key):
child = self.child_type()
self[key] = child
@@ -335,8 +339,7 @@ def prune(self, recurse=True):
Deletes any empty RichDicts that are children of self.
If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
"""
-items = list(self.items())
-for key, value in items:
+for key, value in list(self.items()):
if key == "data_changes" and value == False and len(self) == 1:
del self[key]
if not isinstance(value, RichDict):
@@ -430,25 +433,117 @@ def __json__(self):
return {k: v for k, v in self.items()}


class InvalidatedDeltaDiff(Exception):
pass


class DeltaDiff(Diff):
"""
A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
Since Deltas know the keys at which they should be stored, a DeltaDiff makes sure to store Deltas at these keys.
=== Using DeltaDiff with an iterator of Deltas ===
It is possible to pass in an iterator of Deltas (e.g. a generator) to the DeltaDiff constructor,
in which case the Deltas are NOT immediately stored in the DeltaDiff.
This is useful because there may be a lot of Deltas, and we don't want to store them in memory.
The only correct way to consume a DeltaDiff populated by a generator is to call `iter_items()`,
which will consume the iterator as it yields Deltas.
Calling that method will invalidate the DeltaDiff, so it cannot be used again (doing so will raise an exception).
Calling any other dict-like method (keys(), items(), len(), etc.) will consume the iterator and store its
contents in memory, which may be expensive.
"""

child_type = Delta

def __init__(self, initial_contents=()):
# An iterator over keys and Delta objects, which is consumed lazily
self._lazy_initial_contents: Iterator[tuple[str, Delta]] | None = None
if isinstance(initial_contents, (dict, UserDict)):
super().__init__(initial_contents)
else:
if isinstance(initial_contents, Iterator):
self._lazy_initial_contents = (
(delta.key, delta) for delta in initial_contents
)
initial_contents = ()
super().__init__((delta.key, delta) for delta in initial_contents)

def __getitem__(self, key):
if key in self.data:
return self.data[key]
self._evaluate_lazy_initial_contents()
return self.data[key]

def __setitem__(self, key, delta):
if key != delta.key:
raise ValueError("Delta must be added at the appropriate key")
super().__setitem__(key, delta)

def _evaluate_lazy_initial_contents(self):
if self._lazy_initial_contents is None:
return
# Consume the generator to populate the DeltaDiff.
self.update(self._lazy_initial_contents)
self._lazy_initial_contents = None

def __bool__(self):
result = bool(self.data)
if (not result) and self._lazy_initial_contents:
# If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check truthiness.
try:
k, v = next(self._lazy_initial_contents)
except StopIteration:
return False
else:
# remember this result
self.data[k] = v
return True
return result

def __len__(self):
self._evaluate_lazy_initial_contents()
return super().__len__()

def items(self):
self._evaluate_lazy_initial_contents()
return super().items()

def iter_items(self):
"""
Iterates over the items in the DeltaDiff, including any lazy initial contents.
This method consumes the iterator without storing its contents.
It's not safe to call this method and then consume the DeltaDiff again.
"""
yield from self.data.items()
if self._lazy_initial_contents:
for k, v in self._lazy_initial_contents:
yield (k, v)

# Invalidate this DeltaDiff; it's not safe to consume it again after this.
# `data` is the underlying contents of UserDict, which we inherit from.
# So overriding it to a non-dict will cause all dict methods to raise exceptions.
# > TypeError: argument of type 'InvalidatedDeltaDiff' is not iterable
self.data = InvalidatedDeltaDiff(
"DeltaDiff can't be used after iter_items() has been called"
)

def keys(self):
"""
Overrides the dict.keys() method to ensure we consume any lazy initial contents first
"""
self._evaluate_lazy_initial_contents()
return super().keys()

def values(self):
"""
Overrides the dict.values() method to ensure we consume any lazy initial contents first
"""
self._evaluate_lazy_initial_contents()
return super().values()

def add_delta(self, delta):
"""Add the given delta at the appropriate key."""
super().__setitem__(delta.key, delta)
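To summarise the additions to DeltaDiff above: the class can now be constructed around an iterator without consuming it; `iter_items()` streams those deltas exactly once and then deliberately breaks the dict so accidental reuse fails loudly, while any ordinary dict method (`items()`, `keys()`, `len()`) falls back to materialising everything. Below is a condensed sketch of the same pattern — a simplified stand-in for illustration only (it drops the `__bool__` peek-ahead and the key checking of the real class):

from collections import UserDict
from collections.abc import Iterator

class LazyDiff(UserDict):
    # Dict-like container that can be populated lazily from an iterator of (key, value) pairs.

    def __init__(self, initial_contents=()):
        self._lazy = None
        if isinstance(initial_contents, Iterator):
            super().__init__()
            self._lazy = initial_contents  # not consumed yet
        else:
            super().__init__(initial_contents)

    def _evaluate_lazy(self):
        # Fall-back path: pull everything into memory (what items()/keys()/len() need).
        if self._lazy is not None:
            self.update(self._lazy)
            self._lazy = None

    def items(self):
        self._evaluate_lazy()
        return super().items()

    def __len__(self):
        self._evaluate_lazy()
        return super().__len__()

    def iter_items(self):
        # Streaming path: yield without storing, then poison the dict so reuse raises.
        yield from self.data.items()
        if self._lazy is not None:
            yield from self._lazy
        self.data = None  # any later dict access now fails loudly

diff = LazyDiff((f"k{i}", i) for i in range(1_000_000))
first = next(diff.iter_items())  # ('k0', 0) - produced without buffering the rest

Calling diff.items() after iter_items() would then raise, because self.data has been replaced; the real class replaces it with an InvalidatedDeltaDiff exception instance so the resulting error message explains what happened.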
5 changes: 3 additions & 2 deletions kart/diff_util.py
@@ -154,8 +154,9 @@ def get_dataset_diff(
ds_diff = DatasetDiff.concatenated(
base_target_diff, target_wc_diff, overwrite_original=True
)
-# Get rid of parts of the diff-structure that are "empty":
-ds_diff.prune()
+if include_wc_diff:
+    # Get rid of parts of the diff-structure that are "empty":
+    ds_diff.prune()
return ds_diff


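A note on why prune() became conditional here: prune() walks list(self.items()), which on a lazily-populated DeltaDiff would pull every delta into memory (its items() override materialises the iterator, per the diff_structs.py hunk above) — presumably why pruning is now limited to the working-copy case. The new set_if_nonempty() helper covers the common case by never storing empty children in the first place. A tiny illustrative sketch, using a plain dict subclass rather than Kart's RichDict hierarchy:

class DiffNode(dict):
    def set_if_nonempty(self, key, value):
        # Only store truthy children, so there is nothing empty to prune later.
        if value:
            self[key] = value

ds_diff = DiffNode()
ds_diff.set_if_nonempty("meta", {})                  # empty - not stored
ds_diff.set_if_nonempty("feature", {"f1": "delta"})  # non-empty - stored
assert list(ds_diff) == ["feature"]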
5 changes: 3 additions & 2 deletions kart/tabular/rich_table_dataset.py
@@ -134,8 +134,9 @@ def diff(

# Else do a full diff.
else:
ds_diff["feature"] = DeltaDiff(
self.diff_feature(other, feature_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"feature",
DeltaDiff(self.diff_feature(other, feature_filter, reverse=reverse)),
)
return ds_diff

4 changes: 3 additions & 1 deletion kart/tile/tile_dataset.py
@@ -306,7 +306,9 @@ def diff(

# Else do a full diff.
else:
ds_diff["tile"] = self.diff_tile(other, tile_filter, reverse=reverse)
ds_diff.set_if_nonempty(
"tile", self.diff_tile(other, tile_filter, reverse=reverse)
)

return ds_diff

40 changes: 40 additions & 0 deletions tests/test_diff.py
@@ -11,6 +11,7 @@

import kart
from kart.tabular.v3 import TableV3
from kart.base_dataset import BaseDataset
from kart.diff_format import DiffFormat
from kart.diff_structs import Delta, DeltaDiff
from kart.html_diff_writer import HtmlDiffWriter
@@ -319,6 +320,45 @@ def slow_down_the_diff(self, ds_path, ds_diff, diff_format=DiffFormat.FULL):
]


def test_diff_doesnt_evaluate_all_deltas_up_front_if_you_dont_sort_keys(
data_archive_readonly, monkeypatch, cli_runner
):
# Test that we can start outputting features before we have instantiated all the feature deltas.
# Otherwise, diffs containing millions of deltas will be slow to start, and will use a lot of memory
# buffering the deltas up front.
# We explicitly avoid doing that when the user has asked for a `--no-sort-keys` diff.
with data_archive_readonly("points") as repo_path:
orig_delta_as_json = JsonLinesDiffWriter.delta_as_json
features_written = 0

def delta_as_json(self, *args, **kwargs):
nonlocal features_written
features_written += 1
return orig_delta_as_json(self, *args, **kwargs)

monkeypatch.setattr(JsonLinesDiffWriter, "delta_as_json", delta_as_json)
orig_wrap_deltas_from_raw_diff = BaseDataset.wrap_deltas_from_raw_diff

def wrap_deltas_from_raw_diff(self, *args, **kwargs):
yield from orig_wrap_deltas_from_raw_diff(self, *args, **kwargs)
if not features_written:
pytest.fail(
"All deltas shouldn't be evaluated until some features are written"
)

monkeypatch.setattr(
BaseDataset, "wrap_deltas_from_raw_diff", wrap_deltas_from_raw_diff
)
r = cli_runner.invoke(
    [
        "diff",
        "--output-format=json-lines",
        "--no-sort-keys",
        "[EMPTY]...",
    ]
)
assert r.exit_code == 0, r.stderr


@pytest.mark.parametrize("output_format", DIFF_OUTPUT_FORMATS)
def test_diff_reprojection(output_format, data_working_copy, cli_runner):
"""diff the working copy against HEAD"""