iterative · ilongin · Dec 20, 2024 · Dec 17, 2024 · Dec 18, 2024 · Dec 20, 2024
diff --git a/src/datachain/lib/dc.py b/src/datachain/lib/dc.py
@@ -1700,6 +1700,77 @@ def compare(
             status_col=status_col,
         )
 
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        unchanged: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar as .compare() but for file based chains, i.e. those that have
+        File object, or it's derivatives, in it. For matching file `source` and
+        `path` are used, and for comparing file `version` and `etag`.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find the corresponding row in the other dataset.
+                If not found there, the row is considered added (or deleted if vice
+                versa), and if found, the row is either modified or unchanged.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            unchanged (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                unchanged=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            unchanged=unchanged,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,

diff --git a/tests/unit/lib/test_datachain.py b/tests/unit/lib/test_datachain.py
@@ -3350,3 +3350,108 @@ def test_compare_right_compare_wrong_length(test_session):
     assert str(exc_info.value) == (
         "'compare' and 'right_compare' must be have the same length"
     )
+
+
+@pytest.mark.parametrize("added", (True, False))
+@pytest.mark.parametrize("deleted", (True, False))
+@pytest.mark.parametrize("modified", (True, False))
+@pytest.mark.parametrize("unchanged", (True, False))
+@pytest.mark.parametrize("status_col", ("diff", None))
+def test_diff(test_session, added, deleted, modified, unchanged, status_col):
+    if not any([added, deleted, modified, unchanged]):
+        pytest.skip("This case is tested in another test")
+
+    fs1 = File(source="s1", path="p1", version="2", etag="e2")
+    fs1_updated = File(source="s1", path="p1", version="1", etag="e1")
+    fs2 = File(source="s2", path="p2", version="1", etag="e1")
+    fs3 = File(source="s3", path="p3", version="1", etag="e1")
+    fs4 = File(source="s4", path="p4", version="1", etag="e1")
+
+    ds1 = DataChain.from_values(
+        file=[fs1_updated, fs2, fs4], score=[1, 2, 4], session=test_session
+    )
+    ds2 = DataChain.from_values(
+        file=[fs1, fs3, fs4], score=[1, 3, 4], session=test_session
+    )
+
+    diff = ds1.diff(
+        ds2,
+        added=added,
+        deleted=deleted,
+        modified=modified,
+        unchanged=unchanged,
+        on="file",
+        status_col=status_col,
+    )
+
+    expected = []
+    if modified:
+        expected.append(("M", fs1_updated, 1))
+    if added:
+        expected.append(("A", fs2, 2))
+    if deleted:
+        expected.append(("D", fs3, 3))
+    if unchanged:
+        expected.append(("U", fs4, 4))
+
+    collect_fields = ["diff", "file", "score"]
+    if not status_col:
+        expected = [row[1:] for row in expected]
+        collect_fields = collect_fields[1:]
+
+    assert list(diff.order_by("file.source").collect(*collect_fields)) == expected
+
+
+@pytest.mark.parametrize("added", (True, False))
+@pytest.mark.parametrize("deleted", (True, False))
+@pytest.mark.parametrize("modified", (True, False))
+@pytest.mark.parametrize("unchanged", (True, False))
+@pytest.mark.parametrize("status_col", ("diff", None))
+def test_diff_nested(test_session, added, deleted, modified, unchanged, status_col):
+    if not any([added, deleted, modified, unchanged]):
+        pytest.skip("This case is tested in another test")
+
+    class Nested(BaseModel):
+        file: File
+
+    fs1 = Nested(file=File(source="s1", path="p1", version="2", etag="e2"))
+    fs1_updated = Nested(file=File(source="s1", path="p1", version="1", etag="e1"))
+    fs2 = Nested(file=File(source="s2", path="p2", version="1", etag="e1"))
+    fs3 = Nested(file=File(source="s3", path="p3", version="1", etag="e1"))
+    fs4 = Nested(file=File(source="s4", path="p4", version="1", etag="e1"))
+
+    ds1 = DataChain.from_values(
+        nested=[fs1_updated, fs2, fs4], score=[1, 2, 4], session=test_session
+    )
+    ds2 = DataChain.from_values(
+        nested=[fs1, fs3, fs4], score=[1, 3, 4], session=test_session
+    )
+
+    diff = ds1.diff(
+        ds2,
+        added=added,
+        deleted=deleted,
+        modified=modified,
+        unchanged=unchanged,
+        on="nested.file",
+        status_col=status_col,
+    )
+
+    expected = []
+    if modified:
+        expected.append(("M", fs1_updated, 1))
+    if added:
+        expected.append(("A", fs2, 2))
+    if deleted:
+        expected.append(("D", fs3, 3))
+    if unchanged:
+        expected.append(("U", fs4, 4))
+
+    collect_fields = ["diff", "nested", "score"]
+    if not status_col:
+        expected = [row[1:] for row in expected]
+        collect_fields = collect_fields[1:]
+
+    assert (
+        list(diff.order_by("nested.file.source").collect(*collect_fields)) == expected
+    )