Skip to content

Commit

Permalink
added logic of different schemas when compare columns are not provided
Browse files Browse the repository at this point in the history
  • Loading branch information
ilongin committed Dec 11, 2024
1 parent da942fb commit 0c4268b
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
18 changes: 13 additions & 5 deletions src/datachain/lib/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
right_compare = right_compare or compare
compare = left.signals_schema.resolve(*compare).db_signals() # type: ignore[assignment]
right_compare = right.signals_schema.resolve(*right_compare).db_signals() # type: ignore[assignment]
elif not compare and len(cols) != len(right_cols):
# here we will mark all rows that are not added or deleted as modified since
# there was no explicit list of compare columns provided (meaning we need
# to check all columns to determine if row is modified or unchanged), but
# the number of columns on left and right is not the same (one of the chains
# have additional column)
compare = None
right_compare = None
else:
compare = [c for c in cols if c in right_cols] # type: ignore[misc, assignment]
right_compare = compare
Expand All @@ -103,24 +111,24 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
]
)
diff_cond.append((added_cond, "A"))
if modified:
if modified and compare:
modified_cond = sa.or_(
*[
C(c) != C(f"{_rprefix(c, rc)}{rc}")
for c, rc in zip(compare, right_compare)
for c, rc in zip(compare, right_compare) # type: ignore[arg-type]
]
)
diff_cond.append((modified_cond, "M"))
if unchanged:
if unchanged and compare:
unchanged_cond = sa.and_(
*[
C(c) == C(f"{_rprefix(c, rc)}{rc}")
for c, rc in zip(compare, right_compare)
for c, rc in zip(compare, right_compare) # type: ignore[arg-type]
]
)
diff_cond.append((unchanged_cond, "U"))

diff = sa.case(*diff_cond, else_=None).label(status_col)
diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)

left_right_merge = left.merge(
right, on=on, right_on=right_on, inner=False, rname=rname
Expand Down
2 changes: 0 additions & 2 deletions tests/unit/lib/test_datachain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3214,7 +3214,6 @@ def test_compare_multiple_match_columns(test_session):


def test_compare_additional_column_on_left(test_session):
pytest.skip()
ds1 = DataChain.from_values(
id=[1, 2, 4],
name=["John", "Doe", "Andy"],
Expand All @@ -3241,7 +3240,6 @@ def test_compare_additional_column_on_left(test_session):


def test_compare_additional_column_on_right(test_session):
pytest.skip()
ds1 = DataChain.from_values(
id=[1, 2, 4],
name=["John", "Doe", "Andy"],
Expand Down

0 comments on commit 0c4268b

Please sign in to comment.