Skip to content

Commit

Permalink
changed unchanged to same
Browse files Browse the repository at this point in the history
  • Loading branch information
ilongin committed Dec 20, 2024
1 parent c2aec82 commit 8fa9465
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 62 deletions.
26 changes: 13 additions & 13 deletions src/datachain/lib/dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1634,12 +1634,12 @@ def compare(
added: bool = True,
deleted: bool = True,
modified: bool = True,
unchanged: bool = False,
same: bool = False,
status_col: Optional[str] = None,
) -> "DataChain":
"""Comparing two chains by identifying rows that are added, deleted, modified
or unchanged. Result is the new chain that has additional column with possible
values: `A`, `D`, `M`, `U` representing added, deleted, modified and unchanged
or same. Result is the new chain that has additional column with possible
values: `A`, `D`, `M`, `S` representing added, deleted, modified and same
rows respectively. Note that if only one "status" is asked, by setting proper
flags, this additional column is not created as it would have only one value
for all rows. Beside additional diff column, new chain has schema of the chain
Expand All @@ -1652,20 +1652,20 @@ def compare(
`right_on` parameter has to specify the columns for the other chain.
This value is used to find corresponding row in other dataset. If not
found there, row is considered as added (or removed if vice versa), and
if found then row can be either modified or unchanged.
if found then row can be either modified or same.
right_on: Optional column or list of columns
for the `other` to match.
compare: Column or list of columns to compare on. If both chains have
the same columns then this column is enough for the compare. Otherwise,
`right_compare` parameter has to specify the columns for the other
chain. This value is used to see if row is modified or unchanged. If
chain. This value is used to see if row is modified or same. If
not set, all columns will be used for comparison
right_compare: Optional column or list of columns
for the `other` to compare to.
added (bool): Whether to return added rows in resulting chain.
deleted (bool): Whether to return deleted rows in resulting chain.
modified (bool): Whether to return modified rows in resulting chain.
unchanged (bool): Whether to return unchanged rows in resulting chain.
same (bool): Whether to return same (unchanged) rows in resulting chain.
status_col (str): Name of the new column that is created in resulting chain
representing diff status.
Expand All @@ -1679,7 +1679,7 @@ def compare(
added=True,
deleted=True,
modified=True,
unchanged=True,
same=True,
status_col="diff"
)
```
Expand All @@ -1696,7 +1696,7 @@ def compare(
added=added,
deleted=deleted,
modified=modified,
unchanged=unchanged,
same=same,
status_col=status_col,
)

Expand All @@ -1708,7 +1708,7 @@ def diff(
added: bool = True,
modified: bool = True,
deleted: bool = False,
unchanged: bool = False,
same: bool = False,
status_col: Optional[str] = None,
) -> "DataChain":
"""Similar to `.compare()`, which is more generic method to calculate difference
Expand All @@ -1724,12 +1724,12 @@ def diff(
`right_on` parameter has to specify the file signal for the other chain.
This value is used to find corresponding row in other dataset. If not
found there, row is considered as added (or removed if vice versa), and
if found then row can be either modified or unchanged.
if found then row can be either modified or same.
right_on: Optional file signal for the `other` to match.
added (bool): Whether to return added rows in resulting chain.
deleted (bool): Whether to return deleted rows in resulting chain.
modified (bool): Whether to return modified rows in resulting chain.
unchanged (bool): Whether to return unchanged rows in resulting chain.
same (bool): Whether to return same (unchanged) rows in resulting chain.
status_col (str): Optional name of the new column that is created in
resulting chain representing diff status.
Expand All @@ -1742,7 +1742,7 @@ def diff(
added=True,
deleted=True,
modified=True,
unchanged=True,
same=True,
status_col="diff"
)
```
Expand All @@ -1769,7 +1769,7 @@ def get_file_signals(file: str, signals):
added=added,
deleted=deleted,
modified=modified,
unchanged=unchanged,
same=same,
status_col=status_col,
)

Expand Down
18 changes: 9 additions & 9 deletions src/datachain/lib/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ def compare( # noqa: PLR0912, PLR0915, C901
added: bool = True,
deleted: bool = True,
modified: bool = True,
unchanged: bool = True,
same: bool = True,
status_col: Optional[str] = None,
) -> "DataChain":
"""Comparing two chains by identifying rows that are added, deleted, modified
or unchanged"""
or same"""
dialect = left._query.dialect

rname = "right_"
Expand Down Expand Up @@ -67,9 +67,9 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
"'compare' and 'right_compare' must be have the same length"
)

if not any([added, deleted, modified, unchanged]):
if not any([added, deleted, modified, same]):
raise ValueError(
"At least one of added, deleted, modified, unchanged flags must be set"
"At least one of added, deleted, modified, same flags must be set"
)

# we still need status column for internal implementation even if not
Expand All @@ -94,7 +94,7 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
elif not compare and len(cols) != len(right_cols):
# here we will mark all rows that are not added or deleted as modified since
# there was no explicit list of compare columns provided (meaning we need
# to check all columns to determine if row is modified or unchanged), but
# to check all columns to determine if row is modified or same), but
# the number of columns on left and right is not the same (one of the chains
# has an additional column)
compare = None
Expand All @@ -121,14 +121,14 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
]
)
diff_cond.append((modified_cond, "M"))
if unchanged and compare:
unchanged_cond = sa.and_(
if same and compare:
same_cond = sa.and_(
*[
C(c) == C(f"{_rprefix(c, rc)}{rc}")
for c, rc in zip(compare, right_compare) # type: ignore[arg-type]
]
)
diff_cond.append((unchanged_cond, "U"))
diff_cond.append((same_cond, "S"))

diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
diff.type = String()
Expand Down Expand Up @@ -181,7 +181,7 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:

if not deleted:
res = left_right_merge_select
elif deleted and not any([added, modified, unchanged]):
elif deleted and not any([added, modified, same]):
res = right_left_merge_select
else:
res = left_right_merge_select.union(right_left_merge_select)
Expand Down
Loading

0 comments on commit 8fa9465

Please sign in to comment.