Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Composite key feature #36

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Consider two CSV files:
name: Pancakes
age: 2

The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed.
The `--key=id` option means that the `id` column should be treated as the unique key, to identify which records have changed. The `--key` option can be specified more than once to build a composite key. For example, `--key=license --key=state` uses the `license` and `state` columns together to uniquely identify a row.

The tool will automatically detect if your files are comma- or tab-separated. You can override this automatic detection and force the tool to use a specific format using `--format=tsv` or `--format=csv`.

Expand Down
20 changes: 16 additions & 4 deletions csv_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ def load_csv(fp, key=None, dialect=None):
fp = csv.reader(fp, dialect=(dialect or "excel"))
headings = next(fp)
rows = [dict(zip(headings, line)) for line in fp]

if isinstance(key, str):
key = (key,)
if key:
keyfn = lambda r: r[key]
keyfn = lambda r: tuple(r[k] for k in key)
else:
keyfn = lambda r: hashlib.sha1(
json.dumps(r, sort_keys=True).encode("utf8")
Expand All @@ -32,8 +35,10 @@ def load_json(fp, key=None):
common_keys = set()
for item in raw_list:
common_keys.update(item.keys())
# maybe add later not sure if matters
# common_keys = {tuple(k) for k in common_keys}
if key:
keyfn = lambda r: r[key]
keyfn = lambda r: tuple(r[k] for k in key)
else:
keyfn = lambda r: hashlib.sha1(
json.dumps(r, sort_keys=True).encode("utf8")
Expand Down Expand Up @@ -88,7 +93,9 @@ def compare(previous, current, show_unchanged=False):
diffs = list(diff(previous[id], current[id], ignore=ignore_columns))
if diffs:
changes = {
"key": id,
# Casting the id to a str here to keep consistent with json reading format.
# This is clunky and requires checking for a str later before recasting the key to a tuple.
"key": id[0] if len(id) == 1 else id,
"changes": {
# field can be a list if id contained '.' - #7
field[0]
Expand Down Expand Up @@ -145,7 +152,12 @@ def human_text(result, key=None, singular=None, plural=None, show_unchanged=Fals
change_blocks = []
for details in result["changed"]:
block = []
block.append(" {}: {}".format(key, details["key"]))

if isinstance(key, str):
key = (key,)
for k, v in zip(key, details["key"]): # For each k in key append to the block.
block.append(" {}: {}".format(k, v))

for field, (prev_value, current_value) in details["changes"].items():
block.append(
' {}: "{}" => "{}"'.format(field, prev_value, current_value)
Expand Down
2 changes: 1 addition & 1 deletion csv_diff/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
type=click.Path(exists=True, file_okay=True, dir_okay=False, allow_dash=False),
)
@click.option(
"--key", type=str, default=None, help="Column to use as a unique ID for each row"
"--key", type=str, default=None, multiple=True, help="Column to use as a unique ID for each row. This argument may be specified multiple times"
)
@click.option(
"--format",
Expand Down
146 changes: 146 additions & 0 deletions tests/test_composite_keys.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from csv_diff import load_csv, compare
import io

# Fixtures for the composite (multi-column) key tests.
# Every CSV below is a variation of MULTI_KEY_BASE and is compared against it.

# Baseline CSV. "Customer ID" alone is not unique here (CUST-001 appears
# twice), and neither is "Timestamp"; only the pair identifies a row.
MULTI_KEY_BASE = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,2
CUST-001,2022-01-02 11:00:00,Table,1
CUST-002,2022-01-02 11:00:00,Desk,3"""

# Same rows as the baseline, but the Quantity of the first row is 3 instead of 2.
MULTI_KEY_ROW_CHANGED = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,3
CUST-001,2022-01-02 11:00:00,Table,1
CUST-002,2022-01-02 11:00:00,Desk,3"""

# Baseline plus one extra row (CUST-003 / Bookshelf) appended at the end.
MULTI_KEY_ROW_ADDED = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,2
CUST-001,2022-01-02 11:00:00,Table,1
CUST-002,2022-01-02 11:00:00,Desk,3
CUST-003,2022-01-03 12:00:00,Bookshelf,1"""

# Baseline without the second row (CUST-001 / Table).
MULTI_KEY_ROW_REMOVED = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,2
CUST-002,2022-01-02 11:00:00,Desk,3"""

# Baseline without the second row AND with the first row's Quantity set to 3.
MULTI_KEY_ROW_REMOVED_AND_CHANGE = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,3
CUST-002,2022-01-02 11:00:00,Desk,3"""

# Same key columns and rows as the baseline, but the "Quantity" column is
# replaced by a "Price" column (one column removed, one added).
MULTI_KEY_COLUMN_ADD = """Customer ID,Timestamp,Product Name,Price
CUST-001,2022-01-01 10:00:00,Chair,19.99
CUST-001,2022-01-02 11:00:00,Table,49.99
CUST-002,2022-01-02 11:00:00,Desk,99.99"""

# Baseline with the second row's Customer ID changed from CUST-001 to
# CUST-003, i.e. one component of that row's composite key differs.
MULTI_KEY_PRIMARY_KEY_CHANGED = """Customer ID,Timestamp,Product Name,Quantity
CUST-001,2022-01-01 10:00:00,Chair,2
CUST-003,2022-01-02 11:00:00,Table,1
CUST-002,2022-01-02 11:00:00,Desk,3"""

# Composite key passed as `key=` to load_csv() by every test in this module.

multi_key = ("Customer ID", "Timestamp")

def test_multi_key_row_changed():
    """A changed non-key value is reported under the row's composite key tuple."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(io.StringIO(MULTI_KEY_ROW_CHANGED), key=multi_key)
    result = compare(previous, current)

    # Only the Chair row differs, and only in its Quantity column.
    changed_row = {
        "key": ("CUST-001", "2022-01-01 10:00:00"),
        "changes": {"Quantity": ["2", "3"]},
    }
    expected = {
        "added": [],
        "removed": [],
        "changed": [changed_row],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == result


def test_multi_key_row_added():
    """A row whose composite key is new appears in full under "added"."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(io.StringIO(MULTI_KEY_ROW_ADDED), key=multi_key)
    result = compare(previous, current)

    new_row = {
        "Customer ID": "CUST-003",
        "Timestamp": "2022-01-03 12:00:00",
        "Product Name": "Bookshelf",
        "Quantity": "1",
    }
    expected = {
        "changed": [],
        "removed": [],
        "added": [new_row],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == result


def test_multi_key_row_removed():
    """A row missing from the current file appears in full under "removed"."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(io.StringIO(MULTI_KEY_ROW_REMOVED), key=multi_key)
    result = compare(previous, current)

    dropped_row = {
        "Customer ID": "CUST-001",
        "Timestamp": "2022-01-02 11:00:00",
        "Product Name": "Table",
        "Quantity": "1",
    }
    expected = {
        "changed": [],
        "removed": [dropped_row],
        "added": [],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == result

def test_multi_key_row_removed_and_change():
    """A removal and a value change in the same diff are both reported."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(
        io.StringIO(MULTI_KEY_ROW_REMOVED_AND_CHANGE), key=multi_key
    )
    result = compare(previous, current)

    # The Chair row keeps its key but its Quantity goes from 2 to 3.
    changed_row = {
        "key": ("CUST-001", "2022-01-01 10:00:00"),
        "changes": {"Quantity": ["2", "3"]},
    }
    # The Table row is absent from the current file.
    dropped_row = {
        "Customer ID": "CUST-001",
        "Timestamp": "2022-01-02 11:00:00",
        "Product Name": "Table",
        "Quantity": "1",
    }
    expected = {
        "changed": [changed_row],
        "removed": [dropped_row],
        "added": [],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == result


def test_multi_key_columns_changed():
    """Swapping a non-key column is reported as column changes, not row changes."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(io.StringIO(MULTI_KEY_COLUMN_ADD), key=multi_key)
    result = compare(previous, current)

    expected = {
        "changed": [],
        "removed": [],
        "added": [],
        "columns_added": ["Price"],
        "columns_removed": ["Quantity"],
    }
    assert expected == result

def test_multi_key_primary_key_removed_and_added():
    """Changing one component of a composite key yields a removal plus an addition."""
    previous = load_csv(io.StringIO(MULTI_KEY_BASE), key=multi_key)
    current = load_csv(
        io.StringIO(MULTI_KEY_PRIMARY_KEY_CHANGED), key=multi_key
    )
    result = compare(previous, current)

    # The Table row under its original key ...
    old_row = {
        "Customer ID": "CUST-001",
        "Timestamp": "2022-01-02 11:00:00",
        "Product Name": "Table",
        "Quantity": "1",
    }
    # ... and the same row under the key with the new Customer ID.
    new_row = {
        "Customer ID": "CUST-003",
        "Timestamp": "2022-01-02 11:00:00",
        "Product Name": "Table",
        "Quantity": "1",
    }
    expected = {
        "changed": [],
        "removed": [old_row],
        "added": [new_row],
        "columns_added": [],
        "columns_removed": [],
    }
    assert expected == result
5 changes: 3 additions & 2 deletions tests/test_csv_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@

FOUR = """id,name,age
1,Cleo,5
2,Pancakes,2,
2,Pancakes,2
3,Bailey,1"""

FIVE = """id,name,age
1,Cleo,5
2,Pancakes,2,
2,Pancakes,2
3,Bailey,1
4,Carl,7"""

Expand Down Expand Up @@ -115,3 +115,4 @@ def test_tsv():
"columns_added": [],
"columns_removed": [],
} == diff