From b73d61d80fe8a9192655c7719bce00660e9ca0ad Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Mon, 31 Oct 2022 20:47:24 -0700
Subject: [PATCH] test: draft Data Acceptance Tests

---
 .gitmodules                                 |  3 +
 dat                                         |  1 +
 python/tests/data_acceptance/__init__.py    |  0
 python/tests/data_acceptance/test_reader.py | 93 +++++++++++++++++++++
 4 files changed, 97 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 dat
 create mode 100644 python/tests/data_acceptance/__init__.py
 create mode 100644 python/tests/data_acceptance/test_reader.py

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..b966ce62f8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "dat"]
+	path = dat
+	url = https://github.com/delta-incubator/dat.git
diff --git a/dat b/dat
new file mode 160000
index 0000000000..e7798ddd9c
--- /dev/null
+++ b/dat
@@ -0,0 +1 @@
+Subproject commit e7798ddd9cf4138d22e102361f08a9dcf92fb539
diff --git a/python/tests/data_acceptance/__init__.py b/python/tests/data_acceptance/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py
new file mode 100644
index 0000000000..b980480cab
--- /dev/null
+++ b/python/tests/data_acceptance/test_reader.py
@@ -0,0 +1,93 @@
+"""Data Acceptance Tests (DAT) for the deltalake reader."""
+from typing import NamedTuple, Dict, Any, List
+from pathlib import Path
+import json
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pytest
+
+from deltalake import DeltaTable
+
+# Highest reader protocol version these tests expect deltalake to read.
+MAX_SUPPORTED_READER_VERSION = 1
+
+
+class ReadCase(NamedTuple):
+    """One reader case: a table directory plus its parsed metadata."""
+
+    # Directory containing the "delta" and "parquet" copies of the table.
+    root: Path
+    # Parsed contents of the case's table-metadata.json.
+    metadata: Dict[str, Any]
+
+
+def load_cases(tables_root: Path) -> List[ReadCase]:
+    """Collect one ReadCase per table directory under ``tables_root``.
+
+    Returns an empty list when ``tables_root`` does not exist (for example
+    when the ``dat`` submodule is not checked out). pytest then skips the
+    parametrized test instead of failing the whole run during collection.
+    """
+    if not tables_root.is_dir():
+        return []
+
+    found = []
+    # iterdir() order is arbitrary; sort so test order and ids are stable.
+    for path in sorted(tables_root.iterdir()):
+        if not path.is_dir():
+            continue
+        with open(path / "table-metadata.json", encoding="utf-8") as f:
+            metadata = json.load(f)
+        found.append(ReadCase(path, metadata))
+    return found
+
+
+# Resolve the dat submodule from this file rather than the working directory:
+# parents[3] is data_acceptance -> tests -> python -> repository root.
+project_root = Path(__file__).resolve().parents[3] / "dat"
+
+# TODO: external-tables should be added to cases as well
+cases = load_cases(project_root / "out" / "tables" / "generated")
+
+
+@pytest.mark.parametrize("case", cases, ids=lambda c: c.root.name)
+def test_dat(case: ReadCase):
+    """Check the protocol metadata and read behavior of one DAT table."""
+    root, metadata = case
+
+    # Load the Delta copy of the table
+    dt = DeltaTable(str(root / "delta"))
+    protocol = dt.protocol()
+
+    # Compare protocol versions
+    # TODO: this is incorrect in dat
+    # assert protocol.min_reader_version == metadata["reader_protocol_version"]
+    assert protocol.min_writer_version == metadata["writer_protocol_version"]
+
+    # Perhaps?
+    # assert dt.version == metadata["current_version"]
+
+    # If supported protocol version, try to read, load parquet, and compare
+    if protocol.min_reader_version <= MAX_SUPPORTED_READER_VERSION:
+        expected = pq.read_table(root / "parquet")
+        actual = dt.to_pyarrow_table()
+        assert_tables_equal(expected, actual)
+    else:
+        # We should raise an error when attempting to read too advanced protocol
+        # TODO: narrow Exception once the concrete error type is known
+        with pytest.raises(Exception):
+            dt.to_pyarrow_table()
+
+
+def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
+    """Assert that two tables have equal schemas and equal rows in any order.
+
+    Both tables are sorted by every column of ``first`` before comparing, so
+    a difference in row order alone does not fail the assertion.
+    """
+    assert first.schema == second.schema
+    sort_keys = [(col, "ascending") for col in first.column_names]
+    first_sorted = first.sort_by(sort_keys)
+    second_sorted = second.sort_by(sort_keys)
+    assert first_sorted == second_sorted