diff --git a/.gitmodules b/.gitmodules
index b966ce62f8..e69de29bb2 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "dat"]
-	path = dat
-	url = https://github.com/delta-incubator/dat.git
diff --git a/dat b/dat
deleted file mode 160000
index e7798ddd9c..0000000000
--- a/dat
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit e7798ddd9cf4138d22e102361f08a9dcf92fb539
diff --git a/python/.gitignore b/python/.gitignore
index 96132d7999..e1e978f0a6 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -15,3 +15,6 @@ __pycache__/
 docs/build
 
 *.so
+
+# dat data
+dat-data
\ No newline at end of file
diff --git a/python/Makefile b/python/Makefile
index db72b4f20f..59d89051a6 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -3,6 +3,7 @@
 VENV := venv
 MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]')
 PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' )
+DAT_VERSION := 0.0.1
 
 .PHONY: setup-venv
 setup-venv: ## Setup the virtualenv
@@ -14,6 +15,16 @@ setup: ## Setup the requirements
 	$(info --- Setup dependencies ---)
 	pip install "$(MATURIN_VERSION)"
 
+.PHONY: setup-dat
+setup-dat: ## Download DAT test files
+	mkdir -p dat-data
+	rm -rf dat-data/v$(DAT_VERSION)
+	curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \
+		https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz
+	-tar -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz
+	mv out dat-data/v$(DAT_VERSION)
+	rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz
+
 .PHONY: build
 build: setup ## Build Python binding of delta-rs
 	$(info --- Build Python binding ---)
diff --git a/python/tests/data_acceptance/test_reader.py b/python/tests/data_acceptance/test_reader.py
index b980480cab..8ab1e8bf50 100644
--- a/python/tests/data_acceptance/test_reader.py
+++ b/python/tests/data_acceptance/test_reader.py
@@ -1,6 +1,6 @@
-from typing import NamedTuple, Dict, Any
-from pathlib import Path
 import json
+from pathlib import Path
+from typing import Any, Dict, NamedTuple, Optional
 
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -8,42 +8,63 @@
 
 from deltalake import DeltaTable
 
+
 class ReadCase(NamedTuple):
     root: Path
-    metadata: Dict[str, Any]
+    version: Optional[int]
+    case_info: Dict[str, Any]
+    version_metadata: Dict[str, Any]
+
 
 cases = []
 
-project_root = Path("../dat")
-for path in (project_root / "out" / "tables" / "generated").iterdir():
+dat_version = "0.0.1"
+reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated"
+
+if not reader_case_path.exists():
+    pytest.skip(
+        "DAT test data not present. Run make setup-dat to download them.",
+        allow_module_level=True,
+    )
+
+for path in reader_case_path.iterdir():
     if path.is_dir():
-        with open(path / "table-metadata.json") as f:
+        with open(path / "test_case_info.json") as f:
             metadata = json.load(f)
-        cases.append(ReadCase(path, metadata))
 
-# TODO: external-tables should be added to cases as well
+        for version_path in (path / "expected").iterdir():
+            # The version comes from the expected/ entry ("v<N>"); any other name (e.g. "latest") means unpinned.
+            if version_path.name.startswith("v"):
+                version = int(version_path.name[1:])
+            else:
+                version = None
+            with open(version_path / "table_version_metadata.json") as f:
+                version_metadata = json.load(f)
+
+            cases.append(ReadCase(path, version, metadata, version_metadata))
+
 
-@pytest.mark.parametrize("case", cases)
+@pytest.mark.parametrize(
+    "case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})"
+)
 def test_dat(case: ReadCase):
-    root, metadata = case
+    root, version, case_info, version_metadata = case
 
     # Get Delta Table path
     delta_root = root / "delta"
 
     # Load table
-    dt = DeltaTable(str(delta_root))
+    dt = DeltaTable(str(delta_root), version=version)
 
     # Compare protocol versions
-    # TODO: this is incorrect in dat
-    # assert dt.protocol().min_reader_version == metadata["reader_protocol_version"]
-    assert dt.protocol().min_writer_version == metadata["writer_protocol_version"]
-
-    # Perhaps?
-    # assert dt.version == metadata["current_version"]
+    assert dt.protocol().min_reader_version == version_metadata["min_reader_version"]
+    assert dt.protocol().min_writer_version == version_metadata["min_writer_version"]
 
     # If supported protocol version, try to read, load parquet, and compare
     if dt.protocol().min_reader_version <= 1:
-        parquet_root = root / "parquet"
+        version_path = "latest" if version is None else f"v{version}"
+        # TODO: fix the directory name here
+        parquet_root = root / "expected" / version_path / "table_content.parquet"
         expected = pq.read_table(parquet_root)
         actual = dt.to_pyarrow_table()
         assert_tables_equal(expected, actual)
@@ -51,7 +72,7 @@ def test_dat(case: ReadCase):
         # We should raise an error when attempting to read too advanced protocol
         with pytest.raises(Exception):
             dt.to_pyarrow_table()
-    
+
 
 def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
     assert first.schema == second.schema