Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test: add Data Acceptance Tests #909

Merged
merged 7 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/python_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ jobs:
pip install virtualenv
virtualenv venv
source venv/bin/activate
make develop
make develop

- name: Download Data Acceptance Tests (DAT) files
run: make setup-dat

- name: Run tests
run: |
Expand Down
Empty file added .gitmodules
Empty file.
3 changes: 3 additions & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ __pycache__/
docs/build

*.so

# dat data
dat-data
11 changes: 11 additions & 0 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
VENV := venv
MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]')
PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' )
DAT_VERSION := 0.0.2

.PHONY: setup-venv
setup-venv: ## Setup the virtualenv
Expand All @@ -14,6 +15,16 @@ setup: ## Setup the requirements
$(info --- Setup dependencies ---)
pip install "$(MATURIN_VERSION)"

.PHONY: setup-dat
setup-dat: ## Download DAT test files
mkdir -p dat-data
rm -rf dat-data/v$(DAT_VERSION)
curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \
https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz
tar --no-same-permissions -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz
mv out dat-data/v$(DAT_VERSION)
rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz

.PHONY: build
build: setup ## Build Python binding of delta-rs
$(info --- Build Python binding ---)
Expand Down
Empty file.
91 changes: 91 additions & 0 deletions python/tests/data_acceptance/test_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from deltalake import DeltaTable


class ReadCase(NamedTuple):
root: Path
version: Optional[int]
case_info: Dict[str, Any]
version_metadata: Dict[str, Any]


cases = []

dat_version = "0.0.2"
reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated"

if not reader_case_path.exists():
pytest.skip(
"DAT test data not present. Run make setup-dat to download them.",
allow_module_level=True,
)

for path in reader_case_path.iterdir():
if path.is_dir():
with open(path / "test_case_info.json") as f:
metadata = json.load(f)

for version_path in (path / "expected").iterdir():
if path.name.startswith("v"):
version = int(path.name[1:])
else:
version = None
with open(version_path / "table_version_metadata.json") as f:
version_metadata = json.load(f)

cases.append(ReadCase(path, version, metadata, version_metadata))

failing_cases = {
"multi_partitioned_2": "Waiting for PyArrow 11.0.0 for decimal cast support (#1078)",
"nested_types": "Waiting for PyArrow 11.0.0 so we can ignore internal field names in equality",
"multi_partitioned": "Escaped characters in data file paths aren't yet handled (#1079)",
"no_stats": "We don't yet support files without stats (#582)",
}


@pytest.mark.parametrize(
"case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})"
)
def test_dat(case: ReadCase):
root, version, case_info, version_metadata = case

if case_info["name"] in failing_cases:
msg = failing_cases[case_info["name"]]
pytest.skip(msg)

# Get Delta Table path
delta_root = root / "delta"

# Load table
dt = DeltaTable(str(delta_root), version=version)

# Compare protocol versions
assert dt.protocol().min_reader_version == version_metadata["min_reader_version"]
assert dt.protocol().min_writer_version == version_metadata["min_writer_version"]

# If supported protocol version, try to read, load parquet, and compare
if dt.protocol().min_reader_version <= 1:
version_path = "latest" if version is None else f"v{version}"
parquet_root = root / "expected" / version_path / "table_content"
expected = pq.read_table(parquet_root, coerce_int96_timestamp_unit="us")
actual = dt.to_pyarrow_table()
assert_tables_equal(expected, actual)
else:
# We should raise an error when attempting to read too advanced protocol
with pytest.raises(Exception):
dt.to_pyarrow_table()


def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
assert first.schema == second.schema
sort_keys = [(col, "ascending") for col in first.column_names]
first_sorted = first.sort_by(sort_keys)
second_sorted = second.sort_by(sort_keys)
assert first_sorted == second_sorted