Skip to content

Commit

Permalink
test: add Data Acceptance Tests (#909)
Browse files Browse the repository at this point in the history
# Description
A description of the main changes made in this pull request.

# Related Issue(s)

- closes #864 

# Documentation

<!---
Share links to useful documentation
--->
  • Loading branch information
wjones127 authored Jan 25, 2023
1 parent 920f72b commit b67c602
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 1 deletion.
5 changes: 4 additions & 1 deletion .github/workflows/python_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,10 @@ jobs:
pip install virtualenv
virtualenv venv
source venv/bin/activate
make develop
make develop
- name: Download Data Acceptance Tests (DAT) files
run: make setup-dat

- name: Run tests
run: |
Expand Down
3 changes: 3 additions & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ __pycache__/
docs/build

*.so

# dat data
dat-data
11 changes: 11 additions & 0 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
VENV := venv
MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]')
PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' )
DAT_VERSION := 0.0.2

.PHONY: setup-venv
setup-venv: ## Setup the virtualenv
Expand All @@ -14,6 +15,16 @@ setup: ## Setup the requirements
$(info --- Setup dependencies ---)
pip install "$(MATURIN_VERSION)"

# Download the Delta Acceptance Testing (DAT) reference tables consumed by
# python/tests/data_acceptance. Fetches the release tarball for
# $(DAT_VERSION), unpacks it, and places the contents at
# dat-data/v$(DAT_VERSION) so the tests can locate files by version.
# NOTE: the archive's top-level directory is named `out` (hence the `mv`
# below); the tarball is extracted into the current working directory.
.PHONY: setup-dat
setup-dat: ## Download DAT test files
	mkdir -p dat-data
	# Remove any previous copy of this version so the `mv` below cannot
	# nest the new data inside a stale directory.
	rm -rf dat-data/v$(DAT_VERSION)
	curl -L --silent --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \
		https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz
	# --no-same-permissions: extract with the current umask rather than the
	# archived permission bits.
	tar --no-same-permissions -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz
	mv out dat-data/v$(DAT_VERSION)
	rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz

.PHONY: build
build: setup ## Build Python binding of delta-rs
$(info --- Build Python binding ---)
Expand Down
Empty file.
91 changes: 91 additions & 0 deletions python/tests/data_acceptance/test_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from deltalake import DeltaTable


class ReadCase(NamedTuple):
    """One parametrized DAT reader test case: a test-case directory plus the
    expected results for a single table version."""

    # Root directory of the DAT test case (contains `delta/` and `expected/`).
    root: Path
    # Table version to load; None means load the latest version.
    version: Optional[int]
    # Parsed contents of the case's test_case_info.json.
    case_info: Dict[str, Any]
    # Parsed contents of table_version_metadata.json for this version.
    version_metadata: Dict[str, Any]


cases = []

# Version of the DAT release the test data comes from; must match the
# DAT_VERSION downloaded by `make setup-dat`.
dat_version = "0.0.2"
reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated"

if not reader_case_path.exists():
    pytest.skip(
        "DAT test data not present. Run make setup-dat to download them.",
        allow_module_level=True,
    )

# Each subdirectory of the generated reader tests is one test case; each
# entry under its `expected/` directory is one table version to verify.
for path in reader_case_path.iterdir():
    if path.is_dir():
        with open(path / "test_case_info.json") as f:
            metadata = json.load(f)

        for version_path in (path / "expected").iterdir():
            # FIX: the version must be parsed from the inner version
            # directory's name ("v<N>" or "latest"), not from the outer
            # case directory `path` — the original checked `path.name`,
            # which never yields the per-version number.
            if version_path.name.startswith("v"):
                version = int(version_path.name[1:])
            else:
                # "latest": load the table at its newest version.
                version = None
            with open(version_path / "table_version_metadata.json") as f:
                version_metadata = json.load(f)

            cases.append(ReadCase(path, version, metadata, version_metadata))

# Known-failing cases, mapped to the reason they are skipped.
failing_cases = {
    "multi_partitioned_2": "Waiting for PyArrow 11.0.0 for decimal cast support (#1078)",
    "nested_types": "Waiting for PyArrow 11.0.0 so we can ignore internal field names in equality",
    "multi_partitioned": "Escaped characters in data file paths aren't yet handled (#1079)",
    "no_stats": "We don't yet support files without stats (#582)",
}


@pytest.mark.parametrize(
    "case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})"
)
def test_dat(case: ReadCase):
    """Verify one DAT reader case: the table's protocol versions must match
    the reference metadata, and — when the protocol is supported — its
    content must equal the expected Parquet data."""
    case_name = case.case_info["name"]
    if case_name in failing_cases:
        pytest.skip(failing_cases[case_name])

    # The Delta table itself lives under the case's `delta/` subdirectory.
    table = DeltaTable(str(case.root / "delta"), version=case.version)

    # Protocol versions reported by the table must agree with the reference.
    protocol = table.protocol()
    assert protocol.min_reader_version == case.version_metadata["min_reader_version"]
    assert protocol.min_writer_version == case.version_metadata["min_writer_version"]

    if protocol.min_reader_version <= 1:
        # Supported protocol: compare against the expected Parquet content
        # for this version.
        subdir = "latest" if case.version is None else f"v{case.version}"
        expected = pq.read_table(
            case.root / "expected" / subdir / "table_content",
            coerce_int96_timestamp_unit="us",
        )
        assert_tables_equal(expected, table.to_pyarrow_table())
    else:
        # Unsupported protocol: reading must raise rather than silently
        # return data we cannot interpret correctly.
        with pytest.raises(Exception):
            table.to_pyarrow_table()


def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
    """Assert that two Arrow tables have identical schemas and identical
    rows, ignoring row order (both tables are sorted by every column before
    comparison)."""
    assert first.schema == second.schema
    order = [(name, "ascending") for name in first.column_names]
    assert first.sort_by(order) == second.sort_by(order)

0 comments on commit b67c602

Please sign in to comment.