Skip to content

Commit

Permalink
feat: setup DAT files download
Browse files Browse the repository at this point in the history
  • Loading branch information
wjones127 committed Jan 12, 2023
1 parent 0f8c360 commit 783437c
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 23 deletions.
3 changes: 0 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
[submodule "dat"]
path = dat
url = https://github.com/delta-incubator/dat.git
1 change: 0 additions & 1 deletion dat
Submodule dat deleted from e7798d
3 changes: 3 additions & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ __pycache__/
docs/build

*.so

# dat data
dat-data
11 changes: 11 additions & 0 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
VENV := venv
MATURIN_VERSION := $(shell grep 'requires =' pyproject.toml | cut -d= -f2- | tr -d '[ "]')
PACKAGE_VERSION := $(shell grep version Cargo.toml | head -n 1 | awk '{print $$3}' | tr -d '"' )
DAT_VERSION := 0.0.1

.PHONY: setup-venv
setup-venv: ## Setup the virtualenv
Expand All @@ -14,6 +15,16 @@ setup: ## Setup the requirements
$(info --- Setup dependencies ---)
pip install "$(MATURIN_VERSION)"

.PHONY: setup-dat
setup-dat: ## Download DAT test files
# Fetches the DAT release tarball for $(DAT_VERSION), unpacks it, and leaves
# the result in dat-data/v$(DAT_VERSION). Any previous copy of that version
# is removed first so the final `mv` always lands on a fresh path.
	mkdir -p dat-data
	rm -rf dat-data/v$(DAT_VERSION)
# --fail makes curl exit non-zero on an HTTP error (e.g. a DAT_VERSION with no
# matching release) instead of saving the error page as the archive;
# --show-error keeps that failure message visible despite --silent.
	curl -L --silent --show-error --fail --output dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz \
		https://github.com/delta-incubator/dat/releases/download/v$(DAT_VERSION)/deltalake-dat-v$(DAT_VERSION).tar.gz
# The archive is expected to unpack into ./out, which is then moved into place.
# NOTE(review): the leading '-' makes make ignore tar's exit status; presumably
# deliberate (tolerating non-fatal extraction errors) -- confirm before removing.
	-tar -xzf dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz
	mv out dat-data/v$(DAT_VERSION)
	rm dat-data/deltalake-dat-v$(DAT_VERSION).tar.gz

.PHONY: build
build: setup ## Build Python binding of delta-rs
$(info --- Build Python binding ---)
Expand Down
58 changes: 39 additions & 19 deletions python/tests/data_acceptance/test_reader.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,77 @@
from typing import NamedTuple, Dict, Any
from pathlib import Path
import json
from pathlib import Path
from typing import Any, Dict, NamedTuple, Optional

import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from deltalake import DeltaTable


class ReadCase(NamedTuple):
root: Path
metadata: Dict[str, Any]
version: Optional[int]
case_info: Dict[str, Any]
version_metadata: Dict[str, Any]


cases = []

project_root = Path("../dat")
for path in (project_root / "out" / "tables" / "generated").iterdir():
dat_version = "0.0.1"
reader_case_path = Path("dat-data") / f"v{dat_version}" / "reader_tests" / "generated"

if not reader_case_path.exists():
pytest.skip(
"DAT test data not present. Run make setup-dat to download them.",
allow_module_level=True,
)

for path in reader_case_path.iterdir():
if path.is_dir():
with open(path / "table-metadata.json") as f:
with open(path / "test_case_info.json") as f:
metadata = json.load(f)
cases.append(ReadCase(path, metadata))

# TODO: external-tables should be added to cases as well
# Each entry under "expected" describes one table version to test: a
# directory named "v<N>" pins version N, any other name (e.g. "latest")
# means the newest version and is represented as None.
for version_path in (path / "expected").iterdir():
    # Inspect the version directory's own name, not the enclosing case
    # directory (`path`); using `path` here would never parse the version
    # and would raise ValueError for any case directory starting with "v".
    if version_path.name.startswith("v"):
        version = int(version_path.name[1:])
    else:
        version = None
    with open(version_path / "table_version_metadata.json") as f:
        version_metadata = json.load(f)

    # One test case per (table, version) pair.
    cases.append(ReadCase(path, version, metadata, version_metadata))


@pytest.mark.parametrize("case", cases)
@pytest.mark.parametrize(
"case", cases, ids=lambda case: f"{case.case_info['name']} (version={case.version})"
)
def test_dat(case: ReadCase):
root, metadata = case
root, version, case_info, version_metadata = case

# Get Delta Table path
delta_root = root / "delta"

# Load table
dt = DeltaTable(str(delta_root))
dt = DeltaTable(str(delta_root), version=version)

# Compare protocol versions
# TODO: this is incorrect in dat
# assert dt.protocol().min_reader_version == metadata["reader_protocol_version"]
assert dt.protocol().min_writer_version == metadata["writer_protocol_version"]

# Perhaps?
# assert dt.version == metadata["current_version"]
assert dt.protocol().min_reader_version == version_metadata["min_reader_version"]
assert dt.protocol().min_writer_version == version_metadata["min_writer_version"]

# If supported protocol version, try to read, load parquet, and compare
if dt.protocol().min_reader_version <= 1:
parquet_root = root / "parquet"
version_path = "latest" if version is None else f"v{version}"
# TODO: fix the directory name here
parquet_root = root / "expected" / version_path / "table_content.parquet"
expected = pq.read_table(parquet_root)
actual = dt.to_pyarrow_table()
assert_tables_equal(expected, actual)
else:
# We should raise an error when attempting to read too advanced protocol
with pytest.raises(Exception):
dt.to_pyarrow_table()


def assert_tables_equal(first: pa.Table, second: pa.Table) -> None:
assert first.schema == second.schema
Expand Down

0 comments on commit 783437c

Please sign in to comment.