Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor kerchunk reader tests to call open_virtual_dataset #317

Merged
merged 2 commits into from
Nov 25, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 76 additions & 37 deletions virtualizarr/tests/test_readers/test_kerchunk.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,88 @@
from pathlib import Path
from typing import Any, Callable, Generator, Optional

import numpy as np
import pytest
import ujson

from virtualizarr.backend import open_virtual_dataset
from virtualizarr.manifests import ManifestArray
from virtualizarr.readers.kerchunk import (
dataset_from_kerchunk_refs,
)


def gen_ds_refs(
zgroup: str = '{"zarr_format":2}',
zarray: str = '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
zattrs: str = '{"_ARRAY_DIMENSIONS":["x","y"]}',
chunk: list = ["test1.nc", 6144, 48],
zgroup: str | None = None,
zarray: str | None = None,
zattrs: str | None = None,
chunks: dict[str, list[str | int]] | None = None,
):
if zgroup is None:
zgroup = '{"zarr_format":2}'
if zarray is None:
zarray = '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}'
if zattrs is None:
zattrs = '{"_ARRAY_DIMENSIONS":["x","y"]}'
if chunks is None:
chunks = {"a/0.0": ["test1.nc", 6144, 48]}

return {
"version": 1,
"refs": {
".zgroup": zgroup,
"a/.zarray": zarray,
"a/.zattrs": zattrs,
"a/0.0": chunk,
**chunks,
},
}


def test_dataset_from_df_refs():
ds_refs = gen_ds_refs()
ds = dataset_from_kerchunk_refs(ds_refs)
assert "a" in ds
da = ds["a"]
assert isinstance(da.data, ManifestArray)
assert da.dims == ("x", "y")
assert da.shape == (2, 3)
assert da.chunks == (2, 3)
assert da.dtype == np.dtype("<i8")
@pytest.fixture
def refs_file_factory(
    tmp_path: Path,
) -> Generator[
    Callable[[Optional[Any], Optional[Any], Optional[Any], Optional[Any]], str],
    None,
    None,
]:
    """
    Yield a factory that writes a kerchunk references JSON file on demand.

    The file is not created when the fixture is set up; a test calls the
    yielded function once it knows which ``zgroup``, ``zarray``, ``zattrs``
    and ``chunks`` values it wants. Those values are forwarded unchanged to
    ``gen_ds_refs`` and the resulting references are dumped with ``ujson``.

    The factory returns the path of the written file as a ``str``. Every call
    writes to the same ``refs.json`` inside ``tmp_path``, so a second call
    within one test overwrites the first file.
    """

    def _refs_file(zgroup=None, zarray=None, zattrs=None, chunks=None) -> str:
        target = tmp_path / "refs.json"
        references = gen_ds_refs(
            zgroup=zgroup,
            zarray=zarray,
            zattrs=zattrs,
            chunks=chunks,
        )

        with target.open("w") as handle:
            ujson.dump(references, handle)

        return str(target)

    yield _refs_file


assert da.data.zarray.compressor is None
assert da.data.zarray.filters is None
assert da.data.zarray.fill_value == 0
assert da.data.zarray.order == "C"
def test_dataset_from_df_refs(refs_file_factory):
refs_file = refs_file_factory()

assert da.data.manifest.dict() == {
vds = open_virtual_dataset(refs_file, filetype="kerchunk")

assert "a" in vds
vda = vds["a"]
assert isinstance(vda.data, ManifestArray)
assert vda.dims == ("x", "y")
assert vda.shape == (2, 3)
assert vda.chunks == (2, 3)
assert vda.dtype == np.dtype("<i8")

assert vda.data.zarray.compressor is None
assert vda.data.zarray.filters is None
assert vda.data.zarray.fill_value == 0
assert vda.data.zarray.order == "C"

assert vda.data.manifest.dict() == {
"0.0": {"path": "test1.nc", "offset": 6144, "length": 48}
}


def test_dataset_from_df_refs_with_filters():
def test_dataset_from_df_refs_with_filters(refs_file_factory):
filters = [{"elementsize": 4, "id": "shuffle"}, {"id": "zlib", "level": 4}]
zarray = {
"chunks": [2, 3],
Expand All @@ -57,13 +94,15 @@ def test_dataset_from_df_refs_with_filters():
"shape": [2, 3],
"zarr_format": 2,
}
ds_refs = gen_ds_refs(zarray=ujson.dumps(zarray))
ds = dataset_from_kerchunk_refs(ds_refs)
da = ds["a"]
assert da.data.zarray.filters == filters
refs_file = refs_file_factory(zarray=ujson.dumps(zarray))

vds = open_virtual_dataset(refs_file, filetype="kerchunk")

vda = vds["a"]
assert vda.data.zarray.filters == filters

def test_dataset_from_kerchunk_refs_empty_chunk_manifest():

def test_empty_chunk_manifest(refs_file_factory):
zarray = {
"chunks": [50, 100],
"compressor": None,
Expand All @@ -74,11 +113,11 @@ def test_dataset_from_kerchunk_refs_empty_chunk_manifest():
"shape": [100, 200],
"zarr_format": 2,
}
refs = gen_ds_refs(zarray=ujson.dumps(zarray))
del refs["refs"]["a/0.0"]

ds = dataset_from_kerchunk_refs(refs)
assert "a" in ds.variables
assert isinstance(ds["a"].data, ManifestArray)
assert ds["a"].sizes == {"x": 100, "y": 200}
assert ds["a"].chunksizes == {"x": 50, "y": 100}
refs_file = refs_file_factory(zarray=ujson.dumps(zarray), chunks={})

vds = open_virtual_dataset(refs_file, filetype="kerchunk")

assert "a" in vds.variables
assert isinstance(vds["a"].data, ManifestArray)
assert vds["a"].sizes == {"x": 100, "y": 200}
assert vds["a"].chunksizes == {"x": 50, "y": 100}
Loading