diff --git a/ci/upstream.yml b/ci/upstream.yml index 7c47af9f..8a2c8800 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -26,6 +26,7 @@ dependencies: - pytest - pooch - fsspec + - dask - pip - pip: - icechunk>=0.1.0a7 # Installs zarr v3 as dependency diff --git a/conftest.py b/conftest.py index 0be0a89c..4d80094c 100644 --- a/conftest.py +++ b/conftest.py @@ -24,6 +24,22 @@ def pytest_runtest_setup(item): ) + +def _xarray_subset(): + ds = xr.tutorial.open_dataset("air_temperature", chunks={}) + return ds.isel(time=slice(0, 10), lat=slice(0, 9), lon=slice(0, 18)).chunk( + {"time": 5} + ) + + +@pytest.fixture(params=[2, 3]) +def zarr_store(tmpdir, request): + ds = _xarray_subset() + filepath = f"{tmpdir}/air.zarr" + ds.to_zarr(filepath, zarr_format=request.param) + ds.close() + return filepath + + @pytest.fixture def empty_netcdf4_file(tmpdir): # Set up example xarray dataset diff --git a/docs/releases.rst b/docs/releases.rst index 4146948f..4db590f5 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -9,6 +9,9 @@ v1.2.1 (unreleased) New Features ~~~~~~~~~~~~ +- Added a Zarr reader to ``open_virtual_dataset``, which allows opening Zarr V2 and V3 stores as virtual datasets. + (:pull:`271`) By `Raphael Hagen `_. + - Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory. (:issue:`167`, :pull:`227`) By `Tom Nicholas `_. diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index a8e3b66a..15681db1 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -17,7 +17,7 @@ KerchunkVirtualBackend, NetCDF3VirtualBackend, TIFFVirtualBackend, - ZarrV3VirtualBackend, + ZarrVirtualBackend, ) from virtualizarr.readers.common import VirtualBackend from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions @@ -25,7 +25,7 @@ # TODO add entrypoint to allow external libraries to add to this mapping VIRTUAL_BACKENDS = { "kerchunk": KerchunkVirtualBackend, - "zarr_v3": ZarrV3VirtualBackend, + "zarr": ZarrVirtualBackend, "dmrpp": DMRPPVirtualBackend, # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) "hdf5": HDF5VirtualBackend, @@ -72,8 +72,7 @@ def automatically_determine_filetype( # TODO how do we handle kerchunk json / parquet here? if Path(filepath).suffix == ".zarr": - # TODO we could imagine opening an existing zarr store, concatenating it, and writing a new virtual one...
- raise NotImplementedError() + return FileType.zarr # Read magic bytes from local or remote file fpath = _FsspecFSFromFilepath( diff --git a/virtualizarr/codecs.py b/virtualizarr/codecs.py index ad2a3d9b..b544b82d 100644 --- a/virtualizarr/codecs.py +++ b/virtualizarr/codecs.py @@ -65,9 +65,9 @@ def _get_manifestarray_codecs( def _is_zarr_array(array: object) -> bool: """Check if the array is an instance of Zarr Array.""" try: - from zarr import Array + from zarr import Array, AsyncArray - return isinstance(array, Array) + return isinstance(array, (Array, AsyncArray)) except ImportError: return False diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 666f4854..6d6b4daf 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -48,7 +48,6 @@ def with_validation( """ # note: we can't just use `__init__` or a dataclass' `__post_init__` because we need `fs_root` to be an optional kwarg - path = validate_and_normalize_path_to_uri(path, fs_root=fs_root) validate_byte_range(offset=offset, length=length) return ChunkEntry(path=path, offset=offset, length=length) @@ -84,7 +83,8 @@ def validate_and_normalize_path_to_uri(path: str, fs_root: str | None = None) -> return urlunparse(components) elif any(path.startswith(prefix) for prefix in VALID_URI_PREFIXES): - if not PosixPath(path).suffix: + # Question: This feels fragile, is there a better way to ID a Zarr + if not PosixPath(path).suffix and "zarr" not in path: raise ValueError( f"entries in the manifest must be paths to files, but this path has no file suffix: {path}" ) @@ -357,17 +357,6 @@ def shape_chunk_grid(self) -> tuple[int, ...]: def __repr__(self) -> str: return f"ChunkManifest" - @property - def nbytes(self) -> int: - """ - Size required to hold these references in memory in bytes. - - Note this is not the size of the referenced chunks if they were actually loaded into memory, - this is only the size of the pointers to the chunk locations. - If you were to load the data into memory it would be ~1e6x larger for 1MB chunks. - """ - return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes - def __getitem__(self, key: ChunkKey) -> ChunkEntry: indices = split(key) path = self._paths[indices] diff --git a/virtualizarr/readers/__init__.py b/virtualizarr/readers/__init__.py index c776a9ae..3d887844 100644 --- a/virtualizarr/readers/__init__.py +++ b/virtualizarr/readers/__init__.py @@ -5,7 +5,9 @@ from virtualizarr.readers.kerchunk import KerchunkVirtualBackend from virtualizarr.readers.netcdf3 import NetCDF3VirtualBackend from virtualizarr.readers.tiff import TIFFVirtualBackend -from virtualizarr.readers.zarr_v3 import ZarrV3VirtualBackend +from virtualizarr.readers.zarr import ( + ZarrVirtualBackend, +) __all__ = [ "DMRPPVirtualBackend", @@ -15,5 +17,5 @@ "KerchunkVirtualBackend", "NetCDF3VirtualBackend", "TIFFVirtualBackend", - "ZarrV3VirtualBackend", + "ZarrVirtualBackend", ] diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py index 0a7ad36e..89657b5f 100644 --- a/virtualizarr/readers/common.py +++ b/virtualizarr/readers/common.py @@ -45,16 +45,24 @@ def maybe_open_loadable_vars_and_indexes( # TODO Really we probably want a dedicated backend that iterates over all variables only once # TODO See issue #124 for a suggestion of how to avoid calling xarray here. 
- fpath = _FsspecFSFromFilepath( - filepath=filepath, reader_options=reader_options - ).open_file() + fpath = _FsspecFSFromFilepath(filepath=filepath, reader_options=reader_options) + + # Updates the Xarray open_dataset kwargs if Zarr + + if fpath.upath.suffix == ".zarr": + engine = "zarr" + xr_input = fpath.filepath + + else: + engine = None + xr_input = fpath.open_file() # type: ignore - # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any. ds = open_dataset( - fpath, # type: ignore[arg-type] + xr_input, # type: ignore[arg-type] drop_variables=drop_variables, group=group, decode_times=decode_times, + engine=engine, ) if indexes is None: diff --git a/virtualizarr/readers/zarr.py b/virtualizarr/readers/zarr.py new file mode 100644 index 00000000..c26eb62e --- /dev/null +++ b/virtualizarr/readers/zarr.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import TYPE_CHECKING, Iterable, Mapping, Optional + +from xarray import Dataset, Index, Variable +from zarr.core.common import concurrent_map + +from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri +from virtualizarr.readers.common import ( + VirtualBackend, + construct_virtual_dataset, + maybe_open_loadable_vars_and_indexes, +) +from virtualizarr.utils import check_for_collisions +from virtualizarr.zarr import ZArray + +if TYPE_CHECKING: + import zarr + + +async def _parse_zarr_v2_metadata(zarr_array: zarr.Array) -> ZArray: + return ZArray( + shape=zarr_array.metadata.shape, + chunks=zarr_array.metadata.chunks, # type: ignore[union-attr] + dtype=zarr_array.metadata.dtype, + fill_value=zarr_array.metadata.fill_value, # type: ignore[arg-type] + order="C", + compressor=zarr_array.metadata.compressor, # type: ignore[union-attr] + filters=zarr_array.metadata.filters, # type: ignore + zarr_format=zarr_array.metadata.zarr_format, + ) + + +async def _parse_zarr_v3_metadata(zarr_array: zarr.Array) -> ZArray: + from virtualizarr.codecs import get_codecs + + if zarr_array.metadata.fill_value is None: + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) + else: + fill_value = zarr_array.metadata.fill_value + + codecs = get_codecs(zarr_array) + + # Question: How should we parse the values from get_codecs? + compressor = getattr(codecs[0], "compressor", None) # type: ignore + filters = getattr(codecs[0], "filters", None) # type: ignore + + return ZArray( + chunks=zarr_array.metadata.chunk_grid.chunk_shape, # type: ignore[attr-defined] + compressor=compressor, + dtype=zarr_array.metadata.data_type.name, # type: ignore + fill_value=fill_value, # type: ignore[arg-type] + filters=filters, + order="C", + shape=zarr_array.metadata.shape, + zarr_format=zarr_array.metadata.zarr_format, + ) + + +async def build_chunk_manifest_from_dict_mapping( + store_path: str, chunk_mapping_dict: dict, array_name: str, zarr_format: int +) -> ChunkManifest: + chunk_manifest_dict = {} + # ToDo: We could skip the dict creation and built the Manifest from arrays directly + for key, value in chunk_mapping_dict.items(): + if zarr_format == 2: + # split on array name + trailing slash + chunk_key = key.split(array_name + "/")[-1] + + elif zarr_format == 3: + # In v3 we remove the /c/ 'chunks' part of the key and + # replace trailing slashes with '.' 
to conform to ChunkManifest validation + chunk_key = ( + key.split(array_name + "/")[-1].split("c/")[-1].replace("/", ".") + ) + + chunk_manifest_dict[chunk_key] = { + "path": store_path + "/" + key, + "offset": 0, + "length": value, + } + + return ChunkManifest(chunk_manifest_dict) + + +async def build_chunk_manifest( + zarr_array: zarr.AsyncArray, prefix: str, filepath: str +) -> ChunkManifest: + """Build a ChunkManifest with the from_arrays method""" + import numpy as np + + # import pdb; pdb.set_trace() + key_tuples = [(x,) async for x in zarr_array.store.list_prefix(prefix)] + + filepath_list = [filepath] * len(key_tuples) + + # can this be lambda'ed? + # stolen from manifest.py + def combine_path_chunk(filepath: str, chunk_key: str): + return filepath + "/" + chunk_key + + vectorized_chunk_path_combine_func = np.vectorize( + combine_path_chunk, otypes=[np.dtypes.StringDType()] + ) + + # _paths: np.ndarray[Any, np.dtypes.StringDType] + # turn the tuples of chunks to a flattened list with :list(sum(key_tuples, ())) + _paths = vectorized_chunk_path_combine_func( + filepath_list, list(sum(key_tuples, ())) + ) + + # _offsets: np.ndarray[Any, np.dtype[np.uint64]] + _offsets = np.array([0] * len(_paths), dtype=np.uint64) + + # _lengths: np.ndarray[Any, np.dtype[np.uint64]] + lengths = await concurrent_map((key_tuples), zarr_array.store.getsize) + _lengths = np.array(lengths, dtype=np.uint64) + + return ChunkManifest.from_arrays( + paths=_paths, # type: ignore + offsets=_offsets, + lengths=_lengths, + ) + + +async def get_chunk_mapping_prefix(zarr_array: zarr.AsyncArray, prefix: str) -> dict: + """Create a chunk map""" + + keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)] + + sizes = await concurrent_map(keys, zarr_array.store.getsize) + return {key[0]: size for key, size in zip(keys, sizes)} + + +async def build_zarray_metadata(zarr_array: zarr.AsyncArray): + attrs = zarr_array.metadata.attributes + + if zarr_array.metadata.zarr_format == 2: + array_zarray = await _parse_zarr_v2_metadata(zarr_array=zarr_array) # type: ignore[arg-type] + array_dims = attrs["_ARRAY_DIMENSIONS"] + + elif zarr_array.metadata.zarr_format == 3: + array_zarray = await _parse_zarr_v3_metadata(zarr_array=zarr_array) # type: ignore[arg-type] + array_dims = zarr_array.metadata.dimension_names # type: ignore[union-attr] + + else: + raise NotImplementedError("Zarr format is not recognized as v2 or v3.") + + return { + "zarray_array": array_zarray, + "array_dims": array_dims, + "array_metadata": attrs, + } + + +async def virtual_variable_from_zarr_array(zarr_array: zarr.AsyncArray, filepath: str): + zarr_prefix = zarr_array.basename + + if zarr_array.metadata.zarr_format == 3: + # if we have Zarr format/version 3, we add /c/ to the chunk paths + zarr_prefix = f"{zarr_prefix}/c" + + zarray_array = await build_zarray_metadata(zarr_array=zarr_array) + + chunk_manifest = await build_chunk_manifest( + zarr_array, prefix=zarr_prefix, filepath=filepath + ) + + manifest_array = ManifestArray( + zarray=zarray_array["zarray_array"], chunkmanifest=chunk_manifest + ) + return Variable( + dims=zarray_array["array_dims"], + data=manifest_array, + attrs=zarray_array["array_metadata"], + ) + + +async def virtual_dataset_from_zarr_group( + zarr_group: zarr.AsyncGroup, + filepath: str, + group: str, + drop_variables: Iterable[str] | None = [], + virtual_variables: Iterable[str] | None = [], + loadable_variables: Iterable[str] | None = [], + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + 
reader_options: dict = {}, +): + virtual_zarr_arrays = await asyncio.gather( + *[zarr_group.getitem(var) for var in virtual_variables] + ) + + virtual_variable_arrays = await asyncio.gather( + *[ + virtual_variable_from_zarr_array(zarr_array=array, filepath=filepath) + for array in virtual_zarr_arrays + ] + ) + + # build a dict mapping for use later in construct_virtual_dataset + virtual_variable_array_mapping = { + array.basename: result + for array, result in zip(virtual_zarr_arrays, virtual_variable_arrays) + } + + # flatten nested tuples and get set -> list + coord_names = list( + set( + [ + item + for tup in [val.dims for val in virtual_variable_arrays] + for item in tup + ] + ) + ) + + non_loadable_variables = list(set(virtual_variables).union(set(drop_variables))) + + loadable_vars, indexes = maybe_open_loadable_vars_and_indexes( + filepath, + loadable_variables=loadable_variables, + reader_options=reader_options, + drop_variables=non_loadable_variables, + indexes=indexes, + group=group, + decode_times=decode_times, + ) + + return construct_virtual_dataset( + virtual_vars=virtual_variable_array_mapping, + loadable_vars=loadable_vars, + indexes=indexes, + coord_names=coord_names, + attrs=zarr_group.attrs, + ) + + +class ZarrVirtualBackend(VirtualBackend): + @staticmethod + def open_virtual_dataset( + filepath: str, + group: str | None = None, + drop_variables: Iterable[str] | None = None, + loadable_variables: Iterable[str] | None = None, + decode_times: bool | None = None, + indexes: Mapping[str, Index] | None = None, + virtual_backend_kwargs: Optional[dict] = None, + reader_options: Optional[dict] = None, + ) -> Dataset: + import asyncio + + import zarr + from packaging import version + + if version.parse(zarr.__version__).major < 3: + raise ImportError("Zarr V3 is required") + + async def _open_virtual_dataset( + filepath=filepath, + group=group, + drop_variables=drop_variables, + loadable_variables=loadable_variables, + decode_times=decode_times, + indexes=indexes, + virtual_backend_kwargs=virtual_backend_kwargs, + reader_options=reader_options, + ): + if virtual_backend_kwargs: + raise NotImplementedError( + "Zarr reader does not understand any virtual_backend_kwargs" + ) + + drop_variables, loadable_variables = check_for_collisions( + drop_variables, + loadable_variables, + ) + + filepath = validate_and_normalize_path_to_uri( + filepath, fs_root=Path.cwd().as_uri() + ) + # This currently fails for local filepaths (ie. tests) but works for s3: + # *** TypeError: Filesystem needs to support async operations. + # https://github.com/zarr-developers/zarr-python/issues/2554 + + if reader_options is None: + reader_options = {} + + zg = await zarr.api.asynchronous.open_group( + filepath, + storage_options=reader_options.get("storage_options"), + mode="r", + ) + + zarr_array_keys = [key async for key in zg.array_keys()] + + missing_vars = set(loadable_variables) - set(zarr_array_keys) + if missing_vars: + raise ValueError( + f"Some loadable variables specified are not present in this zarr store: {missing_vars}" + ) + virtual_vars = list( + set(zarr_array_keys) - set(loadable_variables) - set(drop_variables) + ) + + return await virtual_dataset_from_zarr_group( + zarr_group=zg, + filepath=filepath, + group=group, + virtual_variables=virtual_vars, + drop_variables=drop_variables, + loadable_variables=loadable_variables, + decode_times=decode_times, + indexes=indexes, + reader_options=reader_options, + ) + + # How does this asyncio.run call interact with zarr-pythons async event loop? 
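+ # Best guess (unverified against zarr internals): asyncio.run() creates a fresh event loop in this thread, while zarr-python v3 appears to drive its async store I/O from its own loop on a background thread, so the two should not clash; calling this from code that is already inside a running event loop would raise RuntimeError, though.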
+ return asyncio.run(_open_virtual_dataset()) diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py deleted file mode 100644 index 70bf66e8..00000000 --- a/virtualizarr/readers/zarr_v3.py +++ /dev/null @@ -1,161 +0,0 @@ -import json -from pathlib import Path -from typing import Iterable, Mapping, Optional - -import numcodecs -import numpy as np -from xarray import Dataset, Index, Variable - -from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.readers.common import VirtualBackend, separate_coords -from virtualizarr.zarr import ZArray - - -class ZarrV3VirtualBackend(VirtualBackend): - @staticmethod - def open_virtual_dataset( - filepath: str, - group: str | None = None, - drop_variables: Iterable[str] | None = None, - loadable_variables: Iterable[str] | None = None, - decode_times: bool | None = None, - indexes: Mapping[str, Index] | None = None, - virtual_backend_kwargs: Optional[dict] = None, - reader_options: Optional[dict] = None, - ) -> Dataset: - """ - Read a Zarr v3 store containing chunk manifests and return an xarray Dataset containing virtualized arrays. - - This is experimental - chunk manifests are not part of the Zarr v3 Spec. - """ - - if virtual_backend_kwargs: - raise NotImplementedError( - "Zarr_v3 reader does not understand any virtual_backend_kwargs" - ) - - storepath = Path(filepath) - - if group: - raise NotImplementedError() - - if loadable_variables or decode_times: - raise NotImplementedError() - - if reader_options: - raise NotImplementedError() - - drop_vars: list[str] - if drop_variables is None: - drop_vars = [] - else: - drop_vars = list(drop_variables) - - ds_attrs = attrs_from_zarr_group_json(storepath / "zarr.json") - coord_names = ds_attrs.pop("coordinates", []) - - # TODO recursive glob to create a datatree - # Note: this .is_file() check should not be necessary according to the pathlib docs, but tests fail on github CI without it - # see https://github.com/TomNicholas/VirtualiZarr/pull/45#discussion_r1547833166 - all_paths = storepath.glob("*/") - directory_paths = [p for p in all_paths if not p.is_file()] - - vars = {} - for array_dir in directory_paths: - var_name = array_dir.name - if var_name in drop_vars: - break - - zarray, dim_names, attrs = metadata_from_zarr_json(array_dir / "zarr.json") - manifest = ChunkManifest.from_zarr_json(str(array_dir / "manifest.json")) - - marr = ManifestArray(chunkmanifest=manifest, zarray=zarray) - var = Variable(data=marr, dims=dim_names, attrs=attrs) - vars[var_name] = var - - if indexes is None: - raise NotImplementedError() - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - data_vars, coords = separate_coords(vars, indexes, coord_names) - - ds = Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) - - return ds - - -def attrs_from_zarr_group_json(filepath: Path) -> dict: - with open(filepath) as metadata_file: - attrs = json.load(metadata_file) - return attrs["attributes"] - - -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: - with open(filepath) as metadata_file: - metadata = json.load(metadata_file) - - if { - "name": "chunk-manifest-json", - "configuration": { - "manifest": "./manifest.json", - }, - } not in metadata.get("storage_transformers", []): - raise ValueError( - "Can only read byte ranges from Zarr v3 stores which 
implement the manifest storage transformer ZEP." - ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) - shape = tuple(metadata["shape"]) - zarr_format = metadata["zarr_format"] - - if metadata["fill_value"] is None: - raise ValueError( - "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" - ) - else: - fill_value = metadata["fill_value"] - - all_codecs = [ - codec - for codec in metadata["codecs"] - if codec["name"] not in ("transpose", "bytes") - ] - compressor, *filters = [ - _configurable_to_num_codec_config(_filter) for _filter in all_codecs - ] - zarray = ZArray( - chunks=chunk_shape, - compressor=compressor, - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=filters or None, - order="C", - shape=shape, - zarr_format=zarr_format, - ) - - return zarray, dim_names, attrs - - -def _configurable_to_num_codec_config(configurable: dict) -> dict: - """ - Convert a zarr v3 configurable into a numcodecs codec. - """ - configurable_copy = configurable.copy() - codec_id = configurable_copy.pop("name") - if codec_id.startswith("numcodecs."): - codec_id = codec_id[len("numcodecs.") :] - configuration = configurable_copy.pop("configuration") - return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 658cf640..d560b9ca 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,6 +37,7 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") +has_zarrV3, requires_zarrV3 = _importorskip("zarr", minversion="2.99.0") has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") has_zarr_python, requires_zarr_python = _importorskip("zarr") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 71de89fa..62aef376 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -10,7 +10,7 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.readers import HDF5VirtualBackend from virtualizarr.readers.hdf import HDFVirtualBackend -from virtualizarr.tests import requires_kerchunk +from virtualizarr.tests import network, requires_kerchunk, requires_zarrV3 from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, ) @@ -76,6 +76,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( vds = open_virtual_dataset( netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend ) + refs = vds.virtualize.to_kerchunk(format="dict") # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202 @@ -86,6 +87,44 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( assert refs["refs"]["time/0"] == expected["refs"]["time/0"] +@requires_zarrV3 +@network +@pytest.mark.skip(reason="Kerchunk & zarr-python v3 incompatibility") +@pytest.mark.parametrize( + "zarr_store", + [ + pytest.param(2, id="Zarr V2"), + pytest.param(3, id="Zarr V3"), + ], + indirect=True, +) +def test_zarr_roundtrip(zarr_store): + ds = open_virtual_dataset( + zarr_store, + indexes={}, + ) + + ds_refs = 
ds.virtualize.to_kerchunk(format="dict") + + roundtrip = dataset_from_kerchunk_refs(ds_refs) + + # This won't work right now b/c of the Kerchunk zarr-v3 incompatibility + # roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False) + + def add_prefix(file_path: str) -> str: + return "file://" + file_path + + for array in ["lat", "lon", "time", "air"]: + # V2: What should the behavior here be? Should the RT dataset have _ARRAY_DIMS? + ds[array].attrs.pop("_ARRAY_DIMENSIONS", None) + + # temp workaround b/c of the zarr-python-v3 filepath issue: https://github.com/zarr-developers/zarr-python/issues/2554 + roundtrip[array].data = roundtrip[array].data.rename_paths(add_prefix) + + # Assert equal to original dataset - ManifestArrays + xrt.assert_equal(roundtrip, ds) + + @requires_kerchunk @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: diff --git a/virtualizarr/tests/test_readers/test_zarr.py b/virtualizarr/tests/test_readers/test_zarr.py new file mode 100644 index 00000000..622bf211 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_zarr.py @@ -0,0 +1,125 @@ +import numpy as np +import pytest +import zarr + +from virtualizarr import open_virtual_dataset +from virtualizarr.manifests import ManifestArray +from virtualizarr.tests import network, requires_zarrV3 + + +@requires_zarrV3 +@network +@pytest.mark.parametrize( + "zarr_store", + [ + pytest.param( + 2, + id="Zarr V2", + marks=pytest.mark.xfail( + reason="https://github.com/zarr-developers/zarr-python/issues/2554" + ), + ), + pytest.param( + 3, + id="Zarr V3", + marks=pytest.mark.xfail( + reason="https://github.com/zarr-developers/zarr-python/issues/2554" + ), + ), + ], + indirect=True, +) +class TestOpenVirtualDatasetZarr: + def test_loadable_variables(self, zarr_store, loadable_variables=["time", "air"]): + # check that loadable variables works + vds = open_virtual_dataset( + filepath=zarr_store, loadable_variables=loadable_variables, indexes={} + ) + assert isinstance(vds["time"].data, np.ndarray) + assert isinstance(vds["air"].data, np.ndarray) + + def test_drop_variables(self, zarr_store, drop_variables=["air"]): + # check variable is dropped + vds = open_virtual_dataset( + filepath=zarr_store, drop_variables=drop_variables, indexes={} + ) + assert len(vds.data_vars) == 0 + + def test_virtual_dataset_zarr_attrs(self, zarr_store): + zg = zarr.open_group(zarr_store) + vds = open_virtual_dataset(filepath=zarr_store, indexes={}) + zg_metadata_dict = zg.metadata.to_dict() + zarr_format = zg_metadata_dict["zarr_format"] + + non_var_arrays = ["time", "lat", "lon"] + # check dims and coords are present + assert set(vds.coords) == set(non_var_arrays) + assert set(vds.dims) == set(non_var_arrays) + # check vars match + assert set(vds.keys()) == set(["air"]) + + # arrays are ManifestArrays + for array in list(vds): + assert isinstance(vds[array].data, ManifestArray) + + # check top level attrs + assert zg.attrs.asdict() == vds.attrs + + # check ZArray values + arrays = [val for val in zg.keys()] + + shared_v2_v3_attrs = [ + "shape", + "zarr_format", + ] + v2_attrs = ["chunks", "dtype", "order", "compressor", "filters"] + + def _validate_v2(attrs: list[str]): + for array in arrays: + for attr in attrs: + vds_attr = getattr(vds[array].data.zarray, attr) + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array][attr] + assert vds_attr == zarr_metadata_attr + + def _validate_v3(attrs: list[str]): + # check v2, v3 shared attrs + for array in arrays: + 
for attr in attrs: + zarr_metadata_attr = zg_metadata_dict["consolidated_metadata"][ + "metadata" + ][array][attr] + vds_attr = getattr(vds[array].data.zarray, attr) + assert vds_attr == zarr_metadata_attr + + # Cases where v2 and v3 attr keys differ: order, compressor, filters, dtype & chunks + + # chunks vs chunk_grid.configuration.chunk_shape + assert ( + getattr(vds[array].data.zarray, "chunks") + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "chunk_grid" + ]["configuration"]["chunk_shape"] + ) + + # dtype vs datatype + assert ( + getattr(vds[array].data.zarray, "dtype") + == zg_metadata_dict["consolidated_metadata"]["metadata"][array][ + "data_type" + ].to_numpy() + ) + + # order: In zarr v3, it seems like order was replaced with the transpose codec. + # compressor: removed in v3 and built into codecs + # filters: removed in v3 and built into codecs + + if zarr_format == 2: + _validate_v2(shared_v2_v3_attrs + v2_attrs) + + elif zarr_format == 3: + _validate_v3(shared_v2_v3_attrs) + + else: + raise NotImplementedError(f"Zarr format {zarr_format} not in [2,3]") diff --git a/virtualizarr/tests/test_writers/test_zarr.py b/virtualizarr/tests/test_writers/test_zarr.py deleted file mode 100644 index 5afa87a3..00000000 --- a/virtualizarr/tests/test_writers/test_zarr.py +++ /dev/null @@ -1,62 +0,0 @@ -import json - -import pytest -import xarray.testing as xrt -from xarray import Dataset - -pytest.importorskip("zarr.core.metadata.v3") - -from virtualizarr import open_virtual_dataset -from virtualizarr.backend import FileType -from virtualizarr.readers.zarr_v3 import metadata_from_zarr_json -from virtualizarr.writers.zarr import dataset_to_zarr - - -def isconfigurable(value: dict) -> bool: - """ - Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict - """ - return "name" in value and "configuration" in value - - -def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: Dataset): - """ - Checks that the output metadata of an array variable conforms to this spec - for the required attributes: - https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata - """ - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - # read the a variable's metadata - with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: - metadata = json.loads(f.read()) - assert metadata["zarr_format"] == 3 - assert metadata["node_type"] == "array" - assert isinstance(metadata["shape"], list) and all( - isinstance(dim, int) for dim in metadata["shape"] - ) - assert isinstance(metadata["data_type"], str) or isconfigurable( - metadata["data_type"] - ) - assert isconfigurable(metadata["chunk_grid"]) - assert isconfigurable(metadata["chunk_key_encoding"]) - assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) - assert ( - isinstance(metadata["codecs"], list) - and len(metadata["codecs"]) > 1 - and all(isconfigurable(codec) for codec in metadata["codecs"]) - ) - - -def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") - roundtrip = open_virtual_dataset( - tmpdir / "store.zarr", filetype=FileType.zarr_v3, indexes={} - ) - - xrt.assert_identical(roundtrip, vds_with_manifest_arrays) - - -def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: Dataset): - dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") - zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") - assert zarray == 
vds_with_manifest_arrays.a.data.zarray diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index b5ae3447..ca1f22c4 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: import fsspec.core import fsspec.spec + import upath # See pangeo_forge_recipes.storage OpenFileType = Union[ @@ -35,6 +36,7 @@ class _FsspecFSFromFilepath: filepath: str reader_options: Optional[dict] = field(default_factory=dict) fs: fsspec.AbstractFileSystem = field(init=False) + upath: upath.core.UPath = field(init=False) def open_file(self) -> OpenFileType: """Calls `.open` on fsspec.Filesystem instantiation using self.filepath as an input. @@ -50,18 +52,26 @@ def read_bytes(self, bytes: int) -> bytes: with self.open_file() as of: return of.read(bytes) + def get_mapper(self): + """Returns a mapper for use with Zarr""" + return self.fs.get_mapper(self.filepath) + def __post_init__(self) -> None: """Initialize the fsspec filesystem object""" import fsspec from upath import UPath - universal_filepath = UPath(self.filepath) - protocol = universal_filepath.protocol + # filepath may already be a UPath; handle both cases so `upath` is always defined + upath = self.filepath if isinstance(self.filepath, UPath) else UPath(self.filepath) + + self.upath = upath + self.protocol = upath.protocol + self.filepath = upath.as_uri() self.reader_options = self.reader_options or {} storage_options = self.reader_options.get("storage_options", {}) # type: ignore - self.fs = fsspec.filesystem(protocol, **storage_options) + self.fs = fsspec.filesystem(self.protocol, **storage_options) def check_for_collisions( diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index e339a3f4..3d62d002 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -61,8 +61,12 @@ def __post_init__(self) -> None: # Convert dtype string to numpy.dtype self.dtype = np.dtype(self.dtype) - if self.fill_value is None: - self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) + # Question: Why is this applied only to fill values of None? + # It was skipping numpy dtype values such as np.float32(nan) and np.int16(0), + # which caused serialization issues when writing to Kerchunk + + # if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) @property def codec(self) -> Codec:
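For reference, a minimal sketch of how the new Zarr reader could be exercised end to end, mirroring the zarr_store fixture and the tests above; the store path "air.zarr" and the variable names come from the tutorial dataset and are illustrative only. Note that, per the comments in the reader, purely local filepaths may currently fail with "Filesystem needs to support async operations" (https://github.com/zarr-developers/zarr-python/issues/2554), whereas object-store URLs such as s3:// are reported to work.

import xarray as xr

from virtualizarr import open_virtual_dataset

# Write a small Zarr store, much like the zarr_store fixture does (zarr_format=2 or 3).
ds = xr.tutorial.open_dataset("air_temperature", chunks={}).isel(time=slice(0, 10))
ds.to_zarr("air.zarr", zarr_format=3)

# Open it virtually: "air" is represented as a ManifestArray of chunk references,
# while the listed coordinates are loaded eagerly through xarray.
vds = open_virtual_dataset(
    "air.zarr",
    loadable_variables=["time", "lat", "lon"],
    indexes={},
)
print(type(vds["air"].data))  # virtualizarr.manifests.ManifestArray

# The virtual references can then be serialized, e.g. to a kerchunk dict.
refs = vds.virtualize.to_kerchunk(format="dict")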