From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 13:25:28 -0600 Subject: [PATCH 01/79] Generate chunk manifest backed variable from HDF5 dataset. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 135 ++++++++++++++++++++ virtualizarr/tests/test_readers/__init__.py | 0 virtualizarr/tests/test_readers/conftest.py | 91 +++++++++++++ virtualizarr/tests/test_readers/test_hdf.py | 71 ++++++++++ 5 files changed, 298 insertions(+) create mode 100644 virtualizarr/readers/hdf.py create mode 100644 virtualizarr/tests/test_readers/__init__.py create mode 100644 virtualizarr/tests/test_readers/conftest.py create mode 100644 virtualizarr/tests/test_readers/test_hdf.py diff --git a/pyproject.toml b/pyproject.toml index c7505bca..7994c929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", + "h5netcdf", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py new file mode 100644 index 00000000..a34ae341 --- /dev/null +++ b/virtualizarr/readers/hdf.py @@ -0,0 +1,135 @@ +from typing import List + +import h5py +import xarray as xr + +from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.zarr import ZArray + + +def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: + """ + Generate ChunkManifest for HDF5 dataset. + + Parameters + ---------- + path: str + The path the HDF5 container file + dset : h5py.Dataset + HDF5 dataset for which to create a ChunkManifest + + Returns + ------- + ChunkManifest + A Virtualizarr ChunkManifest + """ + dsid = dataset.id + + if dataset.chunks is None: + if dsid.get_offset() is None: + raise ValueError("Dataset has no space allocated in the file") + else: + key_list = [0] * (len(dataset.shape) or 1) + key = ".".join(map(str, key_list)) + chunk_entry = ChunkEntry( + path=path, + offset=dsid.get_offset(), + length=dsid.get_storage_size() + ) + chunk_entries = {key: chunk_entry} + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + else: + num_chunks = dsid.get_num_chunks() + if num_chunks == 0: + raise ValueError("The dataset is chunked but contains no chunks") + + chunk_entries = dict() + + def get_key(blob): + key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] + key = ".".join(map(str, key_list)) + return key + + def store_chunk_entry(blob): + chunk_entries[get_key(blob)] = ChunkEntry( + path=path, + offset=blob.byte_offset, + length=blob.size + ) + + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(store_chunk_entry) + else: + for index in range(num_chunks): + store_chunk_entry(dsid.get_chunk_info(index)) + + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + +def _dataset_dims(dataset: h5py.Dataset) -> List[str]: + """ + Get a list of dimension scale names attached to input HDF5 dataset. + + This is required by the xarray package to work with Zarr arrays. Only + one dimension scale per dataset dimension is allowed. If dataset is + dimension scale, it will be considered as the dimension to itself. + + Parameters + ---------- + dataset : h5py.Dataset + HDF5 dataset. + + Returns + ------- + list + List with HDF5 path names of dimension scales attached to input + dataset. + """ + dims = list() + rank = len(dataset.shape) + if rank: + for n in range(rank): + num_scales = len(dataset.dims[n]) + if num_scales == 1: + dims.append(dataset.dims[n][0].name[1:]) + elif h5py.h5ds.is_scale(dataset.id): + dims.append(dataset.name[1:]) + elif num_scales > 1: + raise ValueError( + f"{dataset.name}: {len(dataset.dims[n])} " + f"dimension scales attached to dimension #{n}" + ) + elif num_scales == 0: + # Some HDF5 files do not have dimension scales. + # If this is the case, `num_scales` will be 0. + # In this case, we mimic netCDF4 and assign phony dimension names. + # See https://github.com/fsspec/kerchunk/issues/41 + dims.append(f"phony_dim_{n}") + return dims + + +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: + # This chunk determination logic mirrors zarr-python's create + # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 + chunks = dataset.chunks if dataset.chunks else dataset.shape + zarray = ZArray( + chunks=chunks, + compressor=dataset.compression, + dtype=dataset.dtype, + fill_value=dataset.fillvalue, + filters=None, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + manifest = _dataset_chunk_manifest(path, dataset) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims) + return variable diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py new file mode 100644 index 00000000..b4504839 --- /dev/null +++ b/virtualizarr/tests/test_readers/conftest.py @@ -0,0 +1,91 @@ +import h5py +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture +def empty_chunks_netcdf4_file(tmpdir): + ds = xr.Dataset({"data": []}) + filepath = f"{tmpdir}/empty_chunks.nc" + ds.to_netcdf(filepath, engine="h5netcdf") + return filepath + + +@pytest.fixture +def empty_dataset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/empty_dataset.nc" + f = h5py.File(filepath, "w") + f.create_dataset("data", shape=(0,), dtype="f") + return filepath + + +@pytest.fixture +def no_chunks_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/no_chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + return filepath + + +@pytest.fixture +def chunked_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + return filepath + + +@pytest.fixture +def single_dimension_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/single_dimension_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + x = [0, 1] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=x) + f["x"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + return filepath + + +@pytest.fixture +def is_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/is_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f["data"].make_scale() + return filepath + + +@pytest.fixture +def multiple_dimension_scales_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_dimension_scales.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=[0, 1]) + f.create_dataset(name="y", data=[0, 1]) + f["x"].make_scale() + f["y"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[0].attach_scale(f["y"]) + return filepath + + +@pytest.fixture +def chunked_dimensions_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks_dimension.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + x = np.random.random((100, 100)) + y = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + f.create_dataset(name="x", data=x, chunks=(50, 50)) + f.create_dataset(name="y", data=y, chunks=(50, 50)) + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[1].attach_scale(f["y"]) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py new file mode 100644 index 00000000..b6b78c11 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -0,0 +1,71 @@ +import h5py +import pytest + +from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, + _dataset_to_variable) + + +class TestDatasetChunkManifest: + def test_empty_chunks(self, empty_chunks_netcdf4_file): + f = h5py.File(empty_chunks_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="chunked but contains no chunks"): + _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + + def test_empty_dataset(self, empty_dataset_netcdf4_file): + f = h5py.File(empty_dataset_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="no space allocated in the file"): + _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + + def test_no_chunking(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 1 + + def test_chunked(self, chunked_netcdf4_file): + f = h5py.File(chunked_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 4 + + +class TestDatasetDims: + def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "x" + + def test_is_dimension_scale(self, is_scale_netcdf4_file): + f = h5py.File(is_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "data" + + def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): + f = h5py.File(multiple_dimension_scales_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="dimension scales attached"): + _dataset_dims(ds) + + def test_no_dimension_scales(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims == ["phony_dim_0", "phony_dim_1"] + + +class TestDatasetToVariable: + def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) + assert var.chunks == (50, 50) + + def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + assert var.chunks == (2,) From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 14:20:38 -0600 Subject: [PATCH 02/79] Transfer dataset attrs to variable. --- virtualizarr/readers/hdf.py | 50 ++++++++++++++++++++- virtualizarr/tests/test_readers/conftest.py | 10 +++++ virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index a34ae341..d6518a30 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,7 @@ from typing import List import h5py +import numpy as np import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray @@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: return dims +def _extract_attrs(dataset: h5py.Dataset): + """ + Extract attributes from an HDF5 dataset. + + Parameters + ---------- + dataset : h5py.Dataset + An HDF5 dataset. + """ + _HIDDEN_ATTRS = { + "REFERENCE_LIST", + "CLASS", + "DIMENSION_LIST", + "NAME", + "_Netcdf4Dimid", + "_Netcdf4Coordinates", + "_nc3_strict", + "_NCProperties", + } + attrs = {} + for n, v in dataset.attrs.items(): + if n in _HIDDEN_ATTRS: + continue + # Fix some attribute values to avoid JSON encoding exceptions... + if isinstance(v, bytes): + v = v.decode("utf-8") or " " + elif isinstance(v, (np.ndarray, np.number, np.bool_)): + if v.dtype.kind == "S": + v = v.astype(str) + if n == "_FillValue": + continue + elif v.size == 1: + v = v.flatten()[0] + if isinstance(v, (np.ndarray, np.number, np.bool_)): + v = v.tolist() + else: + v = v.tolist() + elif isinstance(v, h5py._hl.base.Empty): + v = "" + if v == "DIMENSION_SCALE": + continue + + attrs[n] = v + return attrs + + def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 @@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims) + attrs = _extract_attrs(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index b4504839..2c40fe17 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath + + +@pytest.fixture +def string_attribute_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/attributes.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs["attribute_name"] = "attribute_name" + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index b6b78c11..495b7de0 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,7 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable) + _dataset_to_variable, _extract_attrs) class TestDatasetChunkManifest: @@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): ds = f["data"] var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) + + def test_dataset_attributes(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + assert var.attrs["attribute_name"] == "attribute_name" + + +class TestExtractAttributes: + def test_string_attribute(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert attrs["attribute_name"] == "attribute_name" From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 15:26:58 -0600 Subject: [PATCH 03/79] Get virtual variables dict from HDF5 file. --- virtualizarr/readers/hdf.py | 14 +++++++++++++- virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++---- virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d6518a30..9c3ebf44 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Mapping, List import h5py import numpy as np @@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable + + +def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: + variables = {} + for key in f.keys(): + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") + + return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 2c40fe17..735e922a 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir): filepath = f"{tmpdir}/chunks_dimension.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) - x = np.random.random((100, 100)) - y = np.random.random((100, 100)) + x = np.random.random((100)) + y = np.random.random((100)) f.create_dataset(name="data", data=data, chunks=(50, 50)) - f.create_dataset(name="x", data=x, chunks=(50, 50)) - f.create_dataset(name="y", data=y, chunks=(50, 50)) + f.create_dataset(name="x", data=x) + f.create_dataset(name="y", data=y) f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath @@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" return filepath + + +@pytest.fixture +def group_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/group.nc" + f = h5py.File(filepath, "w") + f.create_group("group") + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 495b7de0..da331ed9 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,8 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs) + _dataset_to_variable, _extract_attrs, + virtual_vars_from_hdf) class TestDatasetChunkManifest: @@ -83,3 +84,15 @@ def test_string_attribute(self, string_attribute_netcdf4_file): ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + + +class TestVirtualVarsFromHDF: + def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + assert len(variables) == 3 + + def test_groups_not_implemented(self, group_netcdf4_file): + f = h5py.File(group_netcdf4_file) + with pytest.raises(NotImplementedError): + virtual_vars_from_hdf(group_netcdf4_file, f) From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 12:33:24 -0600 Subject: [PATCH 04/79] Update virtual_vars_from_hdf to use fsspec and drop_variables arg. --- pyproject.toml | 2 +- virtualizarr/readers/hdf.py | 25 +++++++++++++++------ virtualizarr/tests/test_readers/conftest.py | 10 +++++++++ virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++---- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7994c929..d08621e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", - "h5netcdf", ] [project.optional-dependencies] @@ -35,6 +34,7 @@ test = [ "pytest", "scipy", "pooch", + "h5netcdf", ] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 9c3ebf44..c4ab2927 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,5 +1,6 @@ -from typing import Mapping, List +from typing import List, Mapping, Optional +import fsspec import h5py import numpy as np import xarray as xr @@ -73,6 +74,7 @@ def store_chunk_entry(blob): ) return chunk_manifest + def _dataset_dims(dataset: h5py.Dataset) -> List[str]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: return variable -def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: +def virtual_vars_from_hdf( + path: str, + drop_variables: Optional[List[str]] = None, +) -> Mapping[str, xr.Variable]: + if drop_variables is None: + drop_variables = [] + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) - variables[key] = variable - else: - raise NotImplementedError("Nested groups are not yet supported") + if key not in drop_variables: + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 735e922a..aa2b0fe0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir): f = h5py.File(filepath, "w") f.create_group("group") return filepath + + +@pytest.fixture +def multiple_datasets_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_datasets.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f.create_dataset(name="data2", data=data, chunks=None) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index da331ed9..36f7bc77 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file): class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 def test_groups_not_implemented(self, group_netcdf4_file): - f = h5py.File(group_netcdf4_file) with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file, f) + virtual_vars_from_hdf(group_netcdf4_file) + + def test_drop_variables(self, multiple_datasets_netcdf4_file): + variables = virtual_vars_from_hdf( + multiple_datasets_netcdf4_file, + ["data2"] + ) + assert "data2" not in variables.keys() From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:02:03 -0600 Subject: [PATCH 05/79] mypy fix to use ChunkKey and empty dimensions list. --- virtualizarr/readers/hdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c4ab2927..fdb9a77d 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Union import fsspec import h5py @@ -8,6 +8,8 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.zarr import ZArray +from virtualizarr.types import ChunkKey + def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: """ @@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: offset=dsid.get_offset(), length=dsid.get_storage_size() ) - chunk_entries = {key: chunk_entry} + chunk_key = ChunkKey(key) + chunk_entries = {chunk_key: chunk_entry} chunk_manifest = ChunkManifest( entries=chunk_entries ) @@ -75,7 +78,7 @@ def store_chunk_entry(blob): return chunk_manifest -def _dataset_dims(dataset: h5py.Dataset) -> List[str]: +def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: # In this case, we mimic netCDF4 and assign phony dimension names. # See https://github.com/fsspec/kerchunk/issues/41 dims.append(f"phony_dim_{n}") - return dims + return dims def _extract_attrs(dataset: h5py.Dataset): From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:40:52 -0600 Subject: [PATCH 06/79] Extract attributes from hdf5 root group. --- virtualizarr/readers/hdf.py | 18 +++++++++++++----- virtualizarr/tests/test_readers/conftest.py | 8 ++++++++ virtualizarr/tests/test_readers/test_hdf.py | 5 +++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index fdb9a77d..e02d03e7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: return dims -def _extract_attrs(dataset: h5py.Dataset): +def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): """ - Extract attributes from an HDF5 dataset. + Extract attributes from an HDF5 group or dataset. Parameters ---------- - dataset : h5py.Dataset - An HDF5 dataset. + h5obj : h5py.Group or h5py.Dataset + An HDF5 group or dataset. """ _HIDDEN_ATTRS = { "REFERENCE_LIST", @@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset): "_NCProperties", } attrs = {} - for n, v in dataset.attrs.items(): + for n, v in h5obj.attrs.items(): if n in _HIDDEN_ATTRS: continue # Fix some attribute values to avoid JSON encoding exceptions... @@ -207,3 +207,11 @@ def virtual_vars_from_hdf( raise NotImplementedError("Nested groups are not yet supported") return variables + + +def attrs_from_root_group(path: str): + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") + attrs = _extract_attrs(f) + return attrs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa2b0fe0..46ac7b2e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir): return filepath +@pytest.fixture +def root_attributes_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/root_attributes.nc" + f = h5py.File(filepath, "w") + f.attrs["attribute_name"] = "attribute_name" + return filepath + + @pytest.fixture def group_netcdf4_file(tmpdir): filepath = f"{tmpdir}/group.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 36f7bc77..a24e36ab 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file): attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + def test_root_attribute(self, root_attributes_netcdf4_file): + f = h5py.File(root_attributes_netcdf4_file) + attrs = _extract_attrs(f) + assert attrs["attribute_name"] == "attribute_name" + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 14:19:17 -0600 Subject: [PATCH 07/79] Use hdf reader for netcdf4 files. --- virtualizarr/xarray.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 5c3c8548..415b0a05 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,7 +8,8 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType +from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -76,18 +77,28 @@ def open_virtual_dataset( if common: raise ValueError(f"Cannot both load and drop variables {common}") + if filetype is None: + filetype = _automatically_determine_filetype(filepath) + filetype = FileType(filetype) + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables + ) + ds_attrs = attrs_from_root_group(path=filepath) # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + else: + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) if indexes is None or len(loadable_variables) > 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:57:39 +0000 Subject: [PATCH 08/79] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 415b0a05..2213ffa9 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -9,7 +9,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 8 May 2024 18:03:04 -0600 Subject: [PATCH 09/79] Fix ruff complaints. --- virtualizarr/readers/hdf.py | 3 +-- virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++--- virtualizarr/xarray.py | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index e02d03e7..af25c029 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,9 +6,8 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray - from virtualizarr.types import ChunkKey +from virtualizarr.zarr import ZArray def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a24e36ab..0d5a16db 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -1,9 +1,13 @@ import h5py import pytest -from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs, - virtual_vars_from_hdf) +from virtualizarr.readers.hdf import ( + _dataset_chunk_manifest, + _dataset_dims, + _dataset_to_variable, + _extract_attrs, + virtual_vars_from_hdf, +) class TestDatasetChunkManifest: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fbf6136f..9629a344 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -18,9 +18,13 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.kerchunk import ( + FileType, + KerchunkStoreRefs, + _automatically_determine_filetype, +) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 10 May 2024 15:10:30 -0600 Subject: [PATCH 10/79] First steps for handling HDF5 filters. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 7 +- virtualizarr/readers/hdf_filters.py | 34 +++++++++ virtualizarr/tests/test_readers/conftest.py | 26 +++++++ .../tests/test_readers/test_hdf_filters.py | 31 ++++++++ .../test_readers/test_hdf_integration.py | 21 ++++++ virtualizarr/xarray.py | 71 +++++++++---------- 7 files changed, 153 insertions(+), 38 deletions(-) create mode 100644 virtualizarr/readers/hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py diff --git a/pyproject.toml b/pyproject.toml index 79a50789..4818b5f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "numpy", "ujson", "packaging", + "hdf5plugin", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index af25c029..7d95d996 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,6 +6,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.zarr import ZArray @@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, - compressor=dataset.compression, + compressor=None, dtype=dataset.dtype, fill_value=dataset.fillvalue, - filters=None, + filters=filters, order="C", shape=dataset.shape, zarr_format=2, diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py new file mode 100644 index 00000000..6070fc17 --- /dev/null +++ b/virtualizarr/readers/hdf_filters.py @@ -0,0 +1,34 @@ +from typing import List, Tuple, Union + +import h5py +import numcodecs.registry as registry +from numcodecs.abc import Codec + +_non_standard_filters = { + "gzip": "zlib" +} + + +def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: + try: + id = int(filter_id) + except ValueError: + id = filter_id + + if isinstance(id, str): + if id in _non_standard_filters.keys(): + id = _non_standard_filters[id] + conf = {"id": id} + if id == "zlib": + conf["level"] = filter_properties + + codec = registry.get_codec(conf) + return codec + + +def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: + codecs = [] + for filter_id, filter_properties in dataset._filters.items(): + codec = _filter_to_codec(filter_id, filter_properties) + codecs.append(codec) + return codecs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 46ac7b2e..4f0d4fce 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f.create_dataset(name="data2", data=data, chunks=None) return filepath + + +@pytest.fixture +def np_uncompressed(): + return np.arange(100) + + +@pytest.fixture +def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): + filepath = f"{tmpdir}/gzip.nc" + f = h5py.File(filepath, "w") + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + return filepath + + +@pytest.fixture +def gzip_filter_xarray_netcdf4_file(tmpdir): + ds = xr.tutorial.open_dataset("air_temperature") + encoding = {} + for var_name in ds.variables: + # encoding[var_name] = {"zlib": True, "compression_opts": 1} + encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + + filepath = f"{tmpdir}/gzip_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py new file mode 100644 index 00000000..50a5d08c --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -0,0 +1,31 @@ +import h5py +import numcodecs +import pytest + +from virtualizarr.readers.hdf_filters import ( + _filter_to_codec, + codecs_from_dataset, +) + + +class TestFilterToCodec: + def test_gzip_uses_zlib_nomcodec(self): + codec = _filter_to_codec("gzip", 1) + assert isinstance(codec, numcodecs.zlib.Zlib) + + def test_lzf_not_available(self): + with pytest.raises(ValueError, match="codec not available"): + _filter_to_codec("lzf") + + +class TestCodecsFromDataSet: + def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): + f = h5py.File(gzip_filter_netcdf4_file) + ds = f["data"] + chunk_info = ds.id.get_chunk_info(0) + codecs = codecs_from_dataset(ds) + with open(gzip_filter_netcdf4_file, 'rb') as file: + file.seek(chunk_info.byte_offset) + bytes_read = file.read(chunk_info.size) + decoded = codecs[0].decode(bytes_read) + assert decoded == np_uncompressed.tobytes() diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py new file mode 100644 index 00000000..45bfadcd --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -0,0 +1,21 @@ +import fsspec +import numpy +import xarray as xr + +import virtualizarr +from virtualizarr.kerchunk import FileType + + +class TestIntegration: + def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + virtual_ds = virtualizarr.open_virtual_dataset( + gzip_filter_xarray_netcdf4_file, + filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + assert isinstance(ds.air.values[0][0][0], numpy.float64) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 9629a344..24ba973a 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -128,48 +128,47 @@ def open_virtual_dataset( ) ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - ds = xr.open_dataset(filepath, drop_variables=drop_variables) - - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + ds = xr.open_dataset(filepath, drop_variables=drop_variables) + + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() else: - loadable_vars = {} - indexes = {} + indexes = dict(**indexes) # for type hinting: to allow mutation - vars = {**virtual_vars, **loadable_vars} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - data_vars, coords = separate_coords(vars, indexes) + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - return vds + return vds def open_virtual_dataset_from_v3_store( From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 13 May 2024 12:36:29 -0600 Subject: [PATCH 11/79] Initial step for hdf5plugin supported codecs. --- virtualizarr/readers/hdf_filters.py | 25 +++++++++++++++ virtualizarr/tests/test_readers/conftest.py | 31 +++++++++++++------ .../tests/test_readers/test_hdf_filters.py | 20 +++++++++--- .../test_readers/test_hdf_integration.py | 7 +++-- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 6070fc17..75f06bdc 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,14 +1,30 @@ from typing import List, Tuple, Union import h5py +import hdf5plugin import numcodecs.registry as registry from numcodecs.abc import Codec +from pydantic import BaseModel, validator _non_standard_filters = { "gzip": "zlib" } +class BloscProperties(BaseModel): + blocksize: int + clevel: int + shuffle: int + cname: str + + @validator("cname", pre=True) + def get_cname_from_code(cls, v): + blosc_compressor_codes = { + value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + } + return blosc_compressor_codes[v] + + def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: try: id = int(filter_id) @@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None conf = {"id": id} if id == "zlib": conf["level"] = filter_properties + elif isinstance(id, int): + filter = hdf5plugin.get_filters(id)[0] + id = filter.filter_name + if id == "blosc": + blosc_props = BloscProperties(**{k: v for k, v in + zip(BloscProperties.__fields__.keys(), + filter_properties[-4:])}) + conf = blosc_props.model_dump() + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 4f0d4fce..cc9331e1 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,4 +1,5 @@ import h5py +import hdf5plugin import numpy as np import pytest import xarray as xr @@ -132,22 +133,32 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture -def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): - filepath = f"{tmpdir}/gzip.nc" +@pytest.fixture(params=["gzip", "blosc"]) +def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): + filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "gzip": + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "blosc": + f.create_dataset(name="data", data=np_uncompressed, + **hdf5plugin.Blosc( + cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE + )) return filepath -@pytest.fixture -def gzip_filter_xarray_netcdf4_file(tmpdir): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} + if request.param == "gzip": + encoding_config = { + "zlib": True, + "complevel": 1 + } for var_name in ds.variables: - # encoding[var_name] = {"zlib": True, "compression_opts": 1} - encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + encoding[var_name] = encoding_config - filepath = f"{tmpdir}/gzip_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 50a5d08c..8094d4cf 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -9,7 +9,7 @@ class TestFilterToCodec: - def test_gzip_uses_zlib_nomcodec(self): + def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) @@ -17,14 +17,26 @@ def test_lzf_not_available(self): with pytest.raises(ValueError, match="codec not available"): _filter_to_codec("lzf") + def test_blosc(self): + codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) + assert isinstance(codec, numcodecs.blosc.Blosc) + expected_config = { + "id": "blosc", + "blocksize": 800, + "clevel": 9, + "shuffle": 2, + "cname": "lz4", + } + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: - def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): - f = h5py.File(gzip_filter_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(gzip_filter_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, 'rb') as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 45bfadcd..94fc0c1c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,12 +7,13 @@ class TestIntegration: - def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + def test_filters_end_to_end(self, tmpdir, + filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - gzip_filter_xarray_netcdf4_file, + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) - kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 16 May 2024 16:24:57 -0400 Subject: [PATCH 12/79] Small commit to check compression support in CI environment. --- pyproject.toml | 1 + virtualizarr/tests/test_readers/conftest.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4818b5f1..bba695eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ test = [ "scipy", "pooch", "ruff", + "netcdf4", ] diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cc9331e1..8dc82c33 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) +@pytest.fixture(params=["gzip", "blosc_lz"]) def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} @@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): "zlib": True, "complevel": 1 } + if request.param == "blosc_lz": + encoding_config = { + "compression": "blosc_lz", + } + for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 21:34:26 +0000 Subject: [PATCH 13/79] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/hdf.py | 16 ++++---------- virtualizarr/readers/hdf_filters.py | 22 ++++++++++++------- virtualizarr/tests/test_readers/conftest.py | 18 +++++++-------- virtualizarr/tests/test_readers/test_hdf.py | 5 +---- .../tests/test_readers/test_hdf_filters.py | 2 +- .../test_readers/test_hdf_integration.py | 6 ++--- virtualizarr/xarray.py | 5 ++--- 7 files changed, 33 insertions(+), 41 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 7d95d996..78e718e4 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) chunk_entry = ChunkEntry( - path=path, - offset=dsid.get_offset(), - length=dsid.get_storage_size() + path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) chunk_entries = {chunk_key: chunk_entry} - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() @@ -60,9 +56,7 @@ def get_key(blob): def store_chunk_entry(blob): chunk_entries[get_key(blob)] = ChunkEntry( - path=path, - offset=blob.byte_offset, - length=blob.size + path=path, offset=blob.byte_offset, length=blob.size ) has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) @@ -72,9 +66,7 @@ def store_chunk_entry(blob): for index in range(num_chunks): store_chunk_entry(dsid.get_chunk_info(index)) - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..77e7037e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,9 +6,7 @@ from numcodecs.abc import Codec from pydantic import BaseModel, validator -_non_standard_filters = { - "gzip": "zlib" -} +_non_standard_filters = {"gzip": "zlib"} class BloscProperties(BaseModel): @@ -20,12 +18,15 @@ class BloscProperties(BaseModel): @validator("cname", pre=True) def get_cname_from_code(cls, v): blosc_compressor_codes = { - value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + value: key + for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() } return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec( + filter_id: str, filter_properties: Union[int, Tuple] = None +) -> Codec: try: id = int(filter_id) except ValueError: @@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None filter = hdf5plugin.get_filters(id)[0] id = filter.filter_name if id == "blosc": - blosc_props = BloscProperties(**{k: v for k, v in - zip(BloscProperties.__fields__.keys(), - filter_properties[-4:])}) + blosc_props = BloscProperties( + **{ + k: v + for k, v in zip( + BloscProperties.__fields__.keys(), filter_properties[-4:] + ) + } + ) conf = blosc_props.model_dump() conf["id"] = id diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa66f933..53c9630e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + f.create_dataset( + name="data", data=np_uncompressed, compression="gzip", compression_opts=1 + ) if request.param == "blosc": - f.create_dataset(name="data", data=np_uncompressed, - **hdf5plugin.Blosc( - cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE - )) + f.create_dataset( + name="data", + data=np_uncompressed, + **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), + ) return filepath @@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": - encoding_config = { - "zlib": True, - "complevel": 1 - } + encoding_config = {"zlib": True, "complevel": 1} for var_name in ds.variables: encoding[var_name] = encoding_config diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 0d5a16db..a83bfc39 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file): virtual_vars_from_hdf(group_netcdf4_file) def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf( - multiple_datasets_netcdf4_file, - ["data2"] - ) + variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 8094d4cf..28b5d69f 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -36,7 +36,7 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 94fc0c1c..b31289c0 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,11 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, - filter_encoded_xarray_netcdf4_files): + def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, - filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 72645654..d8b6a080 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -20,8 +20,8 @@ _automatically_determine_filetype, ) from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, @@ -109,8 +109,7 @@ def open_virtual_dataset( if filetype.name.lower() == "netcdf4": print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables + path=filepath, drop_variables=drop_variables ) ds_attrs = attrs_from_root_group(path=filepath) if filetype == "zarr_v3": From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:12:50 -0600 Subject: [PATCH 14/79] Fix mypy complaints for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..7a8bcc81 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, TypedDict, Union import h5py import hdf5plugin @@ -25,26 +25,30 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec: + id_int = None + id_str = None try: - id = int(filter_id) + id_int = int(filter_id) except ValueError: - id = filter_id + id_str = filter_id - if isinstance(id, str): - if id in _non_standard_filters.keys(): - id = _non_standard_filters[id] + if id_str: + if id_str in _non_standard_filters.keys(): + id = _non_standard_filters[id_str] + else: + id = id_str conf = {"id": id} if id == "zlib": - conf["level"] = filter_properties - elif isinstance(id, int): - filter = hdf5plugin.get_filters(id)[0] + conf["level"] = filter_properties # type: ignore[assignment] + if id_int: + filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name - if id == "blosc": + if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties(**{k: v for k, v in zip(BloscProperties.__fields__.keys(), filter_properties[-4:])}) - conf = blosc_props.model_dump() + conf = blosc_props.model_dump() # type: ignore[assignment] conf["id"] = id codec = registry.get_codec(conf) From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:22:39 -0600 Subject: [PATCH 15/79] Local pre-commit fix for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a3868ebd..dfe1c1f3 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, TypedDict, Union +from typing import List, Tuple, Union import h5py import hdf5plugin From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:13:44 -0600 Subject: [PATCH 16/79] Use fsspec reader_options introduced in #37. --- virtualizarr/readers/hdf.py | 22 ++++++++++++++++------ virtualizarr/xarray.py | 7 ++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 78e718e4..19d99b3f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,5 @@ from typing import List, Mapping, Optional, Union -import fsspec import h5py import numpy as np import xarray as xr @@ -8,6 +7,7 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: def virtual_vars_from_hdf( path: str, drop_variables: Optional[List[str]] = None, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, ) -> Mapping[str, xr.Variable]: if drop_variables is None: drop_variables = [] - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): @@ -203,9 +207,15 @@ def virtual_vars_from_hdf( return variables -def attrs_from_root_group(path: str): - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") +def attrs_from_root_group( + path: str, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, +): + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") attrs = _extract_attrs(f) return attrs diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8b6a080..8f810ee1 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -107,11 +107,12 @@ def open_virtual_dataset( filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": - print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, drop_variables=drop_variables + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, ) - ds_attrs = attrs_from_root_group(path=filepath) + ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:40:38 -0600 Subject: [PATCH 17/79] Fix incorrect zarr_v3 if block position from merge commit ef0d7a8. --- virtualizarr/xarray.py | 128 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 8f810ee1..d76e2a67 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -101,82 +101,86 @@ def open_virtual_dataset( if virtual_array_class is not ManifestArray: raise NotImplementedError() - - if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) - filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": - virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables, - reader_options=reader_options, - ) - ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) else: - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) + if filetype is None: + filetype = _automatically_determine_filetype(filepath=filepath) + filetype = FileType(filetype) + + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, + ) + ds_attrs = attrs_from_root_group( + path=filepath, reader_options=reader_options + ) + else: + # this is the only place we actually always need to use kerchunk directly + # TODO avoid even reading byte ranges for variables that will be dropped later anyway? + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get( + ".zattrs", {} + ) - ds = xr.open_dataset(fpath, drop_variables=drop_variables) + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation + ds = xr.open_dataset(fpath, drop_variables=drop_variables) - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - vars = {**virtual_vars, **loadable_vars} + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - data_vars, coords = separate_coords(vars, indexes) - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - return vds + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds def open_virtual_dataset_from_v3_store( From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 08:48:05 -0600 Subject: [PATCH 18/79] Fix early return from hdf _extract_attrs. --- virtualizarr/readers/hdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 19d99b3f..be93237f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): continue attrs[n] = v - return attrs + return attrs def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 09:23:50 -0600 Subject: [PATCH 19/79] Test that _extract_attrs correctly handles multiple attributes. --- virtualizarr/tests/test_readers/conftest.py | 3 ++- virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 53c9630e..fe2ec889 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attribute_netcdf4_file(tmpdir): +def string_attributes_netcdf4_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" + f["data"].attrs["attribute_name2"] = "attribute_name2" return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a83bfc39..a67352e6 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_dataset_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] - var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_netcdf4_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_string_attribute(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" @@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file): attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" + def test_multiple_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert len(attrs.keys()) == 2 + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 22 May 2024 14:16:12 -0600 Subject: [PATCH 20/79] Initial attempt at scale and offset via numcodecs. --- virtualizarr/readers/hdf.py | 14 ++++++++--- virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index be93237f..c251866b 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -5,7 +5,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.readers.hdf_filters import codecs_from_dataset +from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, - dtype=dataset.dtype, + dtype=dtype, fill_value=dataset.fillvalue, filters=filters, order="C", @@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index dfe1c1f3..169eab97 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,10 +1,13 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, TypedDict, Union import h5py import hdf5plugin import numcodecs.registry as registry +import numpy as np from numcodecs.abc import Codec +from numcodecs.fixedscaleoffset import FixedScaleOffset from pydantic import BaseModel, validator +from xarray.coding.variables import _choose_float_dtype _non_standard_filters = {"gzip": "zlib"} @@ -24,6 +27,11 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class CFCodec(TypedDict): + target_dtype: np.dtype + codec: Codec + + def _filter_to_codec( filter_id: str, filter_properties: Union[int, None, Tuple] = None ) -> Codec: @@ -61,6 +69,32 @@ def _filter_to_codec( return codec +def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: + attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} + mapping = {} + if "scale_factor" in attributes: + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + else: + mapping["scale_factor"] = 1 + if "add_offset" in attributes: + mapping["add_offset"] = attributes["add_offset"] + else: + mapping["add_offset"] = 0 + if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: + float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) + target_dtype = np.dtype(float_dtype) + codec = FixedScaleOffset( + offset=mapping["add_offset"], + scale=mapping["scale_factor"], + dtype=target_dtype, + astype=dataset.dtype, + ) + cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) + return cfcodec + else: + return None + + def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: codecs = [] for filter_id, filter_properties in dataset._filters.items(): From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 23 May 2024 10:54:48 -0600 Subject: [PATCH 21/79] Tests for cfcodec_from_dataset. --- virtualizarr/tests/test_readers/conftest.py | 10 +++++++ .../tests/test_readers/test_hdf_filters.py | 29 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index fe2ec889..202cdd9c 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/offset.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs.create(name="add_offset", data=5) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 28b5d69f..dca9f40d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,9 +1,11 @@ import h5py import numcodecs +import numpy as np import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, + cfcodec_from_dataset, codecs_from_dataset, ) @@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) assert decoded == np_uncompressed.tobytes() + + +class TestCFCodecFromDataset: + def test_no_cf_convention(self, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) + ds = f["data"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec is None + + def test_cf_scale_factor(self, netcdf4_file): + f = h5py.File(netcdf4_file) + ds = f["air"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec["target_dtype"] == np.dtype(np.float64) + assert cf_codec["codec"].scale == 100.0 + assert cf_codec["codec"].offset == 0 + assert cf_codec["codec"].dtype == " Date: Fri, 24 May 2024 12:47:12 -0600 Subject: [PATCH 22/79] Temporarily relax integration tests to assert_allclose. --- virtualizarr/tests/test_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 064968b3..1b9aad83 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset @@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_open_scalar_variable(tmpdir): From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 13:50:49 -0600 Subject: [PATCH 23/79] Add blosc_lz4 fixture parameterization to confirm libnetcdf environment. --- virtualizarr/tests/test_readers/conftest.py | 13 +++++++++---- .../tests/test_readers/test_hdf_integration.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 202cdd9c..20d5433e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -134,7 +134,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc"]) +@pytest.fixture(params=["gzip", "blosc_lz4"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): f.create_dataset( name="data", data=np_uncompressed, compression="gzip", compression_opts=1 ) - if request.param == "blosc": + if request.param == "blosc_lz4": f.create_dataset( name="data", data=np_uncompressed, @@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_netcdf4_files(tmpdir, request): +@pytest.fixture(params=["gzip", "blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + if "blosc" in request.param: + encoding_config = { + "compression": request.param, + } for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index b31289c0..ade8e7ce 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): + def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 14:05:21 -0600 Subject: [PATCH 24/79] Check for compatability with netcdf4 engine. --- virtualizarr/tests/test_readers/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 20d5433e..cb1212f0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 12:58:48 -0600 Subject: [PATCH 25/79] Use separate fixtures for h5netcdf and netcdf4 compression styles. --- virtualizarr/tests/test_readers/conftest.py | 27 ++++++++++++++----- .../test_readers/test_hdf_integration.py | 20 ++++++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cb1212f0..a4fafed3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from xarray.tests.test_dataset import create_test_data @pytest.fixture @@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip", "blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_h5netcdf_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + + for var_name in ds.variables: + encoding[var_name] = encoding_config + + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath + + +@pytest.fixture(params=["blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): + ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { "compression": request.param, + "chunksizes": (20, 40), + "original_shape": ds.var2.shape, + "blosc_shuffle": 1, + "fletcher32": False, } - for var_name in ds.variables: - encoding[var_name] = encoding_config - + ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index ade8e7ce..d6ecf2f1 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,11 @@ class TestIntegration: - def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): + def test_filters_h5netcdf_roundtrip( + self, tmpdir, filter_encoded_xarray_h5netcdf_file + ): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + + def test_filters_netcdf4_roundtrip( + self, tmpdir, filter_encoded_xarray_netcdf4_file + ): + virtual_ds = virtualizarr.open_virtual_dataset( + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + print(ds["var2"].encoding) From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 15:49:22 -0600 Subject: [PATCH 26/79] Print libhdf5 and libnetcdf4 versions to confirm compiled environment. --- virtualizarr/tests/test_readers/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index a4fafed3..8904dd38 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -4,6 +4,7 @@ import pytest import xarray as xr from xarray.tests.test_dataset import create_test_data +from xarray.util.print_versions import netcdf_and_hdf5_versions @pytest.fixture @@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" + print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") return filepath From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 17:07:51 -0600 Subject: [PATCH 27/79] Skip netcdf4 style compression tests when libhdf5 < 1.14. --- virtualizarr/tests/test_readers/conftest.py | 15 ++++++++++++--- .../test_readers/test_hdf_integration.py | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 8904dd38..0ddb2a01 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from packaging.version import Version from xarray.tests.test_dataset import create_test_data from xarray.util.print_versions import netcdf_and_hdf5_versions @@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request): return filepath +@pytest.fixture() +def skip_test_for_libhdf5_version(): + versions = netcdf_and_hdf5_versions() + libhdf5_version = Version(versions[0][1]) + return libhdf5_version < Version("1.14") + + @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): + if skip_test_for_libhdf5_version: + pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { @@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") - return filepath + return {"filepath": filepath, "compressor": request.param} @pytest.fixture diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index d6ecf2f1..f51ebd45 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip( def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): + filepath = filter_encoded_xarray_netcdf4_file["filepath"] + compressor = filter_encoded_xarray_netcdf4_file["compressor"] virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filepath, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - print(ds["var2"].encoding) + + expected_encoding = ds["var2"].encoding.copy() + compression = expected_encoding.pop("compression") + blosc_shuffle = expected_encoding.pop("blosc_shuffle") + if compression is not None: + if "blosc" in compression and blosc_shuffle: + expected_encoding["blosc"] = { + "compressor": compressor, + "shuffle": blosc_shuffle, + } + expected_encoding["shuffle"] = False + actual_encoding = ds["var2"].encoding + assert expected_encoding.items() <= actual_encoding.items() From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 13:57:51 -0600 Subject: [PATCH 28/79] Include imagecodecs.numcodecs to support HDF5 lzf filters. --- pyproject.toml | 1 + virtualizarr/readers/hdf_filters.py | 2 +- virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f0563f09..773cccc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "fsspec", "s3fs", "fastparquet", + "imagecodecs-numcodecs", ] diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 169eab97..08a3bba4 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib"} +_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} class BloscProperties(BaseModel): diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index dca9f40d..b5b04047 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,7 +1,7 @@ import h5py +import imagecodecs import numcodecs import numpy as np -import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, @@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) - def test_lzf_not_available(self): - with pytest.raises(ValueError, match="codec not available"): - _filter_to_codec("lzf") + def test_lzf(self): + codec = _filter_to_codec("lzf") + assert isinstance(codec, imagecodecs.numcodecs.Lzf) def test_blosc(self): codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 16:24:30 -0600 Subject: [PATCH 29/79] Remove test that verifies call to read_kerchunk_references_from_file. --- virtualizarr/tests/test_xarray.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 695759bd..d145550e 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,5 +1,4 @@ from collections.abc import Mapping -from unittest.mock import patch import numpy as np import pytest @@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file): for name in full_ds.variables: if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 15:05:34 -0600 Subject: [PATCH 30/79] Add additional codec support structures for imagecodecs and numcodecs. --- virtualizarr/readers/hdf_filters.py | 23 +++++++++++++++++---- virtualizarr/tests/test_readers/conftest.py | 9 +++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 08a3bba4..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,12 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} +_non_standard_filters = { + "gzip": "zlib", + "lzf": "imagecodecs_lzf", +} + +_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} class BloscProperties(BaseModel): @@ -27,6 +32,10 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class ZstdProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -41,18 +50,20 @@ def _filter_to_codec( id_int = int(filter_id) except ValueError: id_str = filter_id - + conf = {} if id_str: if id_str in _non_standard_filters.keys(): id = _non_standard_filters[id_str] else: id = id_str - conf = {"id": id} + conf["id"] = id # type: ignore[assignment] if id == "zlib": conf["level"] = filter_properties # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name + if id in _hdf5plugin_imagecodecs.keys(): + id = _hdf5plugin_imagecodecs[id] if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties( **{ @@ -63,7 +74,11 @@ def _filter_to_codec( } ) conf = blosc_props.model_dump() # type: ignore[assignment] - conf["id"] = id + if id == "zstd" and isinstance(filter_properties, tuple): + zstd_props = ZstdProperties(level=filter_properties[0]) + conf = zstd_props.model_dump() # type: ignore[assignment] + + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 0ddb2a01..3e6f9c3f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): data=np_uncompressed, **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), ) + if request.param == "lz4": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) + if request.param == "bzip2": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) + if request.param == "zstd": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + return filepath From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 16:17:04 -0600 Subject: [PATCH 31/79] Add codec config test for Zstd. --- virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index b5b04047..4d23a756 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -31,6 +31,12 @@ def test_blosc(self): } assert codec.get_config() == expected_config + def test_zstd(self): + codec = _filter_to_codec("32015", (5,)) + assert isinstance(codec, numcodecs.zstd.Zstd) + expected_config = {"id": "zstd", "level": 5} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 18:03:26 -0600 Subject: [PATCH 32/79] Include initial cf decoding tests. --- virtualizarr/readers/hdf_filters.py | 3 +- virtualizarr/tests/test_readers/conftest.py | 34 ++++++++++++++++--- .../tests/test_readers/test_hdf_filters.py | 28 +++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..f4e2dcfa 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"] + # mapping["scale_factor"] =attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 3e6f9c3f..e1a53c5e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve @pytest.fixture -def add_offset_netcdf4_file(tmpdir): +def np_uncompressed_int16(): + return np.arange(100, dtype=np.int16) + + +@pytest.fixture +def offset(): + return np.float32(5.0) + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f["data"].attrs.create(name="add_offset", data=5) + data = np_uncompressed_int16 - offset + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + return filepath + + +@pytest.fixture +def scale_factor(): + return 0.01 + + +@pytest.fixture +def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): + filepath = f"{tmpdir}/scale_offset.nc" + f = h5py.File(filepath, "w") + data = (np_uncompressed_int16 - offset) / scale_factor + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 4d23a756..960bcf2c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file): assert cf_codec["codec"].scale == 1 assert cf_codec["codec"].offset == 5 assert cf_codec["codec"].dtype == " Date: Thu, 20 Jun 2024 19:49:54 -0600 Subject: [PATCH 33/79] Revert typo for scale_factor retrieval. --- virtualizarr/readers/hdf_filters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index f4e2dcfa..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"] - # mapping["scale_factor"] =attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 20:12:44 -0600 Subject: [PATCH 34/79] Update reader to use new numpy manifest representation. --- virtualizarr/readers/hdf.py | 29 ++++++++++----------- virtualizarr/tests/test_readers/test_hdf.py | 4 +-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c251866b..b96bdff7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry} + chunk_entries = {chunk_key: chunk_entry.dict()} chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") + paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty((num_chunks), dtype=np.int32) + lengths = np.empty((num_chunks), dtype=np.int32) - chunk_entries = dict() - - def get_key(blob): - key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] - key = ".".join(map(str, key_list)) - return key - - def store_chunk_entry(blob): - chunk_entries[get_key(blob)] = ChunkEntry( - path=path, offset=blob.byte_offset, length=blob.size - ) + def add_chunk_info(blob, chunk_index): + offsets[chunk_index] = blob.byte_offset + lengths[chunk_index] = blob.size + chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - dsid.chunk_iter(store_chunk_entry) + chunk_index = 0 + dsid.chunk_iter(add_chunk_info, chunk_index) else: for index in range(num_chunks): - store_chunk_entry(dsid.get_chunk_info(index)) + add_chunk_info(dsid.get_chunk_info(index), index) - chunk_manifest = ChunkManifest(entries=chunk_entries) + chunk_manifest = ChunkManifest.from_arrays( + paths=paths, offsets=offsets, lengths=lengths + ) return chunk_manifest diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a67352e6..8c5a40a7 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 1 + assert len(manifest) == 1 def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 4 + assert len(manifest) == 4 class TestDatasetDims: From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:05:01 -0600 Subject: [PATCH 35/79] Temporarily skip test until blosc netcdf4 issue is solved. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f51ebd45..dca34dbd 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,5 +1,6 @@ import fsspec import numpy +import pytest import xarray as xr import virtualizarr @@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip( ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + @pytest.mark.skip( + reason="Issue with xr 'dim1' serialization and blosc availability" + ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:24:07 -0600 Subject: [PATCH 36/79] Fix Pydantic 2 migration warnings. --- virtualizarr/readers/hdf_filters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..cc8e810e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,7 +6,7 @@ import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from xarray.coding.variables import _choose_float_dtype _non_standard_filters = { @@ -23,7 +23,7 @@ class BloscProperties(BaseModel): shuffle: int cname: str - @validator("cname", pre=True) + @field_validator("cname", mode="before") def get_cname_from_code(cls, v): blosc_compressor_codes = { value: key @@ -69,7 +69,7 @@ def _filter_to_codec( **{ k: v for k, v in zip( - BloscProperties.__fields__.keys(), filter_properties[-4:] + BloscProperties.model_fields.keys(), filter_properties[-4:] ) } ) From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:30:55 -0600 Subject: [PATCH 37/79] Include hdf5plugin and imagecodecs-numcodecs in mamba test environment. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 0385ea5a..e909beec 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,6 +14,7 @@ dependencies: - ujson - packaging - universal_pathlib + - hdf5plugin # Testing - codecov - pre-commit @@ -26,3 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet + - imagecodecs-numcodecs From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:35:24 -0600 Subject: [PATCH 38/79] Mamba attempt with imagecodecs rather than imagecodecs-numcodecs. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index e909beec..20784a6e 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs-numcodecs + - imagecodecs From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:41:14 -0600 Subject: [PATCH 39/79] Mamba attempt with latest imagecodecs release. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 20784a6e..fb967bcd 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs + - imagecodecs>=2024.6.1 From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:34:35 -0600 Subject: [PATCH 40/79] Use correct iter_chunks callback function signtature. --- virtualizarr/readers/hdf.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index b96bdff7..d082b717 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: def add_chunk_info(blob, chunk_index): offsets[chunk_index] = blob.byte_offset lengths[chunk_index] = blob.size - chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - chunk_index = 0 - dsid.chunk_iter(add_chunk_info, chunk_index) + + def create_callback(initial=0): + value = initial + + def callback(blob): + nonlocal value + add_chunk_info(blob, chunk_index=value) + value += 1 + + return callback + + callback = create_callback() + dsid.chunk_iter(callback) else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index), index) From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:35:40 -0600 Subject: [PATCH 41/79] Include pip based imagecodecs-numcodecs until conda-forge availability. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index fb967bcd..e2f5a865 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -28,3 +28,5 @@ dependencies: - s3fs - fastparquet - imagecodecs>=2024.6.1 + - pip: + - imagecodecs-numcodecs From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 15:43:10 -0600 Subject: [PATCH 42/79] Handle non-coordinate dims which are serialized to hdf as empty dataset. --- virtualizarr/readers/hdf.py | 65 ++++++++++++--------- virtualizarr/tests/test_integration.py | 18 +++++- virtualizarr/tests/test_readers/test_hdf.py | 1 + virtualizarr/xarray.py | 2 +- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d082b717..cbbe824f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -11,7 +11,9 @@ from virtualizarr.zarr import ZArray -def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: +def _dataset_chunk_manifest( + path: str, dataset: h5py.Dataset +) -> Optional[ChunkManifest]: """ Generate ChunkManifest for HDF5 dataset. @@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: if dataset.chunks is None: if dsid.get_offset() is None: - raise ValueError("Dataset has no space allocated in the file") + return None else: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) @@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): return attrs -def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.append(cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - else: - dtype = dataset.dtype - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=dataset.fillvalue, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) + manifest = _dataset_chunk_manifest(path, dataset) - marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + if manifest: + chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype + filters = [codec.get_config() for codec in codecs] + zarray = ZArray( + chunks=chunks, + compressor=None, + dtype=dtype, + fill_value=dataset.fillvalue, + filters=filters, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + else: + variable = None return variable @@ -217,7 +223,8 @@ def virtual_vars_from_hdf( if key not in drop_variables: if isinstance(f[key], h5py.Dataset): variable = _dataset_to_variable(path, f[key]) - variables[key] = variable + if variable is not None: + variables[key] = variable else: raise NotImplementedError("Nested groups are not yet supported") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 451862c6..6a1f91ef 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset + # assert all_close to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset - xrt.assert_identical(roundtrip, ds) + # assert all_close to original dataset + xrt.assert_allclose(roundtrip, ds) + + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 @@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format): # assert equal to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100 diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 8c5a40a7..c744cd68 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file): with pytest.raises(ValueError, match="chunked but contains no chunks"): _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") def test_empty_dataset(self, empty_dataset_netcdf4_file): f = h5py.File(empty_dataset_netcdf4_file) ds = f["data"] diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 39bd0671..a8a23693 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -121,7 +121,7 @@ def open_virtual_dataset( ds_attrs = attrs_from_root_group( path=filepath, reader_options=reader_options ) - coord_names = None + coord_names = ds_attrs.pop("coordinates", []) else: # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 16:23:43 -0600 Subject: [PATCH 43/79] Use reader_options for filetype check and update failing kerchunk call. --- virtualizarr/tests/test_xarray.py | 18 +++++++++++++----- virtualizarr/xarray.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583bf..282d4ad1 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -8,6 +8,7 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset +from virtualizarr.kerchunk import FileType from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import network, requires_s3fs from virtualizarr.zarr import ZArray @@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") + @patch("virtualizarr.xarray._automatically_determine_filetype") + @patch("virtualizarr.xarray.virtual_vars_from_hdf") def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file + self, mock_reader, mock_determine_filetype, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} + mock_determine_filetype.return_value = FileType.netcdf4 open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { + reader_args = { + "path": netcdf4_file, + "drop_variables": [], + "reader_options": reader_options, + } + mock_reader.assert_called_once_with(**reader_args) + filetype_args = { "filepath": netcdf4_file, - "filetype": None, "reader_options": reader_options, } - mock_read_kerchunk.assert_called_once_with(**args) + mock_determine_filetype.assert_called_once_with(**filetype_args) class TestRenamePaths: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index a8a23693..86a59c8d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -109,7 +109,9 @@ def open_virtual_dataset( ) else: if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:20:06 -0600 Subject: [PATCH 44/79] Fix chunkmanifest shaping for chunked datasets. --- virtualizarr/readers/hdf.py | 36 +++++++++------------ virtualizarr/tests/test_readers/test_hdf.py | 10 ++++-- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index cbbe824f..d683f693 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,3 +1,4 @@ +import math from typing import List, Mapping, Optional, Union import h5py @@ -48,32 +49,27 @@ def _dataset_chunk_manifest( num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") - paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty((num_chunks), dtype=np.int32) - lengths = np.empty((num_chunks), dtype=np.int32) - def add_chunk_info(blob, chunk_index): - offsets[chunk_index] = blob.byte_offset - lengths[chunk_index] = blob.size + shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) + paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty(shape, dtype=np.int32) + lengths = np.empty(shape, dtype=np.int32) - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - - def create_callback(initial=0): - value = initial + def get_key(blob): + return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]) - def callback(blob): - nonlocal value - add_chunk_info(blob, chunk_index=value) - value += 1 + def add_chunk_info(blob): + key = get_key(blob) + paths[key] = path + offsets[key] = blob.byte_offset + lengths[key] = blob.size - return callback - - callback = create_callback() - dsid.chunk_iter(callback) + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(add_chunk_info) else: for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index), index) + add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( paths=paths, offsets=offsets, lengths=lengths diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index c744cd68..25caab93 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest) == 1 + assert manifest.shape_chunk_grid == (1, 1) def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest) == 4 + assert manifest.shape_chunk_grid == (2, 2) + + def test_chunked_roundtrip(self, chunked_roundtrip): + f = h5py.File(chunked_roundtrip) + ds = f["var2"] + manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:22:05 -0600 Subject: [PATCH 45/79] Handle scale_factor attribute serialization for compressed files. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index cc8e810e..1a3c2220 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + try: + scale_factor = attributes["scale_factor"][0] + except IndexError: + scale_factor = attributes["scale_factor"] + mapping["scale_factor"] = 1 / scale_factor else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:13 -0600 Subject: [PATCH 46/79] Include chunked roundtrip fixture. --- virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index e1a53c5e..5fbec00e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve "blosc_shuffle": 1, "fletcher32": False, } - + # Check on how handle scalar dim. + ds = ds.drop_dims("dim3") ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="netcdf4") @@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f f["data"].attrs.create(name="add_offset", data=offset) f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath + + +@pytest.fixture() +def chunked_roundtrip(tmpdir): + ds = create_test_data(dim_sizes=(20, 80, 10)) + ds = ds.drop_dims("dim3") + filepath = f"{tmpdir}/chunked_xarray.nc" + ds.to_netcdf( + filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} + ) + return filepath From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:48 -0600 Subject: [PATCH 47/79] Standardize xarray integration tests for hdf filters. --- .../test_readers/test_hdf_integration.py | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dca34dbd..abc23df6 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,53 +1,34 @@ -import fsspec -import numpy import pytest import xarray as xr +import xarray.testing as xrt import virtualizarr from virtualizarr.kerchunk import FileType class TestIntegration: + @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - virtual_ds = virtualizarr.open_virtual_dataset( + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + vds = virtualizarr.open_virtual_dataset( filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - - ds = xr.open_dataset(m, engine="kerchunk") - assert isinstance(ds.air.values[0][0][0], numpy.float64) + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset( + kerchunk_file, engine="kerchunk", decode_times=False + ) + xrt.assert_allclose(ds, roundtrip) - @pytest.mark.skip( - reason="Issue with xr 'dim1' serialization and blosc availability" - ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): filepath = filter_encoded_xarray_netcdf4_file["filepath"] - compressor = filter_encoded_xarray_netcdf4_file["compressor"] - virtual_ds = virtualizarr.open_virtual_dataset( - filepath, filetype=FileType("netcdf4") - ) + ds = xr.open_dataset(filepath) + vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - - expected_encoding = ds["var2"].encoding.copy() - compression = expected_encoding.pop("compression") - blosc_shuffle = expected_encoding.pop("blosc_shuffle") - if compression is not None: - if "blosc" in compression and blosc_shuffle: - expected_encoding["blosc"] = { - "compressor": compressor, - "shuffle": blosc_shuffle, - } - expected_encoding["shuffle"] = False - actual_encoding = ds["var2"].encoding - assert expected_encoding.items() <= actual_encoding.items() + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 20:00:56 -0600 Subject: [PATCH 48/79] Update reader selection logic for new filetype determination. --- virtualizarr/xarray.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1a795e56..9671264d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -136,8 +136,7 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": + if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5": virtual_vars = virtual_vars_from_hdf( path=filepath, drop_variables=drop_variables, From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:07:17 -0600 Subject: [PATCH 49/79] Use decode_times for integration test. --- .../tests/test_readers/test_hdf_integration.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index abc23df6..882dea31 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -7,19 +6,18 @@ class TestIntegration: - @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, + loadable_variables=["time"], + cftime_variables=["time"], ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset( - kerchunk_file, engine="kerchunk", decode_times=False - ) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:38:46 -0600 Subject: [PATCH 50/79] Standardize fixture names for hdf5 vs netcdf4 file types. --- virtualizarr/tests/test_readers/conftest.py | 36 +++++---- virtualizarr/tests/test_readers/test_hdf.py | 78 +++++++++---------- .../tests/test_readers/test_hdf_filters.py | 26 +++---- .../test_readers/test_hdf_integration.py | 10 +-- 4 files changed, 76 insertions(+), 74 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 5fbec00e..539b2fbb 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -9,7 +9,7 @@ @pytest.fixture -def empty_chunks_netcdf4_file(tmpdir): +def empty_chunks_hdf5_file(tmpdir): ds = xr.Dataset({"data": []}) filepath = f"{tmpdir}/empty_chunks.nc" ds.to_netcdf(filepath, engine="h5netcdf") @@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir): @pytest.fixture -def empty_dataset_netcdf4_file(tmpdir): +def empty_dataset_hdf5_file(tmpdir): filepath = f"{tmpdir}/empty_dataset.nc" f = h5py.File(filepath, "w") f.create_dataset("data", shape=(0,), dtype="f") @@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir): @pytest.fixture -def no_chunks_netcdf4_file(tmpdir): +def no_chunks_hdf5_file(tmpdir): filepath = f"{tmpdir}/no_chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir): @pytest.fixture -def chunked_netcdf4_file(tmpdir): +def chunked_hdf5_file(tmpdir): filepath = f"{tmpdir}/chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) @@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir): @pytest.fixture -def single_dimension_scale_netcdf4_file(tmpdir): +def single_dimension_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/single_dimension_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir): @pytest.fixture -def is_scale_netcdf4_file(tmpdir): +def is_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/is_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir): @pytest.fixture -def multiple_dimension_scales_netcdf4_file(tmpdir): +def multiple_dimension_scales_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_dimension_scales.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attributes_netcdf4_file(tmpdir): +def string_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir): @pytest.fixture -def root_attributes_netcdf4_file(tmpdir): +def root_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/root_attributes.nc" f = h5py.File(filepath, "w") f.attrs["attribute_name"] = "attribute_name" @@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir): @pytest.fixture -def group_netcdf4_file(tmpdir): +def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") f.create_group("group") @@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir): @pytest.fixture -def multiple_datasets_netcdf4_file(tmpdir): +def multiple_datasets_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_datasets.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -138,7 +138,7 @@ def np_uncompressed(): @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) -def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): +def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": @@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): @pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_h5netcdf_file(tmpdir, request): +def filter_encoded_roundtrip_hdf5_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": @@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version(): @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): +def filter_encoded_roundtrip_netcdf4_file( + tmpdir, request, skip_test_for_libhdf5_version +): if skip_test_for_libhdf5_version: pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) @@ -215,7 +217,7 @@ def offset(): @pytest.fixture -def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): +def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") data = np_uncompressed_int16 - offset @@ -230,7 +232,7 @@ def scale_factor(): @pytest.fixture -def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): +def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): filepath = f"{tmpdir}/scale_offset.nc" f = h5py.File(filepath, "w") data = (np_uncompressed_int16 - offset) / scale_factor @@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f @pytest.fixture() -def chunked_roundtrip(tmpdir): +def chunked_roundtrip_hdf5_file(tmpdir): ds = create_test_data(dim_sizes=(20, 80, 10)) ds = ds.drop_dims("dim3") filepath = f"{tmpdir}/chunked_xarray.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 25caab93..1fb0f6ee 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -11,59 +11,59 @@ class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_netcdf4_file): - f = h5py.File(empty_chunks_netcdf4_file) + def test_empty_chunks(self, empty_chunks_hdf5_file): + f = h5py.File(empty_chunks_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="chunked but contains no chunks"): - _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds) @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") - def test_empty_dataset(self, empty_dataset_netcdf4_file): - f = h5py.File(empty_dataset_netcdf4_file) + def test_empty_dataset(self, empty_dataset_hdf5_file): + f = h5py.File(empty_dataset_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="no space allocated in the file"): - _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds) - def test_no_chunking(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_chunking(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (1, 1) - def test_chunked(self, chunked_netcdf4_file): - f = h5py.File(chunked_netcdf4_file) + def test_chunked(self, chunked_hdf5_file): + f = h5py.File(chunked_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 2) - def test_chunked_roundtrip(self, chunked_roundtrip): - f = h5py.File(chunked_roundtrip) + def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): + f = h5py.File(chunked_roundtrip_hdf5_file) ds = f["var2"] - manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "x" - def test_is_dimension_scale(self, is_scale_netcdf4_file): - f = h5py.File(is_scale_netcdf4_file) + def test_is_dimension_scale(self, is_scale_hdf5_file): + f = h5py.File(is_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "data" - def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): - f = h5py.File(multiple_dimension_scales_netcdf4_file) + def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): + f = h5py.File(multiple_dimension_scales_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="dimension scales attached"): _dataset_dims(ds) - def test_no_dimension_scales(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_dimension_scales(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims == ["phony_dim_0", "phony_dim_1"] @@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) assert var.chunks == (50, 50) - def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] - var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_dataset_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] - var = _dataset_to_variable(string_attributes_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_hdf5_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_string_attribute(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" - def test_root_attribute(self, root_attributes_netcdf4_file): - f = h5py.File(root_attributes_netcdf4_file) + def test_root_attribute(self, root_attributes_hdf5_file): + f = h5py.File(root_attributes_hdf5_file) attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" - def test_multiple_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_multiple_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert len(attrs.keys()) == 2 @@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_netcdf4_file): + def test_groups_not_implemented(self, group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file) + virtual_vars_from_hdf(group_hdf5_file) - def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) + def test_drop_variables(self, multiple_datasets_hdf5_file): + variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 960bcf2c..99b3af48 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -39,12 +39,12 @@ def test_zstd(self): class TestCodecsFromDataSet: - def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, "rb") as file: + with open(filter_encoded_hdf5_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) @@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): class TestCFCodecFromDataset: - def test_no_cf_convention(self, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_no_cf_convention(self, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] cf_codec = cfcodec_from_dataset(ds) assert cf_codec is None @@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file): assert cf_codec["codec"].dtype == " Date: Sun, 30 Jun 2024 22:14:26 -0600 Subject: [PATCH 51/79] Handle array add_offset property for compressed data. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 1a3c2220..5b35d8ff 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: else: mapping["scale_factor"] = 1 if "add_offset" in attributes: - mapping["add_offset"] = attributes["add_offset"] + try: + offset = attributes["add_offset"][0] + except IndexError: + offset = attributes["add_offset"] + mapping["add_offset"] = offset else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 16:57:11 -0600 Subject: [PATCH 52/79] Include h5py shuffle filter. --- virtualizarr/readers/hdf_filters.py | 18 ++++++++++++++---- .../tests/test_readers/test_hdf_filters.py | 11 ++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 5b35d8ff..a60dd56a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -36,6 +36,14 @@ class ZstdProperties(BaseModel): level: int +class ShuffleProperties(BaseModel): + elementsize: int + + +class ZlibProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -56,9 +64,13 @@ def _filter_to_codec( id = _non_standard_filters[id_str] else: id = id_str - conf["id"] = id # type: ignore[assignment] if id == "zlib": - conf["level"] = filter_properties # type: ignore[assignment] + zlib_props = ZlibProperties(level=filter_properties) + conf = zlib_props.model_dump() # type: ignore[assignment] + if id == "shuffle" and isinstance(filter_properties, tuple): + shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) + conf = shuffle_props.model_dump() # type: ignore[assignment] + conf["id"] = id # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name @@ -77,9 +89,7 @@ def _filter_to_codec( if id == "zstd" and isinstance(filter_properties, tuple): zstd_props = ZstdProperties(level=filter_properties[0]) conf = zstd_props.model_dump() # type: ignore[assignment] - conf["id"] = id - codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 99b3af48..efaad781 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -37,6 +37,12 @@ def test_zstd(self): expected_config = {"id": "zstd", "level": 5} assert codec.get_config() == expected_config + def test_shuffle(self): + codec = _filter_to_codec("shuffle", (7,)) + assert isinstance(codec, numcodecs.shuffle.Shuffle) + expected_config = {"id": "shuffle", "elementsize": 7} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): @@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) - assert decoded == np_uncompressed.tobytes() + if isinstance(decoded, np.ndarray): + assert decoded.tobytes() == np_uncompressed.tobytes() + else: + assert decoded == np_uncompressed.tobytes() class TestCFCodecFromDataset: From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 17:03:46 -0600 Subject: [PATCH 53/79] Make ScaleAndOffset codec last in filters list. --- virtualizarr/readers/hdf.py | 2 +- virtualizarr/tests/test_readers/conftest.py | 36 ++++++++++++++++++- .../test_readers/test_hdf_integration.py | 10 ++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d683f693..f3337c04 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab cfcodec = cfcodec_from_dataset(dataset) attrs = _extract_attrs(dataset) if cfcodec: - codecs.append(cfcodec["codec"]) + codecs.insert(0, cfcodec["codec"]) dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 539b2fbb..afc0beea 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) if request.param == "zstd": f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + if request.param == "shuffle": + f.create_dataset(name="data", data=np_uncompressed, shuffle=True) return filepath @@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir): filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} ) return filepath + + +@pytest.fixture(params=["gzip", "zlib"]) +def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): + x = np.arange(100) + y = np.arange(100) + temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + ds = xr.Dataset( + {"temperature": (["x", "y"], temperature)}, + coords={"x": np.arange(100), "y": np.arange(100)}, + ) + encoding = { + "temperature": { + "dtype": "int16", + "scale_factor": 0.1, + "add_offset": 273.15, + } + } + if request.param == "gzip": + encoding["temperature"]["compression"] = "gzip" + encoding["temperature"]["compression_opts"] = 7 + + if request.param == "zlib": + encoding["temperature"]["zlib"] = True + encoding["temperature"]["complevel"] = 9 + + from random import randint + + filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 4fc7bd3e..dd8d6c3b 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,3 +1,4 @@ +import pytest import xarray as xr import xarray.testing as xrt @@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip( vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) + + @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") + def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_allclose(ds, roundtrip) From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 13:23:23 -0600 Subject: [PATCH 54/79] Apply ScaleAndOffset codec to _FillValue since it's value is now downstream. --- virtualizarr/readers/hdf.py | 4 +++- virtualizarr/tests/test_readers/conftest.py | 7 ++++++- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index f3337c04..6197067f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) + fill_value = cfcodec["codec"].decode(dataset.fillvalue) else: dtype = dataset.dtype + fill_value = dataset.fillvalue filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, dtype=dtype, - fill_value=dataset.fillvalue, + fill_value=fill_value, filters=filters, order="C", shape=dataset.shape, diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index afc0beea..ec4132ba 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir): def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): x = np.arange(100) y = np.arange(100) + fill_value = np.int16(-9999) temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + temperature[0][0] = fill_value ds = xr.Dataset( {"temperature": (["x", "y"], temperature)}, coords={"x": np.arange(100), "y": np.arange(100)}, @@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): "dtype": "int16", "scale_factor": 0.1, "add_offset": 273.15, - } + "_FillValue": fill_value, + }, + "x": {"_FillValue": fill_value}, + "y": {"_FillValue": fill_value}, } if request.param == "gzip": encoding["temperature"]["compression"] = "gzip" diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dd8d6c3b..5cf3f79d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip( roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 15:12:04 -0600 Subject: [PATCH 55/79] Coerce scale and add_offset values to native float for JSON serialization. --- virtualizarr/readers/hdf_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a60dd56a..ae232fec 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: scale_factor = attributes["scale_factor"][0] except IndexError: scale_factor = attributes["scale_factor"] - mapping["scale_factor"] = 1 / scale_factor + mapping["scale_factor"] = float(1 / scale_factor) else: mapping["scale_factor"] = 1 if "add_offset" in attributes: @@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: offset = attributes["add_offset"][0] except IndexError: offset = attributes["add_offset"] - mapping["add_offset"] = offset + mapping["add_offset"] = float(offset) else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From f4f9c8f643c34cbabe71faa6b439853499d4464a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 8 Aug 2024 19:36:39 -0600 Subject: [PATCH 56/79] Temporarily xfail integration tests for main --- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 ++ virtualizarr/tests/test_xarray.py | 1 + 2 files changed, 3 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 5cf3f79d..9d5d2a26 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,3 +1,4 @@ +import pytest import xarray as xr import xarray.testing as xrt @@ -6,6 +7,7 @@ class TestIntegration: + @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_roundtrip_hdf5_file ): diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d5b5f360..8942f4ad 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -427,6 +427,7 @@ def test_open_virtual_dataset_passes_expected_args( } mock_determine_filetype.assert_called_once_with(**filetype_args) + @pytest.mark.xfail(reason="Requires discussion on validity of empty datasets") def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): vds = open_virtual_dataset(hdf5_empty) assert vds.empty.dims == () From a9e59f2bff085bf23d3d99849eaf68adca49fb80 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 8 Oct 2024 15:13:37 -0400 Subject: [PATCH 57/79] Remove pydantic dependency as per pull/210. --- virtualizarr/readers/hdf_filters.py | 35 +++++++++---------- .../tests/test_readers/test_hdf_filters.py | 2 +- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index ae232fec..349d10dc 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,3 +1,4 @@ +import dataclasses from typing import List, Tuple, TypedDict, Union import h5py @@ -6,7 +7,6 @@ import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, field_validator from xarray.coding.variables import _choose_float_dtype _non_standard_filters = { @@ -17,30 +17,33 @@ _hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} -class BloscProperties(BaseModel): +@dataclasses.dataclass +class BloscProperties: blocksize: int clevel: int shuffle: int cname: str - @field_validator("cname", mode="before") - def get_cname_from_code(cls, v): + def __post_init__(self): blosc_compressor_codes = { value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() } - return blosc_compressor_codes[v] + self.cname = blosc_compressor_codes[self.cname] -class ZstdProperties(BaseModel): +@dataclasses.dataclass +class ZstdProperties: level: int -class ShuffleProperties(BaseModel): +@dataclasses.dataclass +class ShuffleProperties: elementsize: int -class ZlibProperties(BaseModel): +@dataclasses.dataclass +class ZlibProperties: level: int @@ -66,10 +69,10 @@ def _filter_to_codec( id = id_str if id == "zlib": zlib_props = ZlibProperties(level=filter_properties) - conf = zlib_props.model_dump() # type: ignore[assignment] + conf = dataclasses.asdict(zlib_props) if id == "shuffle" and isinstance(filter_properties, tuple): shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) - conf = shuffle_props.model_dump() # type: ignore[assignment] + conf = dataclasses.asdict(shuffle_props) conf["id"] = id # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] @@ -77,18 +80,14 @@ def _filter_to_codec( if id in _hdf5plugin_imagecodecs.keys(): id = _hdf5plugin_imagecodecs[id] if id == "blosc" and isinstance(filter_properties, tuple): + blosc_fields = [field.name for field in dataclasses.fields(BloscProperties)] blosc_props = BloscProperties( - **{ - k: v - for k, v in zip( - BloscProperties.model_fields.keys(), filter_properties[-4:] - ) - } + **{k: v for k, v in zip(blosc_fields, filter_properties[-4:])} ) - conf = blosc_props.model_dump() # type: ignore[assignment] + conf = dataclasses.asdict(blosc_props) if id == "zstd" and isinstance(filter_properties, tuple): zstd_props = ZstdProperties(level=filter_properties[0]) - conf = zstd_props.model_dump() # type: ignore[assignment] + conf = dataclasses.asdict(zstd_props) conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index efaad781..b2581c58 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -34,7 +34,7 @@ def test_blosc(self): def test_zstd(self): codec = _filter_to_codec("32015", (5,)) assert isinstance(codec, numcodecs.zstd.Zstd) - expected_config = {"id": "zstd", "level": 5} + expected_config = {"id": "zstd", "level": 5, "checksum": False} assert codec.get_config() == expected_config def test_shuffle(self): From 2b33bc2a46c3742e886151eb064b439efde2e8f2 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 8 Oct 2024 15:15:12 -0400 Subject: [PATCH 58/79] Update test for new kerchunk reader module location. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 9d5d2a26..5973a8b9 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -3,7 +3,7 @@ import xarray.testing as xrt import virtualizarr -from virtualizarr.kerchunk import FileType +from virtualizarr.readers.kerchunk import FileType class TestIntegration: From a57ae9e7578c3e6167f8fc423af6a2c11891a8ab Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 11:14:20 -0400 Subject: [PATCH 59/79] Fix branch typing errors. --- virtualizarr/readers/hdf.py | 6 +++--- virtualizarr/readers/hdf_filters.py | 6 +++--- virtualizarr/tests/test_readers/conftest.py | 4 ++-- virtualizarr/tests/test_readers/test_hdf.py | 2 +- virtualizarr/tests/test_readers/test_hdf_filters.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 42127ba3..bcf896a8 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,7 +1,7 @@ import math -from typing import List, Mapping, Optional, Union +from typing import Dict, List, Optional, Union -import h5py +import h5py # type: ignore import numpy as np import xarray as xr @@ -209,7 +209,7 @@ def virtual_vars_from_hdf( reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} }, -) -> Mapping[str, xr.Variable]: +) -> Dict[str, xr.Variable]: if drop_variables is None: drop_variables = [] open_file = _fsspec_openfile_from_filepath( diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 349d10dc..aedf89b3 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,8 +1,8 @@ import dataclasses from typing import List, Tuple, TypedDict, Union -import h5py -import hdf5plugin +import h5py # type: ignore +import hdf5plugin # type: ignore import numcodecs.registry as registry import numpy as np from numcodecs.abc import Codec @@ -68,7 +68,7 @@ def _filter_to_codec( else: id = id_str if id == "zlib": - zlib_props = ZlibProperties(level=filter_properties) + zlib_props = ZlibProperties(level=filter_properties) # type: ignore conf = dataclasses.asdict(zlib_props) if id == "shuffle" and isinstance(filter_properties, tuple): shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index ec4132ba..c47c26c9 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,5 +1,5 @@ -import h5py -import hdf5plugin +import h5py # type: ignore +import hdf5plugin # type: ignore import numpy as np import pytest import xarray as xr diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 1fb0f6ee..32970a33 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -1,4 +1,4 @@ -import h5py +import h5py # type: ignore import pytest from virtualizarr.readers.hdf import ( diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index b2581c58..c05a7eeb 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,4 +1,4 @@ -import h5py +import h5py # type: ignore import imagecodecs import numcodecs import numpy as np From e21fc6976d8c42633c8c6ee413855aee9ddb997f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 12:56:45 -0400 Subject: [PATCH 60/79] Re-include automatic file type determination. --- virtualizarr/backend.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index fb9a452b..61fa5b96 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -162,6 +162,13 @@ def open_virtual_dataset( if reader_options is None: reader_options = {} + from virtualizarr.readers.kerchunk import _automatically_determine_filetype + + if filetype is None: + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) + filetype = FileType(filetype) if filetype == FileType.hdf5: from virtualizarr.readers.hdf import ( attrs_from_root_group, From df69a12db513df051d82cde43fb3ce958b0d02ba Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 16:27:56 -0400 Subject: [PATCH 61/79] Handle various hdf flavors of _FillValue storage. --- virtualizarr/readers/hdf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index bcf896a8..9a0d6307 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -31,7 +31,6 @@ def _dataset_chunk_manifest( A Virtualizarr ChunkManifest """ dsid = dataset.id - if dataset.chunks is None: if dsid.get_offset() is None: return None @@ -49,7 +48,6 @@ def _dataset_chunk_manifest( num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") - shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore offsets = np.empty(shape, dtype=np.uint64) @@ -184,6 +182,10 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab else: dtype = dataset.dtype fill_value = dataset.fillvalue + if isinstance(fill_value, np.ndarray): + fill_value = fill_value[0] + if np.isnan(fill_value): + fill_value = float("nan") filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, From 169337c7d4c2bd5764f10fd038d63a4bd4d5fb94 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 16:28:57 -0400 Subject: [PATCH 62/79] Include loadable variables in drop variables list. --- virtualizarr/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 61fa5b96..0cc7b14a 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -177,7 +177,7 @@ def open_virtual_dataset( virtual_vars = virtual_vars_from_hdf( path=filepath, - drop_variables=drop_variables, + drop_variables=drop_variables + loadable_variables, reader_options=reader_options, ) ds_attrs = attrs_from_root_group( From bdcbfbf70bada38cfa196e748113e49e6b74f3e9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 16:31:04 -0400 Subject: [PATCH 63/79] Mock readers.hdf.virtual_vars_from_hdf to verify option passing. --- virtualizarr/tests/test_backend.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index e42ad9ac..bb68c186 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -313,19 +313,18 @@ def test_group_kwarg(self, hdf5_groups_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file") + @patch("virtualizarr.readers.hdf.virtual_vars_from_hdf") def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file + self, mock_read_hdf, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) args = { - "filepath": netcdf4_file, - "filetype": None, - "group": None, + "path": netcdf4_file, + "drop_variables": [], "reader_options": reader_options, } - mock_read_kerchunk.assert_called_once_with(**args) + mock_read_hdf.assert_called_once_with(**args) def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): vds = open_virtual_dataset(hdf5_empty) From 77f1689aee1e9288a518ae78d6066b9a7435e62f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 9 Oct 2024 16:41:10 -0400 Subject: [PATCH 64/79] Convert numpy _FillValue to native Python for serialization support. --- virtualizarr/readers/hdf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 9a0d6307..8d2c44ce 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -186,6 +186,8 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab fill_value = fill_value[0] if np.isnan(fill_value): fill_value = float("nan") + if isinstance(fill_value, np.generic): + fill_value = fill_value.item() filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, From 42c653ad0c0d098d1b652c65df242a51363e9867 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 10 Oct 2024 19:03:43 -0400 Subject: [PATCH 65/79] Support groups with HDF5 reader. --- virtualizarr/backend.py | 3 ++- virtualizarr/readers/hdf.py | 22 +++++++++++++++++---- virtualizarr/tests/test_backend.py | 5 +++-- virtualizarr/tests/test_readers/conftest.py | 15 +++++++++++++- virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++++++++--- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 0cc7b14a..076fc559 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -177,11 +177,12 @@ def open_virtual_dataset( virtual_vars = virtual_vars_from_hdf( path=filepath, + group=group, drop_variables=drop_variables + loadable_variables, reader_options=reader_options, ) ds_attrs = attrs_from_root_group( - path=filepath, reader_options=reader_options + path=filepath, reader_options=reader_options, group=group ) coord_names = ds_attrs.pop("coordinates", []) # we currently read every other filetype using kerchunks various file format backends diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 8d2c44ce..8db6d781 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -209,6 +209,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab def virtual_vars_from_hdf( path: str, + group: Optional[str] = None, drop_variables: Optional[List[str]] = None, reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} @@ -220,11 +221,17 @@ def virtual_vars_from_hdf( filepath=path, reader_options=reader_options ) f = h5py.File(open_file, mode="r") + if group: + g = f[group] + if not isinstance(g, h5py.Group): + raise ValueError("The provided group is not an HDF group") + else: + g = f variables = {} - for key in f.keys(): + for key in g.keys(): if key not in drop_variables: - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) + if isinstance(g[key], h5py.Dataset): + variable = _dataset_to_variable(path, g[key]) if variable is not None: variables[key] = variable else: @@ -235,6 +242,7 @@ def virtual_vars_from_hdf( def attrs_from_root_group( path: str, + group: Optional[str] = None, reader_options: Optional[dict] = { "storage_options": {"key": "", "secret": "", "anon": True} }, @@ -243,5 +251,11 @@ def attrs_from_root_group( filepath=path, reader_options=reader_options ) f = h5py.File(open_file, mode="r") - attrs = _extract_attrs(f) + if group: + g = f[group] + if not isinstance(g, h5py.Group): + raise ValueError("The provided group is not an HDF group") + else: + g = f + attrs = _extract_attrs(g) return attrs diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index bb68c186..3feab262 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -293,9 +293,9 @@ def test_explicit_filetype(self, netcdf4_file): open_virtual_dataset(netcdf4_file, filetype="grib") def test_group_kwarg(self, hdf5_groups_file): - with pytest.raises(ValueError, match="Multiple HDF Groups found"): + with pytest.raises(NotImplementedError, match="Nested groups"): open_virtual_dataset(hdf5_groups_file) - with pytest.raises(ValueError, match="not found in"): + with pytest.raises(KeyError, match="doesn't exist"): open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") vars_to_load = ["air", "time"] @@ -321,6 +321,7 @@ def test_open_virtual_dataset_passes_expected_args( open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) args = { "path": netcdf4_file, + "group": None, "drop_variables": [], "reader_options": reader_options, } diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index c47c26c9..b0b7c41f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -118,7 +118,20 @@ def root_attributes_hdf5_file(tmpdir): def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") - f.create_group("group") + g = f.create_group("group") + data = np.random.random((10, 10)) + g.create_dataset("data", data=data) + return filepath + + +@pytest.fixture +def nested_group_hdf5_file(tmpdir): + filepath = f"{tmpdir}/nested_group.nc" + f = h5py.File(filepath, "w") + g = f.create_group("group") + data = np.random.random((10, 10)) + g.create_dataset("data", data=data) + g.create_group("nested_group") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 32970a33..cc9e2dff 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -113,10 +113,20 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_hdf5_file): + def test_nested_groups_not_implemented(self, nested_group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_hdf5_file) + virtual_vars_from_hdf(path=nested_group_hdf5_file, group="group") def test_drop_variables(self, multiple_datasets_hdf5_file): - variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) + variables = virtual_vars_from_hdf( + path=multiple_datasets_hdf5_file, drop_variables=["data2"] + ) assert "data2" not in variables.keys() + + def test_dataset_in_group(self, group_hdf5_file): + variables = virtual_vars_from_hdf(path=group_hdf5_file, group="group") + assert len(variables) == 1 + + def test_non_group_error(self, group_hdf5_file): + with pytest.raises(ValueError): + virtual_vars_from_hdf(path=group_hdf5_file, group="group/data") From 9c86e0d2c0f8135b0a53cbf5313bfa11cc2a222e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 17 Oct 2024 15:11:57 -0400 Subject: [PATCH 66/79] Handle empty variables with a shape. --- virtualizarr/readers/hdf.py | 64 ++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 8db6d781..65b97eeb 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -167,43 +167,43 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 + chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.insert(0, cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + fill_value = cfcodec["codec"].decode(dataset.fillvalue) + else: + dtype = dataset.dtype + fill_value = dataset.fillvalue + if isinstance(fill_value, np.ndarray): + fill_value = fill_value[0] + if np.isnan(fill_value): + fill_value = float("nan") + if isinstance(fill_value, np.generic): + fill_value = fill_value.item() + filters = [codec.get_config() for codec in codecs] + zarray = ZArray( + chunks=chunks, + compressor=None, + dtype=dtype, + fill_value=fill_value, + filters=filters, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + dims = _dataset_dims(dataset) manifest = _dataset_chunk_manifest(path, dataset) if manifest: - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.insert(0, cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - fill_value = cfcodec["codec"].decode(dataset.fillvalue) - else: - dtype = dataset.dtype - fill_value = dataset.fillvalue - if isinstance(fill_value, np.ndarray): - fill_value = fill_value[0] - if np.isnan(fill_value): - fill_value = float("nan") - if isinstance(fill_value, np.generic): - fill_value = fill_value.item() - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=fill_value, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) else: - variable = None + variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs) return variable From 15897765e61454331dd1fa4a9d151c8673dbb179 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 18:31:10 -0400 Subject: [PATCH 67/79] Import top-level version of xarray classes. --- virtualizarr/readers/hdf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 30dd402f..dd67475e 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -3,8 +3,7 @@ import h5py # type: ignore import numpy as np -from xarray import Dataset, Variable -from xarray.core.indexes import Index +from xarray import Dataset, Index, Variable from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.readers.common import ( From 772c5800b30507dac37c083b0afa119442e995e3 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 20:59:35 -0400 Subject: [PATCH 68/79] Add option to explicitly specify use of an experimental hdf backend. --- virtualizarr/backend.py | 13 ++- virtualizarr/tests/test_backend.py | 85 +++++++++++++------ virtualizarr/tests/test_integration.py | 30 +++++-- .../test_readers/test_hdf_integration.py | 20 +++-- virtualizarr/tests/test_xarray.py | 36 +++++--- 5 files changed, 126 insertions(+), 58 deletions(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 19aebfdd..3ab76d1f 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -13,12 +13,13 @@ from virtualizarr.readers import ( DMRPPVirtualBackend, FITSVirtualBackend, - HDFVirtualBackend, + HDF5VirtualBackend, KerchunkVirtualBackend, NetCDF3VirtualBackend, TIFFVirtualBackend, ZarrV3VirtualBackend, ) +from virtualizarr.readers.common import VirtualBackend from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions # TODO add entrypoint to allow external libraries to add to this mapping @@ -26,9 +27,9 @@ "kerchunk": KerchunkVirtualBackend, "zarr_v3": ZarrV3VirtualBackend, "dmrpp": DMRPPVirtualBackend, - "hdf5": HDFVirtualBackend, - "netcdf4": HDFVirtualBackend, # note this is the same as for hdf5 # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends) + "hdf5": HDF5VirtualBackend, + "netcdf4": HDF5VirtualBackend, # note this is the same as for hdf5 "netcdf3": NetCDF3VirtualBackend, "tiff": TIFFVirtualBackend, "fits": FITSVirtualBackend, @@ -113,6 +114,7 @@ def open_virtual_dataset( indexes: Mapping[str, Index] | None = None, virtual_array_class=ManifestArray, reader_options: Optional[dict] = None, + backend: Optional[VirtualBackend] = None, ) -> Dataset: """ Open a file or store as an xarray Dataset wrapping virtualized zarr arrays. @@ -182,7 +184,10 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) - backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) + if backend: + backend_cls = backend + else: + backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) if backend_cls is None: raise NotImplementedError(f"Unsupported file type: {filetype.name}") diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 43a6bbd8..2368848a 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -11,6 +11,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.backend import FileType, automatically_determine_filetype from virtualizarr.manifests import ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import ( has_astropy, has_tifffile, @@ -82,14 +83,15 @@ def test_FileType(): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestOpenVirtualDatasetIndexes: - def test_no_indexes(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_no_indexes(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert vds.indexes == {} - def test_create_default_indexes(self, netcdf4_file): + def test_create_default_indexes(self, netcdf4_file, hdf_backend): with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds = open_virtual_dataset(netcdf4_file, indexes=None) + vds = open_virtual_dataset(netcdf4_file, indexes=None, backend=hdf_backend) ds = open_dataset(netcdf4_file, decode_times=True) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 @@ -113,7 +115,8 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I @requires_kerchunk -def test_cftime_index(tmpdir): +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) +def test_cftime_index(tmpdir, hdf_backend): """Ensure a virtual dataset contains the same indexes as an Xarray dataset""" # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168 ds = xr.Dataset( @@ -129,7 +132,10 @@ def test_cftime_index(tmpdir): ) ds.to_netcdf(f"{tmpdir}/tmp.nc") vds = open_virtual_dataset( - f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={} + f"{tmpdir}/tmp.nc", + loadable_variables=["time", "lat", "lon"], + indexes={}, + backend=hdf_backend, ) # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812 assert index_mappings_equal(vds.xindexes, ds.xindexes) @@ -139,15 +145,16 @@ def test_cftime_index(tmpdir): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestOpenVirtualDatasetAttrs: - def test_drop_array_dimensions(self, netcdf4_file): + def test_drop_array_dimensions(self, netcdf4_file, hdf_backend): # regression test for GH issue #150 - vds = open_virtual_dataset(netcdf4_file, indexes={}) + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs - def test_coordinate_variable_attrs_preserved(self, netcdf4_file): + def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend): # regression test for GH issue #155 - vds = open_virtual_dataset(netcdf4_file, indexes={}) + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) assert vds["lat"].attrs == { "standard_name": "latitude", "long_name": "Latitude", @@ -165,7 +172,8 @@ class TestReadFromS3: @pytest.mark.parametrize( "indexes", [None, {}], ids=["None index", "empty dict index"] ) - def test_anon_read_s3(self, filetype, indexes): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_anon_read_s3(self, filetype, indexes, hdf_backend): """Parameterized tests for empty vs supplied indexes and filetypes.""" # TODO: Switch away from this s3 url after minIO is implemented. fpath = "s3://carbonplan-share/virtualizarr/local.nc" @@ -174,6 +182,7 @@ def test_anon_read_s3(self, filetype, indexes): filetype=filetype, indexes=indexes, reader_options={"storage_options": {"anon": True}}, + backend=hdf_backend, ) assert vds.dims == {"time": 2920, "lat": 25, "lon": 53} @@ -182,6 +191,7 @@ def test_anon_read_s3(self, filetype, indexes): @network +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestReadFromURL: @pytest.mark.parametrize( "filetype, url", @@ -228,10 +238,15 @@ class TestReadFromURL: ), ], ) - def test_read_from_url(self, filetype, url): + def test_read_from_url(self, hdf_backend, filetype, url): if filetype in ["grib", "jpg", "hdf4"]: with pytest.raises(NotImplementedError): - vds = open_virtual_dataset(url, reader_options={}, indexes={}) + vds = open_virtual_dataset( + url, + reader_options={}, + indexes={}, + backend=hdf_backend, + ) elif filetype == "hdf5": vds = open_virtual_dataset( url, @@ -239,13 +254,14 @@ def test_read_from_url(self, filetype, url): drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], indexes={}, reader_options={}, + backend=hdf_backend, ) assert isinstance(vds, xr.Dataset) else: - vds = open_virtual_dataset(url, indexes={}) + vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend) assert isinstance(vds, xr.Dataset) - def test_virtualizarr_vs_local_nisar(self): + def test_virtualizarr_vs_local_nisar(self, hdf_backend): import fsspec # Open group directly from locally cached file with xarray @@ -268,6 +284,7 @@ def test_virtualizarr_vs_local_nisar(self): group=hdf_group, indexes={}, drop_variables=["listOfCovarianceTerms", "listOfPolarizations"], + backend=hdf_backend, ) tmpref = "/tmp/cmip6.json" vds.virtualize.to_kerchunk(tmpref, format="json") @@ -279,10 +296,14 @@ def test_virtualizarr_vs_local_nisar(self): @requires_kerchunk class TestLoadVirtualDataset: - def test_loadable_variables(self, netcdf4_file): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_loadable_variables(self, netcdf4_file, hdf_backend): vars_to_load = ["air", "time"] vds = open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_load, indexes={} + netcdf4_file, + loadable_variables=vars_to_load, + indexes={}, + backend=hdf_backend, ) for name in vds.variables: @@ -304,11 +325,20 @@ def test_explicit_filetype(self, netcdf4_file): with pytest.raises(NotImplementedError): open_virtual_dataset(netcdf4_file, filetype="grib") - def test_group_kwarg(self, hdf5_groups_file): - with pytest.raises(ValueError, match="Multiple HDF Groups found"): - open_virtual_dataset(hdf5_groups_file) - with pytest.raises(ValueError, match="not found in"): - open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_group_kwarg(self, hdf5_groups_file, hdf_backend): + if hdf_backend: + with pytest.raises(NotImplementedError, match="Nested groups"): + open_virtual_dataset(hdf5_groups_file, backend=hdf_backend) + with pytest.raises(KeyError, match="doesn't exist"): + open_virtual_dataset( + hdf5_groups_file, group="doesnt_exist", backend=hdf_backend + ) + else: + with pytest.raises(ValueError, match="Multiple HDF Groups found"): + open_virtual_dataset(hdf5_groups_file) + with pytest.raises(ValueError, match="not found in"): + open_virtual_dataset(hdf5_groups_file, group="doesnt_exist") vars_to_load = ["air", "time"] vds = open_virtual_dataset( @@ -316,6 +346,7 @@ def test_group_kwarg(self, hdf5_groups_file): group="test/group", loadable_variables=vars_to_load, indexes={}, + backend=hdf_backend, ) full_ds = xr.open_dataset( hdf5_groups_file, @@ -340,13 +371,15 @@ def test_open_virtual_dataset_passes_expected_args( } mock_read_kerchunk.assert_called_once_with(**args) - def test_open_dataset_with_empty(self, hdf5_empty, tmpdir): - vds = open_virtual_dataset(hdf5_empty) + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend): + vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend) assert vds.empty.dims == () assert vds.empty.attrs == {"empty": "true"} - def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir): - vds = open_virtual_dataset(hdf5_scalar) + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend): + vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend) assert vds.scalar.dims == () assert vds.scalar.attrs == {"scalar": "true"} diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 63158777..0a39eb3d 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -5,6 +5,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import requires_kerchunk from virtualizarr.translators.kerchunk import ( dataset_from_kerchunk_refs, @@ -63,8 +64,9 @@ def test_no_duplicates_find_var_names(): ), ], ) +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) def test_numpy_arrays_to_inlined_kerchunk_refs( - netcdf4_file, inline_threshold, vars_to_inline + netcdf4_file, inline_threshold, vars_to_inline, hdf_backend ): from kerchunk.hdf import SingleHdf5ToZarr @@ -75,7 +77,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( # loading the variables should produce same result as inlining them using kerchunk vds = open_virtual_dataset( - netcdf4_file, loadable_variables=vars_to_inline, indexes={} + netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend ) refs = vds.virtualize.to_kerchunk(format="dict") @@ -90,7 +92,8 @@ def test_numpy_arrays_to_inlined_kerchunk_refs( @requires_kerchunk @pytest.mark.parametrize("format", ["dict", "json", "parquet"]) class TestKerchunkRoundtrip: - def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -98,7 +101,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): ds.to_netcdf(f"{tmpdir}/air.nc") # use open_dataset_via_kerchunk to read it as references - vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}) + vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend) if format == "dict": # write those references to an in-memory kerchunk-formatted references dictionary @@ -122,8 +125,11 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): for coord in ds.coords: assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) - def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars): + def test_kerchunk_roundtrip_concat( + self, tmpdir, format, hdf_backend, decode_times, time_vars + ): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times) @@ -139,11 +145,13 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars f"{tmpdir}/air1.nc", indexes={}, loadable_variables=time_vars, + backend=hdf_backend, ) vds2 = open_virtual_dataset( f"{tmpdir}/air2.nc", indexes={}, loadable_variables=time_vars, + backend=hdf_backend, ) if decode_times is False: @@ -187,7 +195,8 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars assert roundtrip.time.encoding["units"] == ds.time.encoding["units"] assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"] - def test_non_dimension_coordinates(self, tmpdir, format): + @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) + def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): # regression test for GH issue #105 # set up example xarray dataset containing non-dimension coordinate variables @@ -196,7 +205,9 @@ def test_non_dimension_coordinates(self, tmpdir, format): # save it to disk as netCDF (in temporary directory) ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") - vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={}) + vds = open_virtual_dataset( + f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend + ) assert "lat" in vds.coords assert "coordinates" not in vds.attrs @@ -269,11 +280,12 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format): @requires_kerchunk -def test_open_scalar_variable(tmpdir): +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) +def test_open_scalar_variable(tmpdir, hdf_backend): # regression test for GH issue #100 ds = xr.Dataset(data_vars={"a": 0}) ds.to_netcdf(f"{tmpdir}/scalar.nc") - vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}) + vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend) assert vds["a"].shape == () diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f73292ee..716d1f28 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -4,18 +4,21 @@ import virtualizarr from virtualizarr.backend import FileType +from virtualizarr.readers.hdf import HDFVirtualBackend +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestIntegration: @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue") def test_filters_h5netcdf_roundtrip( - self, tmpdir, filter_encoded_roundtrip_hdf5_file + self, tmpdir, filter_encoded_roundtrip_hdf5_file, hdf_backend ): ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( filter_encoded_roundtrip_hdf5_file, loadable_variables=["time"], cftime_variables=["time"], + backend=hdf_backend, ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -23,19 +26,26 @@ def test_filters_h5netcdf_roundtrip( xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( - self, tmpdir, filter_encoded_roundtrip_netcdf4_file + self, tmpdir, filter_encoded_roundtrip_netcdf4_file, hdf_backend ): filepath = filter_encoded_roundtrip_netcdf4_file["filepath"] ds = xr.open_dataset(filepath) - vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) + vds = virtualizarr.open_virtual_dataset( + filepath, filetype=FileType("netcdf4"), backend=hdf_backend + ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + def test_filter_and_cf_roundtrip( + self, tmpdir, filter_and_cf_roundtrip_hdf5_file, hdf_backend + ): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) - vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = virtualizarr.open_virtual_dataset( + filter_and_cf_roundtrip_hdf5_file, + backend=hdf_backend, + ) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 062eda5f..12f6fadf 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -4,6 +4,7 @@ from virtualizarr import open_virtual_dataset from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import HDFVirtualBackend from virtualizarr.tests import requires_kerchunk from virtualizarr.zarr import ZArray @@ -224,14 +225,15 @@ def test_concat_dim_coords_along_existing_dim(self): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestCombineUsingIndexes: - def test_combine_by_coords(self, netcdf4_files): + def test_combine_by_coords(self, netcdf4_files, hdf_backend): filepath1, filepath2 = netcdf4_files with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds1 = open_virtual_dataset(filepath1) + vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds2 = open_virtual_dataset(filepath2) + vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) combined_vds = xr.combine_by_coords( [vds2, vds1], @@ -240,13 +242,13 @@ def test_combine_by_coords(self, netcdf4_files): assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing @pytest.mark.xfail(reason="Not yet implemented, see issue #18") - def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): + def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backend): filepath1, filepath2 = netcdf4_files with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds1 = open_virtual_dataset(filepath1) + vds1 = open_virtual_dataset(filepath1, backend=hdf_backend) with pytest.warns(UserWarning, match="will create in-memory pandas indexes"): - vds2 = open_virtual_dataset(filepath2) + vds2 = open_virtual_dataset(filepath2, backend=hdf_backend) combined_vds = xr.combine_by_coords( [vds2, vds1], @@ -258,17 +260,18 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files): @requires_kerchunk +@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestRenamePaths: - def test_rename_to_str(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_rename_to_str(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert ( renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"] == "s3://bucket/air.nc" ) - def test_rename_using_function(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_rename_using_function(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) def local_to_s3_url(old_local_path: str) -> str: from pathlib import Path @@ -284,15 +287,20 @@ def local_to_s3_url(old_local_path: str) -> str: == "s3://bucket/air.nc" ) - def test_invalid_type(self, netcdf4_file): - vds = open_virtual_dataset(netcdf4_file, indexes={}) + def test_invalid_type(self, netcdf4_file, hdf_backend): + vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend) with pytest.raises(TypeError): vds.virtualize.rename_paths(["file1.nc", "file2.nc"]) - def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file): + def test_mixture_of_manifestarrays_and_numpy_arrays( + self, netcdf4_file, hdf_backend + ): vds = open_virtual_dataset( - netcdf4_file, indexes={}, loadable_variables=["lat", "lon"] + netcdf4_file, + indexes={}, + loadable_variables=["lat", "lon"], + backend=hdf_backend, ) renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc") assert ( From 3ab90c6d59c7dbd929ad317afbbaf6843097b7d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 21:17:00 -0400 Subject: [PATCH 69/79] Include imagecodecs and hdf5plugin in all CI environments. --- ci/environment.yml | 2 ++ ci/min-deps.yml | 5 +++++ ci/upstream.yml | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 25ac0bb1..1ff25449 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,6 +14,7 @@ dependencies: - packaging - universal_pathlib - hdf5plugin + - numcodecs # Testing - codecov - pre-commit @@ -33,5 +34,6 @@ dependencies: - tifffile # for opening FITS files - astropy + - pip - pip: - imagecodecs-numcodecs diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 7ca8c0b3..12086543 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -10,9 +10,11 @@ dependencies: - xarray>=2024.6.0 - numpy>=2.0.0 - numcodecs + - imagecodecs>=2024.6.1 - packaging - ujson - universal_pathlib + - hdf5plugin # Testing - codecov - pre-commit @@ -24,3 +26,6 @@ dependencies: - pytest - pooch - fsspec + - pip + - pip: + - imagecodecs-numcodecs diff --git a/ci/upstream.yml b/ci/upstream.yml index 2c2680bc..931e346c 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -11,6 +11,9 @@ dependencies: - packaging - ujson - universal_pathlib + - hdf5plugin + - numcodecs + - imagecodecs>=2024.6.1 # Testing - codecov - pre-commit @@ -27,4 +30,5 @@ dependencies: - icechunk # Installs zarr v3 as dependency - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch + - imagecodecs-numcodecs # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) From 150d06d215ff50657c85eb197ff9d0cf4d3eeae5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 21:57:37 -0400 Subject: [PATCH 70/79] Add test_hdf_integration tests to be skipped for non-kerchunk env. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 716d1f28..f2d2367d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -5,8 +5,10 @@ import virtualizarr from virtualizarr.backend import FileType from virtualizarr.readers.hdf import HDFVirtualBackend +from virtualizarr.tests import requires_kerchunk +@requires_kerchunk @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend]) class TestIntegration: @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue") @@ -43,8 +45,7 @@ def test_filter_and_cf_roundtrip( ): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) vds = virtualizarr.open_virtual_dataset( - filter_and_cf_roundtrip_hdf5_file, - backend=hdf_backend, + filter_and_cf_roundtrip_hdf5_file, backend=hdf_backend ) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") From 8ccba34862950b32a1559af33cddbc6d657608bb Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 22:01:31 -0400 Subject: [PATCH 71/79] Include imagecodecs in dependencies. --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index df6c37be..0d0744b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "h5py", "hdf5plugin", "numcodecs", + "imagecodecs", "imagecodecs-numcodecs", "ujson", ] From 81874e0488c5be595b2545fbc5ec66d802524fd7 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 23 Oct 2024 22:12:32 -0400 Subject: [PATCH 72/79] Diagnose imagecodecs-numcodecs installation failures in CI. --- ci/environment.yml | 2 +- ci/min-deps.yml | 2 +- ci/upstream.yml | 2 +- pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 1ff25449..70a4c9c3 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -36,4 +36,4 @@ dependencies: - astropy - pip - pip: - - imagecodecs-numcodecs + - imagecodecs-numcodecs==2024.6.1 diff --git a/ci/min-deps.yml b/ci/min-deps.yml index 12086543..af4a732c 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -28,4 +28,4 @@ dependencies: - fsspec - pip - pip: - - imagecodecs-numcodecs + - imagecodecs-numcodecs==2024.6.1 diff --git a/ci/upstream.yml b/ci/upstream.yml index 931e346c..f6c66df3 100644 --- a/ci/upstream.yml +++ b/ci/upstream.yml @@ -30,5 +30,5 @@ dependencies: - icechunk # Installs zarr v3 as dependency - git+https://github.com/pydata/xarray@zarr-v3 # zarr-v3 compatibility branch - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs # zarr-v3 compatibility branch - - imagecodecs-numcodecs # - git+https://github.com/fsspec/kerchunk@main # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516) + - imagecodecs-numcodecs==2024.6.1 diff --git a/pyproject.toml b/pyproject.toml index 0d0744b5..672398f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ dependencies = [ "hdf5plugin", "numcodecs", "imagecodecs", - "imagecodecs-numcodecs", + "imagecodecs-numcodecs==2024.6.1", "ujson", ] From f87abe2c0b6dd2c9074e566bb3083dbd8856b821 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 24 Oct 2024 10:59:35 -0400 Subject: [PATCH 73/79] Ignore mypy complaints for VirtualBackend. --- virtualizarr/backend.py | 2 +- virtualizarr/readers/hdf.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py index 3ab76d1f..247657d0 100644 --- a/virtualizarr/backend.py +++ b/virtualizarr/backend.py @@ -187,7 +187,7 @@ def open_virtual_dataset( if backend: backend_cls = backend else: - backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) + backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower()) # type: ignore if backend_cls is None: raise NotImplementedError(f"Unsupported file type: {filetype.name}") diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index dd67475e..b4723ded 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -127,7 +127,9 @@ def add_chunk_info(blob): add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( - paths=paths, offsets=offsets, lengths=lengths + paths=paths, + offsets=offsets, + lengths=lengths, # type: ignore ) return chunk_manifest From 70e7e29301527b96c91313c68f080e93ed0b79f5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 24 Oct 2024 11:08:47 -0400 Subject: [PATCH 74/79] Remove checksum assert which varies across different zstd versions. --- virtualizarr/tests/test_readers/test_hdf_filters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index d0bde948..0dd8efa8 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -34,8 +34,9 @@ def test_blosc(self): def test_zstd(self): codec = _filter_to_codec("32015", (5,)) assert isinstance(codec, numcodecs.zstd.Zstd) - expected_config = {"id": "zstd", "level": 5} - assert codec.get_config() == expected_config + config = codec.get_config() + assert config["id"] == "zstd" + assert config["level"] == 5 def test_shuffle(self): codec = _filter_to_codec("shuffle", (7,)) From 43bc0e4ca03977c4e0d64bdd8308229e08407677 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 24 Oct 2024 11:21:34 -0400 Subject: [PATCH 75/79] Temporarily xfail integration tests with coordinate inconsistency. --- virtualizarr/tests/test_integration.py | 4 +++- virtualizarr/tests/test_readers/test_hdf_integration.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 0a39eb3d..3953e59e 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -199,6 +199,9 @@ def test_kerchunk_roundtrip_concat( def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): # regression test for GH issue #105 + if hdf_backend: + pytest.xfail("To fix coordinate behavior with HDF reader") + # set up example xarray dataset containing non-dimension coordinate variables ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))}) @@ -208,7 +211,6 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend): vds = open_virtual_dataset( f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend ) - assert "lat" in vds.coords assert "coordinates" not in vds.attrs diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f2d2367d..db452086 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -27,6 +27,7 @@ def test_filters_h5netcdf_roundtrip( roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) + @pytest.mark.xfail(reason="Coordinate issue affecting kerchunk and HDF reader.") def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_roundtrip_netcdf4_file, hdf_backend ): From 82a63214b599095dcbb152e60029f2c6cbb15151 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 24 Oct 2024 12:35:20 -0400 Subject: [PATCH 76/79] Remove backend arg for non-hdf network file tests. --- virtualizarr/tests/test_backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py index 2368848a..7436abba 100644 --- a/virtualizarr/tests/test_backend.py +++ b/virtualizarr/tests/test_backend.py @@ -245,7 +245,6 @@ def test_read_from_url(self, hdf_backend, filetype, url): url, reader_options={}, indexes={}, - backend=hdf_backend, ) elif filetype == "hdf5": vds = open_virtual_dataset( @@ -258,7 +257,7 @@ def test_read_from_url(self, hdf_backend, filetype, url): ) assert isinstance(vds, xr.Dataset) else: - vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend) + vds = open_virtual_dataset(url, indexes={}) assert isinstance(vds, xr.Dataset) def test_virtualizarr_vs_local_nisar(self, hdf_backend): From b34f260f6e260b1ef66f4907746374ebfc63e2b6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 24 Oct 2024 12:40:00 -0400 Subject: [PATCH 77/79] Fix mypy comment moved by ruff formatting. --- virtualizarr/readers/hdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index b4723ded..c722d7af 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -127,9 +127,9 @@ def add_chunk_info(blob): add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( - paths=paths, + paths=paths, # type: ignore offsets=offsets, - lengths=lengths, # type: ignore + lengths=lengths, ) return chunk_manifest From f9ead06fafef91bed37a3310bf8a32bb5df74c96 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 25 Oct 2024 10:48:35 -0400 Subject: [PATCH 78/79] Make HDR reader dependencies optional. --- ci/min-deps.yml | 6 ------ pyproject.toml | 7 ++----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/ci/min-deps.yml b/ci/min-deps.yml index af4a732c..7debcf95 100644 --- a/ci/min-deps.yml +++ b/ci/min-deps.yml @@ -3,18 +3,15 @@ channels: - conda-forge - nodefaults dependencies: - - h5netcdf - h5py - hdf5 - netcdf4 - xarray>=2024.6.0 - numpy>=2.0.0 - numcodecs - - imagecodecs>=2024.6.1 - packaging - ujson - universal_pathlib - - hdf5plugin # Testing - codecov - pre-commit @@ -26,6 +23,3 @@ dependencies: - pytest - pooch - fsspec - - pip - - pip: - - imagecodecs-numcodecs==2024.6.1 diff --git a/pyproject.toml b/pyproject.toml index 672398f7..64453032 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,11 +25,7 @@ dependencies = [ "numpy>=2.0.0", "packaging", "universal-pathlib", - "h5py", - "hdf5plugin", "numcodecs", - "imagecodecs", - "imagecodecs-numcodecs==2024.6.1", "ujson", ] @@ -41,11 +37,12 @@ test = [ "h5netcdf", "h5py", "hdf5plugin", + "imagecodecs", + "imagecodecs-numcodecs==2024.6.1", "kerchunk>=0.2.5", "mypy", "netcdf4", "numcodecs", - "imagecodecs-numcodecs", "pandas-stubs", "pooch", "pre-commit", From 560829266f9036951a6f5f39a016339402422bc0 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 25 Oct 2024 14:31:50 -0400 Subject: [PATCH 79/79] Handle optional imagecodecs and hdf5plugin dependency imports for tests. --- pyproject.toml | 7 ++++++- virtualizarr/readers/hdf_filters.py | 13 +++++++++++- virtualizarr/tests/__init__.py | 2 ++ virtualizarr/tests/test_readers/conftest.py | 9 ++++++++- virtualizarr/tests/test_readers/test_hdf.py | 14 +++++++++++++ .../tests/test_readers/test_hdf_filters.py | 20 ++++++++++++++++++- 6 files changed, 61 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64453032..7be7b0a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,16 @@ dependencies = [ ] [project.optional-dependencies] +hdf_reader = [ + "fsspec", + "hdf5plugin", + "imagecodecs", + "imagecodecs-numcodecs==2024.6.1", +] test = [ "codecov", "fastparquet", "fsspec", - "h5netcdf", "h5py", "hdf5plugin", "imagecodecs", diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index aedf89b3..cab1f351 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,14 +1,25 @@ import dataclasses +import warnings from typing import List, Tuple, TypedDict, Union import h5py # type: ignore -import hdf5plugin # type: ignore import numcodecs.registry as registry import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset from xarray.coding.variables import _choose_float_dtype +try: + import hdf5plugin # type: ignore +except ModuleNotFoundError: + hdf5plugin = None # type: ignore + warnings.warn("hdf5plugin is required for HDF reader") + +try: + import imagecodecs # noqa +except ModuleNotFoundError: + warnings.warn("imagecodecs is required for HDF reader") + _non_standard_filters = { "gzip": "zlib", "lzf": "imagecodecs_lzf", diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 70f613ce..aee82542 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -37,6 +37,8 @@ def _importorskip( has_s3fs, requires_s3fs = _importorskip("s3fs") has_scipy, requires_scipy = _importorskip("scipy") has_tifffile, requires_tifffile = _importorskip("tifffile") +has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") +has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") def create_manifestarray( diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index b0b7c41f..35df93a9 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,5 +1,6 @@ +import warnings + import h5py # type: ignore -import hdf5plugin # type: ignore import numpy as np import pytest import xarray as xr @@ -7,6 +8,12 @@ from xarray.tests.test_dataset import create_test_data from xarray.util.print_versions import netcdf_and_hdf5_versions +try: + import hdf5plugin # type: ignore +except ModuleNotFoundError: + hdf5plugin = None # type: ignore + warnings.warn("hdf5plugin is required for HDF reader") + @pytest.fixture def empty_chunks_hdf5_file(tmpdir): diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 0e51fe28..71d2b352 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,8 +2,14 @@ import pytest from virtualizarr.readers.hdf import HDFVirtualBackend +from virtualizarr.tests import ( + requires_hdf5plugin, + requires_imagecodecs, +) +@requires_hdf5plugin +@requires_imagecodecs class TestDatasetChunkManifest: def test_empty_chunks(self, empty_chunks_hdf5_file): f = h5py.File(empty_chunks_hdf5_file) @@ -47,6 +53,8 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): assert manifest.shape_chunk_grid == (2, 8) +@requires_hdf5plugin +@requires_imagecodecs class TestDatasetDims: def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): f = h5py.File(single_dimension_scale_hdf5_file) @@ -73,6 +81,8 @@ def test_no_dimension_scales(self, no_chunks_hdf5_file): assert dims == ["phony_dim_0", "phony_dim_1"] +@requires_hdf5plugin +@requires_imagecodecs class TestDatasetToVariable: def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): f = h5py.File(chunked_dimensions_netcdf4_file) @@ -97,6 +107,8 @@ def test_dataset_attributes(self, string_attributes_hdf5_file): assert var.attrs["attribute_name"] == "attribute_name" +@requires_hdf5plugin +@requires_imagecodecs class TestExtractAttributes: def test_string_attribute(self, string_attributes_hdf5_file): f = h5py.File(string_attributes_hdf5_file) @@ -116,6 +128,8 @@ def test_multiple_attributes(self, string_attributes_hdf5_file): assert len(attrs.keys()) == 2 +@requires_hdf5plugin +@requires_imagecodecs class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = HDFVirtualBackend._virtual_vars_from_hdf( diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 0dd8efa8..20086b88 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,15 +1,29 @@ +import warnings + import h5py # type: ignore -import imagecodecs import numcodecs import numpy as np +try: + import imagecodecs # noqa +except ModuleNotFoundError: + imagecodecs = None # type: ignore + warnings.warn("imagecodecs is required for HDF reader") + + from virtualizarr.readers.hdf_filters import ( _filter_to_codec, cfcodec_from_dataset, codecs_from_dataset, ) +from virtualizarr.tests import ( + requires_hdf5plugin, + requires_imagecodecs, +) +@requires_hdf5plugin +@requires_imagecodecs class TestFilterToCodec: def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) @@ -45,6 +59,8 @@ def test_shuffle(self): assert codec.get_config() == expected_config +@requires_hdf5plugin +@requires_imagecodecs class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): f = h5py.File(filter_encoded_hdf5_file) @@ -61,6 +77,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): assert decoded == np_uncompressed.tobytes() +@requires_hdf5plugin +@requires_imagecodecs class TestCFCodecFromDataset: def test_no_cf_convention(self, filter_encoded_hdf5_file): f = h5py.File(filter_encoded_hdf5_file)