From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 13:25:28 -0600 Subject: [PATCH 01/55] Generate chunk manifest backed variable from HDF5 dataset. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 135 ++++++++++++++++++++ virtualizarr/tests/test_readers/__init__.py | 0 virtualizarr/tests/test_readers/conftest.py | 91 +++++++++++++ virtualizarr/tests/test_readers/test_hdf.py | 71 ++++++++++ 5 files changed, 298 insertions(+) create mode 100644 virtualizarr/readers/hdf.py create mode 100644 virtualizarr/tests/test_readers/__init__.py create mode 100644 virtualizarr/tests/test_readers/conftest.py create mode 100644 virtualizarr/tests/test_readers/test_hdf.py diff --git a/pyproject.toml b/pyproject.toml index c7505bca..7994c929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", + "h5netcdf", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py new file mode 100644 index 00000000..a34ae341 --- /dev/null +++ b/virtualizarr/readers/hdf.py @@ -0,0 +1,135 @@ +from typing import List + +import h5py +import xarray as xr + +from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.zarr import ZArray + + +def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: + """ + Generate ChunkManifest for HDF5 dataset. + + Parameters + ---------- + path: str + The path the HDF5 container file + dset : h5py.Dataset + HDF5 dataset for which to create a ChunkManifest + + Returns + ------- + ChunkManifest + A Virtualizarr ChunkManifest + """ + dsid = dataset.id + + if dataset.chunks is None: + if dsid.get_offset() is None: + raise ValueError("Dataset has no space allocated in the file") + else: + key_list = [0] * (len(dataset.shape) or 1) + key = ".".join(map(str, key_list)) + chunk_entry = ChunkEntry( + path=path, + offset=dsid.get_offset(), + length=dsid.get_storage_size() + ) + chunk_entries = {key: chunk_entry} + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + else: + num_chunks = dsid.get_num_chunks() + if num_chunks == 0: + raise ValueError("The dataset is chunked but contains no chunks") + + chunk_entries = dict() + + def get_key(blob): + key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] + key = ".".join(map(str, key_list)) + return key + + def store_chunk_entry(blob): + chunk_entries[get_key(blob)] = ChunkEntry( + path=path, + offset=blob.byte_offset, + length=blob.size + ) + + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(store_chunk_entry) + else: + for index in range(num_chunks): + store_chunk_entry(dsid.get_chunk_info(index)) + + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + +def _dataset_dims(dataset: h5py.Dataset) -> List[str]: + """ + Get a list of dimension scale names attached to input HDF5 dataset. + + This is required by the xarray package to work with Zarr arrays. Only + one dimension scale per dataset dimension is allowed. If dataset is + dimension scale, it will be considered as the dimension to itself. + + Parameters + ---------- + dataset : h5py.Dataset + HDF5 dataset. + + Returns + ------- + list + List with HDF5 path names of dimension scales attached to input + dataset. 
+ """ + dims = list() + rank = len(dataset.shape) + if rank: + for n in range(rank): + num_scales = len(dataset.dims[n]) + if num_scales == 1: + dims.append(dataset.dims[n][0].name[1:]) + elif h5py.h5ds.is_scale(dataset.id): + dims.append(dataset.name[1:]) + elif num_scales > 1: + raise ValueError( + f"{dataset.name}: {len(dataset.dims[n])} " + f"dimension scales attached to dimension #{n}" + ) + elif num_scales == 0: + # Some HDF5 files do not have dimension scales. + # If this is the case, `num_scales` will be 0. + # In this case, we mimic netCDF4 and assign phony dimension names. + # See https://github.com/fsspec/kerchunk/issues/41 + dims.append(f"phony_dim_{n}") + return dims + + +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: + # This chunk determination logic mirrors zarr-python's create + # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 + chunks = dataset.chunks if dataset.chunks else dataset.shape + zarray = ZArray( + chunks=chunks, + compressor=dataset.compression, + dtype=dataset.dtype, + fill_value=dataset.fillvalue, + filters=None, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + manifest = _dataset_chunk_manifest(path, dataset) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims) + return variable diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py new file mode 100644 index 00000000..b4504839 --- /dev/null +++ b/virtualizarr/tests/test_readers/conftest.py @@ -0,0 +1,91 @@ +import h5py +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture +def empty_chunks_netcdf4_file(tmpdir): + ds = xr.Dataset({"data": []}) + filepath = f"{tmpdir}/empty_chunks.nc" + ds.to_netcdf(filepath, engine="h5netcdf") + return filepath + + +@pytest.fixture +def empty_dataset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/empty_dataset.nc" + f = h5py.File(filepath, "w") + f.create_dataset("data", shape=(0,), dtype="f") + return filepath + + +@pytest.fixture +def no_chunks_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/no_chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + return filepath + + +@pytest.fixture +def chunked_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + return filepath + + +@pytest.fixture +def single_dimension_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/single_dimension_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + x = [0, 1] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=x) + f["x"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + return filepath + + +@pytest.fixture +def is_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/is_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f["data"].make_scale() + return filepath + + +@pytest.fixture +def multiple_dimension_scales_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_dimension_scales.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=[0, 1]) + 
f.create_dataset(name="y", data=[0, 1]) + f["x"].make_scale() + f["y"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[0].attach_scale(f["y"]) + return filepath + + +@pytest.fixture +def chunked_dimensions_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks_dimension.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + x = np.random.random((100, 100)) + y = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + f.create_dataset(name="x", data=x, chunks=(50, 50)) + f.create_dataset(name="y", data=y, chunks=(50, 50)) + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[1].attach_scale(f["y"]) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py new file mode 100644 index 00000000..b6b78c11 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -0,0 +1,71 @@ +import h5py +import pytest + +from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, + _dataset_to_variable) + + +class TestDatasetChunkManifest: + def test_empty_chunks(self, empty_chunks_netcdf4_file): + f = h5py.File(empty_chunks_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="chunked but contains no chunks"): + _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + + def test_empty_dataset(self, empty_dataset_netcdf4_file): + f = h5py.File(empty_dataset_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="no space allocated in the file"): + _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + + def test_no_chunking(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 1 + + def test_chunked(self, chunked_netcdf4_file): + f = h5py.File(chunked_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 4 + + +class TestDatasetDims: + def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "x" + + def test_is_dimension_scale(self, is_scale_netcdf4_file): + f = h5py.File(is_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "data" + + def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): + f = h5py.File(multiple_dimension_scales_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="dimension scales attached"): + _dataset_dims(ds) + + def test_no_dimension_scales(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims == ["phony_dim_0", "phony_dim_1"] + + +class TestDatasetToVariable: + def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) + assert var.chunks == (50, 50) + + def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + assert var.chunks == (2,) From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 14:20:38 
-0600 Subject: [PATCH 02/55] Transfer dataset attrs to variable. --- virtualizarr/readers/hdf.py | 50 ++++++++++++++++++++- virtualizarr/tests/test_readers/conftest.py | 10 +++++ virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index a34ae341..d6518a30 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,7 @@ from typing import List import h5py +import numpy as np import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray @@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: return dims +def _extract_attrs(dataset: h5py.Dataset): + """ + Extract attributes from an HDF5 dataset. + + Parameters + ---------- + dataset : h5py.Dataset + An HDF5 dataset. + """ + _HIDDEN_ATTRS = { + "REFERENCE_LIST", + "CLASS", + "DIMENSION_LIST", + "NAME", + "_Netcdf4Dimid", + "_Netcdf4Coordinates", + "_nc3_strict", + "_NCProperties", + } + attrs = {} + for n, v in dataset.attrs.items(): + if n in _HIDDEN_ATTRS: + continue + # Fix some attribute values to avoid JSON encoding exceptions... + if isinstance(v, bytes): + v = v.decode("utf-8") or " " + elif isinstance(v, (np.ndarray, np.number, np.bool_)): + if v.dtype.kind == "S": + v = v.astype(str) + if n == "_FillValue": + continue + elif v.size == 1: + v = v.flatten()[0] + if isinstance(v, (np.ndarray, np.number, np.bool_)): + v = v.tolist() + else: + v = v.tolist() + elif isinstance(v, h5py._hl.base.Empty): + v = "" + if v == "DIMENSION_SCALE": + continue + + attrs[n] = v + return attrs + + def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 @@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims) + attrs = _extract_attrs(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index b4504839..2c40fe17 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath + + +@pytest.fixture +def string_attribute_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/attributes.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs["attribute_name"] = "attribute_name" + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index b6b78c11..495b7de0 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,7 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable) + _dataset_to_variable, _extract_attrs) class TestDatasetChunkManifest: @@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): ds = f["data"] var = 
_dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) + + def test_dataset_attributes(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + assert var.attrs["attribute_name"] == "attribute_name" + + +class TestExtractAttributes: + def test_string_attribute(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert attrs["attribute_name"] == "attribute_name" From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 15:26:58 -0600 Subject: [PATCH 03/55] Get virtual variables dict from HDF5 file. --- virtualizarr/readers/hdf.py | 14 +++++++++++++- virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++---- virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d6518a30..9c3ebf44 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Mapping, List import h5py import numpy as np @@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable + + +def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: + variables = {} + for key in f.keys(): + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") + + return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 2c40fe17..735e922a 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir): filepath = f"{tmpdir}/chunks_dimension.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) - x = np.random.random((100, 100)) - y = np.random.random((100, 100)) + x = np.random.random((100)) + y = np.random.random((100)) f.create_dataset(name="data", data=data, chunks=(50, 50)) - f.create_dataset(name="x", data=x, chunks=(50, 50)) - f.create_dataset(name="y", data=y, chunks=(50, 50)) + f.create_dataset(name="x", data=x) + f.create_dataset(name="y", data=y) f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath @@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" return filepath + + +@pytest.fixture +def group_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/group.nc" + f = h5py.File(filepath, "w") + f.create_group("group") + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 495b7de0..da331ed9 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,8 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs) + _dataset_to_variable, _extract_attrs, + virtual_vars_from_hdf) class TestDatasetChunkManifest: @@ -83,3 +84,15 
@@ def test_string_attribute(self, string_attribute_netcdf4_file): ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + + +class TestVirtualVarsFromHDF: + def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + assert len(variables) == 3 + + def test_groups_not_implemented(self, group_netcdf4_file): + f = h5py.File(group_netcdf4_file) + with pytest.raises(NotImplementedError): + virtual_vars_from_hdf(group_netcdf4_file, f) From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 12:33:24 -0600 Subject: [PATCH 04/55] Update virtual_vars_from_hdf to use fsspec and drop_variables arg. --- pyproject.toml | 2 +- virtualizarr/readers/hdf.py | 25 +++++++++++++++------ virtualizarr/tests/test_readers/conftest.py | 10 +++++++++ virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++---- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7994c929..d08621e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", - "h5netcdf", ] [project.optional-dependencies] @@ -35,6 +34,7 @@ test = [ "pytest", "scipy", "pooch", + "h5netcdf", ] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 9c3ebf44..c4ab2927 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,5 +1,6 @@ -from typing import Mapping, List +from typing import List, Mapping, Optional +import fsspec import h5py import numpy as np import xarray as xr @@ -73,6 +74,7 @@ def store_chunk_entry(blob): ) return chunk_manifest + def _dataset_dims(dataset: h5py.Dataset) -> List[str]: """ Get a list of dimension scale names attached to input HDF5 dataset. 
@@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: return variable -def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: +def virtual_vars_from_hdf( + path: str, + drop_variables: Optional[List[str]] = None, +) -> Mapping[str, xr.Variable]: + if drop_variables is None: + drop_variables = [] + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) - variables[key] = variable - else: - raise NotImplementedError("Nested groups are not yet supported") + if key not in drop_variables: + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 735e922a..aa2b0fe0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir): f = h5py.File(filepath, "w") f.create_group("group") return filepath + + +@pytest.fixture +def multiple_datasets_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_datasets.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f.create_dataset(name="data2", data=data, chunks=None) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index da331ed9..36f7bc77 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file): class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 def test_groups_not_implemented(self, group_netcdf4_file): - f = h5py.File(group_netcdf4_file) with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file, f) + virtual_vars_from_hdf(group_netcdf4_file) + + def test_drop_variables(self, multiple_datasets_netcdf4_file): + variables = virtual_vars_from_hdf( + multiple_datasets_netcdf4_file, + ["data2"] + ) + assert "data2" not in variables.keys() From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:02:03 -0600 Subject: [PATCH 05/55] mypy fix to use ChunkKey and empty dimensions list. 
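For context on the keys that ChunkKey wraps (an illustrative sketch, not part of this patch): the manifest key for each chunk is its index along every dimension joined with dots, obtained by integer-dividing the chunk's element offset (as reported by h5py) by the chunk shape, which is the same arithmetic as get_key in _dataset_chunk_manifest. The shapes below are invented for the example.

    # A (100, 100) dataset stored in (50, 50) chunks has four chunks.
    chunk_offset = (50, 0)   # element offset at which one chunk starts (from h5py)
    chunks = (50, 50)
    key = ".".join(str(a // b) for a, b in zip(chunk_offset, chunks))
    assert key == "1.0"      # manifest key recorded for that chunk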
--- virtualizarr/readers/hdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c4ab2927..fdb9a77d 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Union import fsspec import h5py @@ -8,6 +8,8 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.zarr import ZArray +from virtualizarr.types import ChunkKey + def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: """ @@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: offset=dsid.get_offset(), length=dsid.get_storage_size() ) - chunk_entries = {key: chunk_entry} + chunk_key = ChunkKey(key) + chunk_entries = {chunk_key: chunk_entry} chunk_manifest = ChunkManifest( entries=chunk_entries ) @@ -75,7 +78,7 @@ def store_chunk_entry(blob): return chunk_manifest -def _dataset_dims(dataset: h5py.Dataset) -> List[str]: +def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: # In this case, we mimic netCDF4 and assign phony dimension names. # See https://github.com/fsspec/kerchunk/issues/41 dims.append(f"phony_dim_{n}") - return dims + return dims def _extract_attrs(dataset: h5py.Dataset): From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:40:52 -0600 Subject: [PATCH 06/55] Extract attributes from hdf5 root group. --- virtualizarr/readers/hdf.py | 18 +++++++++++++----- virtualizarr/tests/test_readers/conftest.py | 8 ++++++++ virtualizarr/tests/test_readers/test_hdf.py | 5 +++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index fdb9a77d..e02d03e7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: return dims -def _extract_attrs(dataset: h5py.Dataset): +def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): """ - Extract attributes from an HDF5 dataset. + Extract attributes from an HDF5 group or dataset. Parameters ---------- - dataset : h5py.Dataset - An HDF5 dataset. + h5obj : h5py.Group or h5py.Dataset + An HDF5 group or dataset. """ _HIDDEN_ATTRS = { "REFERENCE_LIST", @@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset): "_NCProperties", } attrs = {} - for n, v in dataset.attrs.items(): + for n, v in h5obj.attrs.items(): if n in _HIDDEN_ATTRS: continue # Fix some attribute values to avoid JSON encoding exceptions... 
@@ -207,3 +207,11 @@ def virtual_vars_from_hdf( raise NotImplementedError("Nested groups are not yet supported") return variables + + +def attrs_from_root_group(path: str): + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") + attrs = _extract_attrs(f) + return attrs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa2b0fe0..46ac7b2e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir): return filepath +@pytest.fixture +def root_attributes_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/root_attributes.nc" + f = h5py.File(filepath, "w") + f.attrs["attribute_name"] = "attribute_name" + return filepath + + @pytest.fixture def group_netcdf4_file(tmpdir): filepath = f"{tmpdir}/group.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 36f7bc77..a24e36ab 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file): attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + def test_root_attribute(self, root_attributes_netcdf4_file): + f = h5py.File(root_attributes_netcdf4_file) + attrs = _extract_attrs(f) + assert attrs["attribute_name"] == "attribute_name" + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 14:19:17 -0600 Subject: [PATCH 07/55] Use hdf reader for netcdf4 files. --- virtualizarr/xarray.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 5c3c8548..415b0a05 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,7 +8,8 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType +from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -76,18 +77,28 @@ def open_virtual_dataset( if common: raise ValueError(f"Cannot both load and drop variables {common}") + if filetype is None: + filetype = _automatically_determine_filetype(filepath) + filetype = FileType(filetype) + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables + ) + ds_attrs = attrs_from_root_group(path=filepath) # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? 
- vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + else: + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) if indexes is None or len(loadable_variables) > 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:57:39 +0000 Subject: [PATCH 08/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 415b0a05..2213ffa9 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -9,7 +9,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 8 May 2024 18:03:04 -0600 Subject: [PATCH 09/55] Fix ruff complaints. 
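As background for the reader entry points wired into open_virtual_dataset in PATCH 07 above, a minimal usage sketch; the file name and the dropped variable are hypothetical, and a local path is assumed.

    from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf

    # Returns xarray.Variables backed by ManifestArrays (byte ranges into the
    # file) rather than loaded arrays, plus the file's root-group attributes.
    variables = virtual_vars_from_hdf("example.nc", drop_variables=["skip_me"])
    attrs = attrs_from_root_group("example.nc")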
--- virtualizarr/readers/hdf.py | 3 +-- virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++--- virtualizarr/xarray.py | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index e02d03e7..af25c029 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,9 +6,8 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray - from virtualizarr.types import ChunkKey +from virtualizarr.zarr import ZArray def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a24e36ab..0d5a16db 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -1,9 +1,13 @@ import h5py import pytest -from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs, - virtual_vars_from_hdf) +from virtualizarr.readers.hdf import ( + _dataset_chunk_manifest, + _dataset_dims, + _dataset_to_variable, + _extract_attrs, + virtual_vars_from_hdf, +) class TestDatasetChunkManifest: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fbf6136f..9629a344 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -18,9 +18,13 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.kerchunk import ( + FileType, + KerchunkStoreRefs, + _automatically_determine_filetype, +) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 10 May 2024 15:10:30 -0600 Subject: [PATCH 10/55] First steps for handling HDF5 filters. 
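The essential translation this patch starts on, sketched roughly (not the patch's actual code): an HDF5 dataset's filter pipeline is mapped to numcodecs codec configurations looked up in the numcodecs registry, with HDF5's "gzip" filter handled as numcodecs' "zlib" codec.

    import numcodecs.registry as registry

    # HDF5 reports "gzip" plus a level; the equivalent numcodecs codec is "zlib".
    codec = registry.get_codec({"id": "zlib", "level": 1})
    compressed = codec.encode(b"raw chunk bytes")
    assert codec.decode(compressed) == b"raw chunk bytes"

Decoding a chunk read straight from the file with such a codec is exactly what the new test_hdf_filters test below checks.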
--- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 7 +- virtualizarr/readers/hdf_filters.py | 34 +++++++++ virtualizarr/tests/test_readers/conftest.py | 26 +++++++ .../tests/test_readers/test_hdf_filters.py | 31 ++++++++ .../test_readers/test_hdf_integration.py | 21 ++++++ virtualizarr/xarray.py | 71 +++++++++---------- 7 files changed, 153 insertions(+), 38 deletions(-) create mode 100644 virtualizarr/readers/hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py diff --git a/pyproject.toml b/pyproject.toml index 79a50789..4818b5f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "numpy", "ujson", "packaging", + "hdf5plugin", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index af25c029..7d95d996 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,6 +6,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.zarr import ZArray @@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, - compressor=dataset.compression, + compressor=None, dtype=dataset.dtype, fill_value=dataset.fillvalue, - filters=None, + filters=filters, order="C", shape=dataset.shape, zarr_format=2, diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py new file mode 100644 index 00000000..6070fc17 --- /dev/null +++ b/virtualizarr/readers/hdf_filters.py @@ -0,0 +1,34 @@ +from typing import List, Tuple, Union + +import h5py +import numcodecs.registry as registry +from numcodecs.abc import Codec + +_non_standard_filters = { + "gzip": "zlib" +} + + +def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: + try: + id = int(filter_id) + except ValueError: + id = filter_id + + if isinstance(id, str): + if id in _non_standard_filters.keys(): + id = _non_standard_filters[id] + conf = {"id": id} + if id == "zlib": + conf["level"] = filter_properties + + codec = registry.get_codec(conf) + return codec + + +def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: + codecs = [] + for filter_id, filter_properties in dataset._filters.items(): + codec = _filter_to_codec(filter_id, filter_properties) + codecs.append(codec) + return codecs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 46ac7b2e..4f0d4fce 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f.create_dataset(name="data2", data=data, chunks=None) return filepath + + +@pytest.fixture +def np_uncompressed(): + return np.arange(100) + + +@pytest.fixture +def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): + filepath = f"{tmpdir}/gzip.nc" + f = h5py.File(filepath, "w") + f.create_dataset(name="data", 
data=np_uncompressed, compression="gzip", compression_opts=1) + return filepath + + +@pytest.fixture +def gzip_filter_xarray_netcdf4_file(tmpdir): + ds = xr.tutorial.open_dataset("air_temperature") + encoding = {} + for var_name in ds.variables: + # encoding[var_name] = {"zlib": True, "compression_opts": 1} + encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + + filepath = f"{tmpdir}/gzip_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py new file mode 100644 index 00000000..50a5d08c --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -0,0 +1,31 @@ +import h5py +import numcodecs +import pytest + +from virtualizarr.readers.hdf_filters import ( + _filter_to_codec, + codecs_from_dataset, +) + + +class TestFilterToCodec: + def test_gzip_uses_zlib_nomcodec(self): + codec = _filter_to_codec("gzip", 1) + assert isinstance(codec, numcodecs.zlib.Zlib) + + def test_lzf_not_available(self): + with pytest.raises(ValueError, match="codec not available"): + _filter_to_codec("lzf") + + +class TestCodecsFromDataSet: + def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): + f = h5py.File(gzip_filter_netcdf4_file) + ds = f["data"] + chunk_info = ds.id.get_chunk_info(0) + codecs = codecs_from_dataset(ds) + with open(gzip_filter_netcdf4_file, 'rb') as file: + file.seek(chunk_info.byte_offset) + bytes_read = file.read(chunk_info.size) + decoded = codecs[0].decode(bytes_read) + assert decoded == np_uncompressed.tobytes() diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py new file mode 100644 index 00000000..45bfadcd --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -0,0 +1,21 @@ +import fsspec +import numpy +import xarray as xr + +import virtualizarr +from virtualizarr.kerchunk import FileType + + +class TestIntegration: + def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + virtual_ds = virtualizarr.open_virtual_dataset( + gzip_filter_xarray_netcdf4_file, + filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + assert isinstance(ds.air.values[0][0][0], numpy.float64) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 9629a344..24ba973a 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -128,48 +128,47 @@ def open_virtual_dataset( ) ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... 
- # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - ds = xr.open_dataset(filepath, drop_variables=drop_variables) - - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + ds = xr.open_dataset(filepath, drop_variables=drop_variables) + + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() else: - loadable_vars = {} - indexes = {} + indexes = dict(**indexes) # for type hinting: to allow mutation - vars = {**virtual_vars, **loadable_vars} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - data_vars, coords = separate_coords(vars, indexes) + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - return vds + return vds def open_virtual_dataset_from_v3_store( From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 13 May 2024 12:36:29 -0600 Subject: [PATCH 11/55] Initial step for hdf5plugin supported codecs. 
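A note on the Blosc handling added here (values taken from the new test case rather than any general specification): for dynamically loaded filters such as Blosc (HDF5 filter id 32001), the filter's client data arrives as a tuple, and its last four entries carry the blocksize, compression level, shuffle mode, and a numeric compressor code that hdf5plugin maps back to a name like "lz4".

    # Client data as seen for the blosc filter in the test below.
    filter_properties = (2, 2, 8, 800, 9, 2, 1)
    blocksize, clevel, shuffle, cname_code = filter_properties[-4:]
    print(blocksize, clevel, shuffle, cname_code)  # 800 9 2 1  (code 1 -> "lz4")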
--- virtualizarr/readers/hdf_filters.py | 25 +++++++++++++++ virtualizarr/tests/test_readers/conftest.py | 31 +++++++++++++------ .../tests/test_readers/test_hdf_filters.py | 20 +++++++++--- .../test_readers/test_hdf_integration.py | 7 +++-- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 6070fc17..75f06bdc 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,14 +1,30 @@ from typing import List, Tuple, Union import h5py +import hdf5plugin import numcodecs.registry as registry from numcodecs.abc import Codec +from pydantic import BaseModel, validator _non_standard_filters = { "gzip": "zlib" } +class BloscProperties(BaseModel): + blocksize: int + clevel: int + shuffle: int + cname: str + + @validator("cname", pre=True) + def get_cname_from_code(cls, v): + blosc_compressor_codes = { + value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + } + return blosc_compressor_codes[v] + + def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: try: id = int(filter_id) @@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None conf = {"id": id} if id == "zlib": conf["level"] = filter_properties + elif isinstance(id, int): + filter = hdf5plugin.get_filters(id)[0] + id = filter.filter_name + if id == "blosc": + blosc_props = BloscProperties(**{k: v for k, v in + zip(BloscProperties.__fields__.keys(), + filter_properties[-4:])}) + conf = blosc_props.model_dump() + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 4f0d4fce..cc9331e1 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,4 +1,5 @@ import h5py +import hdf5plugin import numpy as np import pytest import xarray as xr @@ -132,22 +133,32 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture -def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): - filepath = f"{tmpdir}/gzip.nc" +@pytest.fixture(params=["gzip", "blosc"]) +def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): + filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "gzip": + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "blosc": + f.create_dataset(name="data", data=np_uncompressed, + **hdf5plugin.Blosc( + cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE + )) return filepath -@pytest.fixture -def gzip_filter_xarray_netcdf4_file(tmpdir): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} + if request.param == "gzip": + encoding_config = { + "zlib": True, + "complevel": 1 + } for var_name in ds.variables: - # encoding[var_name] = {"zlib": True, "compression_opts": 1} - encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + encoding[var_name] = encoding_config - filepath = f"{tmpdir}/gzip_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git 
a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 50a5d08c..8094d4cf 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -9,7 +9,7 @@ class TestFilterToCodec: - def test_gzip_uses_zlib_nomcodec(self): + def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) @@ -17,14 +17,26 @@ def test_lzf_not_available(self): with pytest.raises(ValueError, match="codec not available"): _filter_to_codec("lzf") + def test_blosc(self): + codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) + assert isinstance(codec, numcodecs.blosc.Blosc) + expected_config = { + "id": "blosc", + "blocksize": 800, + "clevel": 9, + "shuffle": 2, + "cname": "lz4", + } + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: - def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): - f = h5py.File(gzip_filter_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(gzip_filter_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, 'rb') as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 45bfadcd..94fc0c1c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,12 +7,13 @@ class TestIntegration: - def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + def test_filters_end_to_end(self, tmpdir, + filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - gzip_filter_xarray_netcdf4_file, + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) - kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 16 May 2024 16:24:57 -0400 Subject: [PATCH 12/55] Small commit to check compression support in CI environment. 
--- pyproject.toml | 1 + virtualizarr/tests/test_readers/conftest.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4818b5f1..bba695eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ test = [ "scipy", "pooch", "ruff", + "netcdf4", ] diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cc9331e1..8dc82c33 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) +@pytest.fixture(params=["gzip", "blosc_lz"]) def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} @@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): "zlib": True, "complevel": 1 } + if request.param == "blosc_lz": + encoding_config = { + "compression": "blosc_lz", + } + for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 21:34:26 +0000 Subject: [PATCH 13/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/hdf.py | 16 ++++---------- virtualizarr/readers/hdf_filters.py | 22 ++++++++++++------- virtualizarr/tests/test_readers/conftest.py | 18 +++++++-------- virtualizarr/tests/test_readers/test_hdf.py | 5 +---- .../tests/test_readers/test_hdf_filters.py | 2 +- .../test_readers/test_hdf_integration.py | 6 ++--- virtualizarr/xarray.py | 5 ++--- 7 files changed, 33 insertions(+), 41 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 7d95d996..78e718e4 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) chunk_entry = ChunkEntry( - path=path, - offset=dsid.get_offset(), - length=dsid.get_storage_size() + path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) chunk_entries = {chunk_key: chunk_entry} - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() @@ -60,9 +56,7 @@ def get_key(blob): def store_chunk_entry(blob): chunk_entries[get_key(blob)] = ChunkEntry( - path=path, - offset=blob.byte_offset, - length=blob.size + path=path, offset=blob.byte_offset, length=blob.size ) has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) @@ -72,9 +66,7 @@ def store_chunk_entry(blob): for index in range(num_chunks): store_chunk_entry(dsid.get_chunk_info(index)) - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..77e7037e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ 
b/virtualizarr/readers/hdf_filters.py @@ -6,9 +6,7 @@ from numcodecs.abc import Codec from pydantic import BaseModel, validator -_non_standard_filters = { - "gzip": "zlib" -} +_non_standard_filters = {"gzip": "zlib"} class BloscProperties(BaseModel): @@ -20,12 +18,15 @@ class BloscProperties(BaseModel): @validator("cname", pre=True) def get_cname_from_code(cls, v): blosc_compressor_codes = { - value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + value: key + for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() } return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec( + filter_id: str, filter_properties: Union[int, Tuple] = None +) -> Codec: try: id = int(filter_id) except ValueError: @@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None filter = hdf5plugin.get_filters(id)[0] id = filter.filter_name if id == "blosc": - blosc_props = BloscProperties(**{k: v for k, v in - zip(BloscProperties.__fields__.keys(), - filter_properties[-4:])}) + blosc_props = BloscProperties( + **{ + k: v + for k, v in zip( + BloscProperties.__fields__.keys(), filter_properties[-4:] + ) + } + ) conf = blosc_props.model_dump() conf["id"] = id diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa66f933..53c9630e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + f.create_dataset( + name="data", data=np_uncompressed, compression="gzip", compression_opts=1 + ) if request.param == "blosc": - f.create_dataset(name="data", data=np_uncompressed, - **hdf5plugin.Blosc( - cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE - )) + f.create_dataset( + name="data", + data=np_uncompressed, + **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), + ) return filepath @@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": - encoding_config = { - "zlib": True, - "complevel": 1 - } + encoding_config = {"zlib": True, "complevel": 1} for var_name in ds.variables: encoding[var_name] = encoding_config diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 0d5a16db..a83bfc39 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file): virtual_vars_from_hdf(group_netcdf4_file) def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf( - multiple_datasets_netcdf4_file, - ["data2"] - ) + variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 8094d4cf..28b5d69f 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -36,7 +36,7 @@ def 
test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 94fc0c1c..b31289c0 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,11 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, - filter_encoded_xarray_netcdf4_files): + def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, - filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 72645654..d8b6a080 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -20,8 +20,8 @@ _automatically_determine_filetype, ) from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, @@ -109,8 +109,7 @@ def open_virtual_dataset( if filetype.name.lower() == "netcdf4": print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables + path=filepath, drop_variables=drop_variables ) ds_attrs = attrs_from_root_group(path=filepath) if filetype == "zarr_v3": From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:12:50 -0600 Subject: [PATCH 14/55] Fix mypy complaints for hdf_filters. 
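Background for the int/str split made in this patch (illustrative; assumes hdf5plugin is installed): filters surface either as names such as "gzip" or as numeric ids such as 32001 for registered plugins like Blosc, so _filter_to_codec has to treat the two forms separately. A numeric id can be resolved to a filter name through hdf5plugin, which is how the Blosc branch works.

    import hdf5plugin

    # 32001 is the registered HDF5 filter id for Blosc.
    print(hdf5plugin.get_filters(32001)[0].filter_name)  # "blosc"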
--- virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..7a8bcc81 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, TypedDict, Union import h5py import hdf5plugin @@ -25,26 +25,30 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec: + id_int = None + id_str = None try: - id = int(filter_id) + id_int = int(filter_id) except ValueError: - id = filter_id + id_str = filter_id - if isinstance(id, str): - if id in _non_standard_filters.keys(): - id = _non_standard_filters[id] + if id_str: + if id_str in _non_standard_filters.keys(): + id = _non_standard_filters[id_str] + else: + id = id_str conf = {"id": id} if id == "zlib": - conf["level"] = filter_properties - elif isinstance(id, int): - filter = hdf5plugin.get_filters(id)[0] + conf["level"] = filter_properties # type: ignore[assignment] + if id_int: + filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name - if id == "blosc": + if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties(**{k: v for k, v in zip(BloscProperties.__fields__.keys(), filter_properties[-4:])}) - conf = blosc_props.model_dump() + conf = blosc_props.model_dump() # type: ignore[assignment] conf["id"] = id codec = registry.get_codec(conf) From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:22:39 -0600 Subject: [PATCH 15/55] Local pre-commit fix for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a3868ebd..dfe1c1f3 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, TypedDict, Union +from typing import List, Tuple, Union import h5py import hdf5plugin From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:13:44 -0600 Subject: [PATCH 16/55] Use fsspec reader_options introduced in #37. 
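The reader_options dict introduced in this patch is forwarded to fsspec, so the same code path can read local files and object storage. A rough sketch of what that amounts to; the S3 URL is a placeholder and remote access additionally requires s3fs:

    import fsspec
    import h5py

    reader_options = {"storage_options": {"anon": True}}

    # fsspec turns the URL plus storage options into a file-like object that
    # h5py can read directly, fetching byte ranges on demand.
    openfile = fsspec.open(
        "s3://example-bucket/example.nc", mode="rb", **reader_options["storage_options"]
    )
    with openfile as f:
        h5f = h5py.File(f, mode="r")
        print(list(h5f.keys()))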
--- virtualizarr/readers/hdf.py | 22 ++++++++++++++++------ virtualizarr/xarray.py | 7 ++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 78e718e4..19d99b3f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,5 @@ from typing import List, Mapping, Optional, Union -import fsspec import h5py import numpy as np import xarray as xr @@ -8,6 +7,7 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: def virtual_vars_from_hdf( path: str, drop_variables: Optional[List[str]] = None, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, ) -> Mapping[str, xr.Variable]: if drop_variables is None: drop_variables = [] - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): @@ -203,9 +207,15 @@ def virtual_vars_from_hdf( return variables -def attrs_from_root_group(path: str): - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") +def attrs_from_root_group( + path: str, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, +): + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") attrs = _extract_attrs(f) return attrs diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8b6a080..8f810ee1 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -107,11 +107,12 @@ def open_virtual_dataset( filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": - print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, drop_variables=drop_variables + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, ) - ds_attrs = attrs_from_root_group(path=filepath) + ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:40:38 -0600 Subject: [PATCH 17/55] Fix incorrect zarr_v3 if block position from merge commit ef0d7a8. 
--- virtualizarr/xarray.py | 128 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 8f810ee1..d76e2a67 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -101,82 +101,86 @@ def open_virtual_dataset( if virtual_array_class is not ManifestArray: raise NotImplementedError() - - if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) - filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": - virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables, - reader_options=reader_options, - ) - ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) else: - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) + if filetype is None: + filetype = _automatically_determine_filetype(filepath=filepath) + filetype = FileType(filetype) + + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, + ) + ds_attrs = attrs_from_root_group( + path=filepath, reader_options=reader_options + ) + else: + # this is the only place we actually always need to use kerchunk directly + # TODO avoid even reading byte ranges for variables that will be dropped later anyway? + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get( + ".zattrs", {} + ) - ds = xr.open_dataset(fpath, drop_variables=drop_variables) + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... 
+ # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation + ds = xr.open_dataset(fpath, drop_variables=drop_variables) - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - vars = {**virtual_vars, **loadable_vars} + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - data_vars, coords = separate_coords(vars, indexes) - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - return vds + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds def open_virtual_dataset_from_v3_store( From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 08:48:05 -0600 Subject: [PATCH 18/55] Fix early return from hdf _extract_attrs. --- virtualizarr/readers/hdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 19d99b3f..be93237f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): continue attrs[n] = v - return attrs + return attrs def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 09:23:50 -0600 Subject: [PATCH 19/55] Test that _extract_attrs correctly handles multiple attributes. 
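The one-line change in the patch above is an indentation fix: the return statement sat inside the attribute loop, so only the first attribute was ever collected, which is exactly what the new multi-attribute test below exercises. A stripped-down illustration of the difference:

    def broken_extract(source: dict) -> dict:
        attrs = {}
        for name, value in source.items():
            attrs[name] = value
            return attrs  # early return: only the first attribute survives


    def fixed_extract(source: dict) -> dict:
        attrs = {}
        for name, value in source.items():
            attrs[name] = value
        return attrs  # returns once the loop has seen every attribute


    assert len(broken_extract({"a": 1, "b": 2})) == 1
    assert len(fixed_extract({"a": 1, "b": 2})) == 2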
--- virtualizarr/tests/test_readers/conftest.py | 3 ++- virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 53c9630e..fe2ec889 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attribute_netcdf4_file(tmpdir): +def string_attributes_netcdf4_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" + f["data"].attrs["attribute_name2"] = "attribute_name2" return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a83bfc39..a67352e6 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_dataset_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] - var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_netcdf4_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_string_attribute(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" @@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file): attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" + def test_multiple_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert len(attrs.keys()) == 2 + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 22 May 2024 14:16:12 -0600 Subject: [PATCH 20/55] Initial attempt at scale and offset via numcodecs. 
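The changes that follow express CF-style scale_factor/add_offset packing as a numcodecs FixedScaleOffset filter; note that the codec's scale is the reciprocal of the CF scale_factor attribute. A small standalone example with illustrative values:

    import numpy as np
    from numcodecs.fixedscaleoffset import FixedScaleOffset

    # Pack float64 values into int16 the way a CF writer with
    # scale_factor=0.01 and add_offset=5.0 would (codec scale = 1 / scale_factor).
    codec = FixedScaleOffset(offset=5.0, scale=1 / 0.01, dtype="<f8", astype="<i2")

    original = np.array([5.0, 5.25, 6.5])
    packed = codec.encode(original)   # int16 values: [0, 25, 150]
    unpacked = codec.decode(packed)   # float64 again
    np.testing.assert_allclose(unpacked, original)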
--- virtualizarr/readers/hdf.py | 14 ++++++++--- virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index be93237f..c251866b 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -5,7 +5,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.readers.hdf_filters import codecs_from_dataset +from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, - dtype=dataset.dtype, + dtype=dtype, fill_value=dataset.fillvalue, filters=filters, order="C", @@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index dfe1c1f3..169eab97 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,10 +1,13 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, TypedDict, Union import h5py import hdf5plugin import numcodecs.registry as registry +import numpy as np from numcodecs.abc import Codec +from numcodecs.fixedscaleoffset import FixedScaleOffset from pydantic import BaseModel, validator +from xarray.coding.variables import _choose_float_dtype _non_standard_filters = {"gzip": "zlib"} @@ -24,6 +27,11 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class CFCodec(TypedDict): + target_dtype: np.dtype + codec: Codec + + def _filter_to_codec( filter_id: str, filter_properties: Union[int, None, Tuple] = None ) -> Codec: @@ -61,6 +69,32 @@ def _filter_to_codec( return codec +def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: + attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} + mapping = {} + if "scale_factor" in attributes: + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + else: + mapping["scale_factor"] = 1 + if "add_offset" in attributes: + mapping["add_offset"] = attributes["add_offset"] + else: + mapping["add_offset"] = 0 + if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: + float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) + target_dtype = np.dtype(float_dtype) + codec = FixedScaleOffset( + offset=mapping["add_offset"], + scale=mapping["scale_factor"], + dtype=target_dtype, + astype=dataset.dtype, + ) + cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) + 
return cfcodec + else: + return None + + def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: codecs = [] for filter_id, filter_properties in dataset._filters.items(): From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 23 May 2024 10:54:48 -0600 Subject: [PATCH 21/55] Tests for cfcodec_from_dataset. --- virtualizarr/tests/test_readers/conftest.py | 10 +++++++ .../tests/test_readers/test_hdf_filters.py | 29 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index fe2ec889..202cdd9c 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/offset.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs.create(name="add_offset", data=5) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 28b5d69f..dca9f40d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,9 +1,11 @@ import h5py import numcodecs +import numpy as np import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, + cfcodec_from_dataset, codecs_from_dataset, ) @@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) assert decoded == np_uncompressed.tobytes() + + +class TestCFCodecFromDataset: + def test_no_cf_convention(self, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) + ds = f["data"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec is None + + def test_cf_scale_factor(self, netcdf4_file): + f = h5py.File(netcdf4_file) + ds = f["air"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec["target_dtype"] == np.dtype(np.float64) + assert cf_codec["codec"].scale == 100.0 + assert cf_codec["codec"].offset == 0 + assert cf_codec["codec"].dtype == " Date: Fri, 24 May 2024 12:47:12 -0600 Subject: [PATCH 22/55] Temporarily relax integration tests to assert_allclose. 
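The relaxation that follows swaps assert_equal for assert_allclose, presumably because the scale/offset decoding reconstructs floats from packed integers and bit-for-bit equality is no longer guaranteed. For reference, the distinction in xarray.testing:

    import numpy as np
    import xarray as xr
    import xarray.testing as xrt

    a = xr.Dataset({"air": ("t", np.array([280.0, 281.25]))})
    b = xr.Dataset({"air": ("t", np.array([280.0, 281.25 + 1e-9]))})

    xrt.assert_allclose(a, b)  # passes: equal within a relative tolerance
    # xrt.assert_equal(a, b)   # would raise: values are not identical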
--- virtualizarr/tests/test_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 064968b3..1b9aad83 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset @@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_open_scalar_variable(tmpdir): From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 13:50:49 -0600 Subject: [PATCH 23/55] Add blosc_lz4 fixture parameterization to confirm libnetcdf environment. --- virtualizarr/tests/test_readers/conftest.py | 13 +++++++++---- .../tests/test_readers/test_hdf_integration.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 202cdd9c..20d5433e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -134,7 +134,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc"]) +@pytest.fixture(params=["gzip", "blosc_lz4"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): f.create_dataset( name="data", data=np_uncompressed, compression="gzip", compression_opts=1 ) - if request.param == "blosc": + if request.param == "blosc_lz4": f.create_dataset( name="data", data=np_uncompressed, @@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_netcdf4_files(tmpdir, request): +@pytest.fixture(params=["gzip", "blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + if "blosc" in request.param: + encoding_config = { + "compression": request.param, + } for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index b31289c0..ade8e7ce 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): + def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, 
filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 14:05:21 -0600 Subject: [PATCH 24/55] Check for compatability with netcdf4 engine. --- virtualizarr/tests/test_readers/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 20d5433e..cb1212f0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 12:58:48 -0600 Subject: [PATCH 25/55] Use separate fixtures for h5netcdf and netcdf4 compression styles. --- virtualizarr/tests/test_readers/conftest.py | 27 ++++++++++++++----- .../test_readers/test_hdf_integration.py | 20 ++++++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cb1212f0..a4fafed3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from xarray.tests.test_dataset import create_test_data @pytest.fixture @@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip", "blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_h5netcdf_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + + for var_name in ds.variables: + encoding[var_name] = encoding_config + + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath + + +@pytest.fixture(params=["blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): + ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { "compression": request.param, + "chunksizes": (20, 40), + "original_shape": ds.var2.shape, + "blosc_shuffle": 1, + "fletcher32": False, } - for var_name in ds.variables: - encoding[var_name] = encoding_config - + ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index ade8e7ce..d6ecf2f1 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,11 @@ class TestIntegration: - def test_filters_roundtrip(self, tmpdir, 
filter_encoded_xarray_netcdf4_file): + def test_filters_h5netcdf_roundtrip( + self, tmpdir, filter_encoded_xarray_h5netcdf_file + ): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + + def test_filters_netcdf4_roundtrip( + self, tmpdir, filter_encoded_xarray_netcdf4_file + ): + virtual_ds = virtualizarr.open_virtual_dataset( + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + print(ds["var2"].encoding) From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 15:49:22 -0600 Subject: [PATCH 26/55] Print libhdf5 and libnetcdf4 versions to confirm compiled environment. --- virtualizarr/tests/test_readers/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index a4fafed3..8904dd38 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -4,6 +4,7 @@ import pytest import xarray as xr from xarray.tests.test_dataset import create_test_data +from xarray.util.print_versions import netcdf_and_hdf5_versions @pytest.fixture @@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" + print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") return filepath From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 17:07:51 -0600 Subject: [PATCH 27/55] Skip netcdf4 style compression tests when libhdf5 < 1.14. 
--- virtualizarr/tests/test_readers/conftest.py | 15 ++++++++++++--- .../test_readers/test_hdf_integration.py | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 8904dd38..0ddb2a01 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from packaging.version import Version from xarray.tests.test_dataset import create_test_data from xarray.util.print_versions import netcdf_and_hdf5_versions @@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request): return filepath +@pytest.fixture() +def skip_test_for_libhdf5_version(): + versions = netcdf_and_hdf5_versions() + libhdf5_version = Version(versions[0][1]) + return libhdf5_version < Version("1.14") + + @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): + if skip_test_for_libhdf5_version: + pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { @@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") - return filepath + return {"filepath": filepath, "compressor": request.param} @pytest.fixture diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index d6ecf2f1..f51ebd45 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip( def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): + filepath = filter_encoded_xarray_netcdf4_file["filepath"] + compressor = filter_encoded_xarray_netcdf4_file["compressor"] virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filepath, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - print(ds["var2"].encoding) + + expected_encoding = ds["var2"].encoding.copy() + compression = expected_encoding.pop("compression") + blosc_shuffle = expected_encoding.pop("blosc_shuffle") + if compression is not None: + if "blosc" in compression and blosc_shuffle: + expected_encoding["blosc"] = { + "compressor": compressor, + "shuffle": blosc_shuffle, + } + expected_encoding["shuffle"] = False + actual_encoding = ds["var2"].encoding + assert expected_encoding.items() <= actual_encoding.items() From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 13:57:51 -0600 Subject: [PATCH 28/55] Include imagecodecs.numcodecs to support HDF5 lzf filters. 
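HDF5's built-in lzf filter has no counterpart in numcodecs itself, which is why the mapping below routes it to the imagecodecs-numcodecs package. A sketch of using that codec directly, assuming imagecodecs-numcodecs is installed and that its default header setting embeds the uncompressed size:

    import numpy as np
    from imagecodecs.numcodecs import Lzf

    codec = Lzf()
    data = np.arange(100, dtype="int32").tobytes()

    # encode() LZF-compresses the buffer; decode() reverses it.
    roundtripped = codec.decode(codec.encode(data))
    assert bytes(roundtripped) == data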
--- pyproject.toml | 1 + virtualizarr/readers/hdf_filters.py | 2 +- virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f0563f09..773cccc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "fsspec", "s3fs", "fastparquet", + "imagecodecs-numcodecs", ] diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 169eab97..08a3bba4 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib"} +_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} class BloscProperties(BaseModel): diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index dca9f40d..b5b04047 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,7 +1,7 @@ import h5py +import imagecodecs import numcodecs import numpy as np -import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, @@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) - def test_lzf_not_available(self): - with pytest.raises(ValueError, match="codec not available"): - _filter_to_codec("lzf") + def test_lzf(self): + codec = _filter_to_codec("lzf") + assert isinstance(codec, imagecodecs.numcodecs.Lzf) def test_blosc(self): codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 16:24:30 -0600 Subject: [PATCH 29/55] Remove test that verifies call to read_kerchunk_references_from_file. --- virtualizarr/tests/test_xarray.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 695759bd..d145550e 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,5 +1,4 @@ from collections.abc import Mapping -from unittest.mock import patch import numpy as np import pytest @@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file): for name in full_ds.variables: if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 15:05:34 -0600 Subject: [PATCH 30/55] Add additional codec support structures for imagecodecs and numcodecs. 
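For dynamically loaded plugin filters the reader looks the numeric HDF5 filter id up through hdf5plugin and then branches on the returned filter name, as the next diff shows. A quick illustration; id 32015 is the registered Zstandard filter:

    import hdf5plugin

    # Map a registered HDF5 filter id back to its plugin and name.
    zstd_filter = hdf5plugin.get_filters(32015)[0]
    print(zstd_filter.filter_name)  # "zstd"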
--- virtualizarr/readers/hdf_filters.py | 23 +++++++++++++++++---- virtualizarr/tests/test_readers/conftest.py | 9 +++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 08a3bba4..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,12 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} +_non_standard_filters = { + "gzip": "zlib", + "lzf": "imagecodecs_lzf", +} + +_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} class BloscProperties(BaseModel): @@ -27,6 +32,10 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class ZstdProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -41,18 +50,20 @@ def _filter_to_codec( id_int = int(filter_id) except ValueError: id_str = filter_id - + conf = {} if id_str: if id_str in _non_standard_filters.keys(): id = _non_standard_filters[id_str] else: id = id_str - conf = {"id": id} + conf["id"] = id # type: ignore[assignment] if id == "zlib": conf["level"] = filter_properties # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name + if id in _hdf5plugin_imagecodecs.keys(): + id = _hdf5plugin_imagecodecs[id] if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties( **{ @@ -63,7 +74,11 @@ def _filter_to_codec( } ) conf = blosc_props.model_dump() # type: ignore[assignment] - conf["id"] = id + if id == "zstd" and isinstance(filter_properties, tuple): + zstd_props = ZstdProperties(level=filter_properties[0]) + conf = zstd_props.model_dump() # type: ignore[assignment] + + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 0ddb2a01..3e6f9c3f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): data=np_uncompressed, **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), ) + if request.param == "lz4": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) + if request.param == "bzip2": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) + if request.param == "zstd": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + return filepath From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 16:17:04 -0600 Subject: [PATCH 31/55] Add codec config test for Zstd. 
--- virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index b5b04047..4d23a756 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -31,6 +31,12 @@ def test_blosc(self): } assert codec.get_config() == expected_config + def test_zstd(self): + codec = _filter_to_codec("32015", (5,)) + assert isinstance(codec, numcodecs.zstd.Zstd) + expected_config = {"id": "zstd", "level": 5} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 18:03:26 -0600 Subject: [PATCH 32/55] Include initial cf decoding tests. --- virtualizarr/readers/hdf_filters.py | 3 +- virtualizarr/tests/test_readers/conftest.py | 34 ++++++++++++++++--- .../tests/test_readers/test_hdf_filters.py | 28 +++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..f4e2dcfa 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"] + # mapping["scale_factor"] =attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 3e6f9c3f..e1a53c5e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve @pytest.fixture -def add_offset_netcdf4_file(tmpdir): +def np_uncompressed_int16(): + return np.arange(100, dtype=np.int16) + + +@pytest.fixture +def offset(): + return np.float32(5.0) + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f["data"].attrs.create(name="add_offset", data=5) + data = np_uncompressed_int16 - offset + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + return filepath + + +@pytest.fixture +def scale_factor(): + return 0.01 + + +@pytest.fixture +def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): + filepath = f"{tmpdir}/scale_offset.nc" + f = h5py.File(filepath, "w") + data = (np_uncompressed_int16 - offset) / scale_factor + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 4d23a756..960bcf2c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ 
b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file): assert cf_codec["codec"].scale == 1 assert cf_codec["codec"].offset == 5 assert cf_codec["codec"].dtype == " Date: Thu, 20 Jun 2024 19:49:54 -0600 Subject: [PATCH 33/55] Revert typo for scale_factor retrieval. --- virtualizarr/readers/hdf_filters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index f4e2dcfa..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"] - # mapping["scale_factor"] =attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 20:12:44 -0600 Subject: [PATCH 34/55] Update reader to use new numpy manifest representation. --- virtualizarr/readers/hdf.py | 29 ++++++++++----------- virtualizarr/tests/test_readers/test_hdf.py | 4 +-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c251866b..b96bdff7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry} + chunk_entries = {chunk_key: chunk_entry.dict()} chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") + paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty((num_chunks), dtype=np.int32) + lengths = np.empty((num_chunks), dtype=np.int32) - chunk_entries = dict() - - def get_key(blob): - key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] - key = ".".join(map(str, key_list)) - return key - - def store_chunk_entry(blob): - chunk_entries[get_key(blob)] = ChunkEntry( - path=path, offset=blob.byte_offset, length=blob.size - ) + def add_chunk_info(blob, chunk_index): + offsets[chunk_index] = blob.byte_offset + lengths[chunk_index] = blob.size + chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - dsid.chunk_iter(store_chunk_entry) + chunk_index = 0 + dsid.chunk_iter(add_chunk_info, chunk_index) else: for index in range(num_chunks): - store_chunk_entry(dsid.get_chunk_info(index)) + add_chunk_info(dsid.get_chunk_info(index), index) - chunk_manifest = ChunkManifest(entries=chunk_entries) + chunk_manifest = ChunkManifest.from_arrays( + paths=paths, offsets=offsets, lengths=lengths + ) return chunk_manifest diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a67352e6..8c5a40a7 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = 
h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 1 + assert len(manifest) == 1 def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 4 + assert len(manifest) == 4 class TestDatasetDims: From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:05:01 -0600 Subject: [PATCH 35/55] Temporarily skip test until blosc netcdf4 issue is solved. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f51ebd45..dca34dbd 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,5 +1,6 @@ import fsspec import numpy +import pytest import xarray as xr import virtualizarr @@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip( ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + @pytest.mark.skip( + reason="Issue with xr 'dim1' serialization and blosc availability" + ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:24:07 -0600 Subject: [PATCH 36/55] Fix Pydantic 2 migration warnings. --- virtualizarr/readers/hdf_filters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..cc8e810e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,7 +6,7 @@ import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from xarray.coding.variables import _choose_float_dtype _non_standard_filters = { @@ -23,7 +23,7 @@ class BloscProperties(BaseModel): shuffle: int cname: str - @validator("cname", pre=True) + @field_validator("cname", mode="before") def get_cname_from_code(cls, v): blosc_compressor_codes = { value: key @@ -69,7 +69,7 @@ def _filter_to_codec( **{ k: v for k, v in zip( - BloscProperties.__fields__.keys(), filter_properties[-4:] + BloscProperties.model_fields.keys(), filter_properties[-4:] ) } ) From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:30:55 -0600 Subject: [PATCH 37/55] Include hdf5plugin and imagecodecs-numcodecs in mamba test environment. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 0385ea5a..e909beec 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,6 +14,7 @@ dependencies: - ujson - packaging - universal_pathlib + - hdf5plugin # Testing - codecov - pre-commit @@ -26,3 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet + - imagecodecs-numcodecs From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:35:24 -0600 Subject: [PATCH 38/55] Mamba attempt with imagecodecs rather than imagecodecs-numcodecs. 
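The pydantic change in patch 36 above replaces the deprecated v1 validator with v2's field_validator. A self-contained sketch of the v2 form; the model here is a made-up stand-in rather than the real BloscProperties:

    from pydantic import BaseModel, field_validator


    class CompressorName(BaseModel):
        cname: str

        # v1's @validator("cname", pre=True) becomes
        # @field_validator("cname", mode="before") in pydantic v2.
        @field_validator("cname", mode="before")
        @classmethod
        def cname_from_code(cls, v):
            return {1: "lz4"}.get(v, v)


    assert CompressorName(cname=1).cname == "lz4"
    assert CompressorName(cname="zstd").cname == "zstd"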
--- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index e909beec..20784a6e 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs-numcodecs + - imagecodecs From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:41:14 -0600 Subject: [PATCH 39/55] Mamba attempt with latest imagecodecs release. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 20784a6e..fb967bcd 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs + - imagecodecs>=2024.6.1 From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:34:35 -0600 Subject: [PATCH 40/55] Use correct iter_chunks callback function signtature. --- virtualizarr/readers/hdf.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index b96bdff7..d082b717 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: def add_chunk_info(blob, chunk_index): offsets[chunk_index] = blob.byte_offset lengths[chunk_index] = blob.size - chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - chunk_index = 0 - dsid.chunk_iter(add_chunk_info, chunk_index) + + def create_callback(initial=0): + value = initial + + def callback(blob): + nonlocal value + add_chunk_info(blob, chunk_index=value) + value += 1 + + return callback + + callback = create_callback() + dsid.chunk_iter(callback) else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index), index) From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:35:40 -0600 Subject: [PATCH 41/55] Include pip based imagecodecs-numcodecs until conda-forge availability. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index fb967bcd..e2f5a865 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -28,3 +28,5 @@ dependencies: - s3fs - fastparquet - imagecodecs>=2024.6.1 + - pip: + - imagecodecs-numcodecs From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 15:43:10 -0600 Subject: [PATCH 42/55] Handle non-coordinate dims which are serialized to hdf as empty dataset. --- virtualizarr/readers/hdf.py | 65 ++++++++++++--------- virtualizarr/tests/test_integration.py | 18 +++++- virtualizarr/tests/test_readers/test_hdf.py | 1 + virtualizarr/xarray.py | 2 +- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d082b717..cbbe824f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -11,7 +11,9 @@ from virtualizarr.zarr import ZArray -def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: +def _dataset_chunk_manifest( + path: str, dataset: h5py.Dataset +) -> Optional[ChunkManifest]: """ Generate ChunkManifest for HDF5 dataset. 
@@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: if dataset.chunks is None: if dsid.get_offset() is None: - raise ValueError("Dataset has no space allocated in the file") + return None else: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) @@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): return attrs -def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.append(cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - else: - dtype = dataset.dtype - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=dataset.fillvalue, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) + manifest = _dataset_chunk_manifest(path, dataset) - marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + if manifest: + chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype + filters = [codec.get_config() for codec in codecs] + zarray = ZArray( + chunks=chunks, + compressor=None, + dtype=dtype, + fill_value=dataset.fillvalue, + filters=filters, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + else: + variable = None return variable @@ -217,7 +223,8 @@ def virtual_vars_from_hdf( if key not in drop_variables: if isinstance(f[key], h5py.Dataset): variable = _dataset_to_variable(path, f[key]) - variables[key] = variable + if variable is not None: + variables[key] = variable else: raise NotImplementedError("Nested groups are not yet supported") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 451862c6..6a1f91ef 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset + # assert all_close to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", 
decode_times=False ) - # assert identical to original dataset - xrt.assert_identical(roundtrip, ds) + # assert all_close to original dataset + xrt.assert_allclose(roundtrip, ds) + + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 @@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format): # assert equal to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100 diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 8c5a40a7..c744cd68 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file): with pytest.raises(ValueError, match="chunked but contains no chunks"): _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") def test_empty_dataset(self, empty_dataset_netcdf4_file): f = h5py.File(empty_dataset_netcdf4_file) ds = f["data"] diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 39bd0671..a8a23693 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -121,7 +121,7 @@ def open_virtual_dataset( ds_attrs = attrs_from_root_group( path=filepath, reader_options=reader_options ) - coord_names = None + coord_names = ds_attrs.pop("coordinates", []) else: # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 16:23:43 -0600 Subject: [PATCH 43/55] Use reader_options for filetype check and update failing kerchunk call. 
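Note: with this change, reader_options is threaded through both the filetype check and the HDF reader instead of only the kerchunk path. A minimal usage sketch (the option keys below are placeholders, not options defined by VirtualiZarr, and the file path is hypothetical):

    from virtualizarr import open_virtual_dataset

    # reader_options is passed through unchanged to
    # _automatically_determine_filetype and to the HDF reader.
    reader_options = {"option1": "value1", "option2": "value2"}
    vds = open_virtual_dataset(
        "air.nc",  # hypothetical local netCDF4 file
        indexes={},
        reader_options=reader_options,
    )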
--- virtualizarr/tests/test_xarray.py | 18 +++++++++++++----- virtualizarr/xarray.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583bf..282d4ad1 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -8,6 +8,7 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset +from virtualizarr.kerchunk import FileType from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import network, requires_s3fs from virtualizarr.zarr import ZArray @@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") + @patch("virtualizarr.xarray._automatically_determine_filetype") + @patch("virtualizarr.xarray.virtual_vars_from_hdf") def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file + self, mock_reader, mock_determine_filetype, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} + mock_determine_filetype.return_value = FileType.netcdf4 open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { + reader_args = { + "path": netcdf4_file, + "drop_variables": [], + "reader_options": reader_options, + } + mock_reader.assert_called_once_with(**reader_args) + filetype_args = { "filepath": netcdf4_file, - "filetype": None, "reader_options": reader_options, } - mock_read_kerchunk.assert_called_once_with(**args) + mock_determine_filetype.assert_called_once_with(**filetype_args) class TestRenamePaths: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index a8a23693..86a59c8d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -109,7 +109,9 @@ def open_virtual_dataset( ) else: if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:20:06 -0600 Subject: [PATCH 44/55] Fix chunkmanifest shaping for chunked datasets. 
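Note: the manifest path/offset/length arrays are now shaped like the chunk grid rather than kept flat, using ceiling division so a partial edge chunk still gets its own grid cell. A small sketch of that computation with the shapes used in the tests below:

    import math

    # A (100, 100) dataset stored with (50, 50) chunks has a (2, 2) chunk grid.
    shape, chunks = (100, 100), (50, 50)
    assert tuple(math.ceil(a / b) for a, b in zip(shape, chunks)) == (2, 2)

    # The chunked roundtrip fixture: (20, 80) data with (10, 10) chunks -> (2, 8).
    shape, chunks = (20, 80), (10, 10)
    assert tuple(math.ceil(a / b) for a, b in zip(shape, chunks)) == (2, 8)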
--- virtualizarr/readers/hdf.py | 36 +++++++++------------ virtualizarr/tests/test_readers/test_hdf.py | 10 ++++-- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index cbbe824f..d683f693 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,3 +1,4 @@ +import math from typing import List, Mapping, Optional, Union import h5py @@ -48,32 +49,27 @@ def _dataset_chunk_manifest( num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") - paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty((num_chunks), dtype=np.int32) - lengths = np.empty((num_chunks), dtype=np.int32) - def add_chunk_info(blob, chunk_index): - offsets[chunk_index] = blob.byte_offset - lengths[chunk_index] = blob.size + shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) + paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty(shape, dtype=np.int32) + lengths = np.empty(shape, dtype=np.int32) - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - - def create_callback(initial=0): - value = initial + def get_key(blob): + return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]) - def callback(blob): - nonlocal value - add_chunk_info(blob, chunk_index=value) - value += 1 + def add_chunk_info(blob): + key = get_key(blob) + paths[key] = path + offsets[key] = blob.byte_offset + lengths[key] = blob.size - return callback - - callback = create_callback() - dsid.chunk_iter(callback) + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(add_chunk_info) else: for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index), index) + add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( paths=paths, offsets=offsets, lengths=lengths diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index c744cd68..25caab93 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest) == 1 + assert manifest.shape_chunk_grid == (1, 1) def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest) == 4 + assert manifest.shape_chunk_grid == (2, 2) + + def test_chunked_roundtrip(self, chunked_roundtrip): + f = h5py.File(chunked_roundtrip) + ds = f["var2"] + manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:22:05 -0600 Subject: [PATCH 45/55] Handle scale_factor attribute serialization for compressed files. 
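Note: depending on how the file was written, the scale_factor attribute can come back as a length-one array or as a scalar, so the lookup now falls back on IndexError. A minimal sketch of the same fallback outside the reader (the helper name is illustrative):

    import numpy as np

    def first_or_value(attr_value):
        # Mirrors the try/except used in cfcodec_from_dataset: take the first
        # element when the attribute is array-valued, otherwise use it as is.
        try:
            return attr_value[0]
        except IndexError:
            return attr_value

    assert first_or_value(np.array([0.1])) == 0.1  # array-valued attribute
    assert first_or_value(np.array(0.1)) == 0.1    # scalar (0-d) attribute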
--- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index cc8e810e..1a3c2220 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + try: + scale_factor = attributes["scale_factor"][0] + except IndexError: + scale_factor = attributes["scale_factor"] + mapping["scale_factor"] = 1 / scale_factor else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:13 -0600 Subject: [PATCH 46/55] Include chunked roundtrip fixture. --- virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index e1a53c5e..5fbec00e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve "blosc_shuffle": 1, "fletcher32": False, } - + # Check on how handle scalar dim. + ds = ds.drop_dims("dim3") ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="netcdf4") @@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f f["data"].attrs.create(name="add_offset", data=offset) f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath + + +@pytest.fixture() +def chunked_roundtrip(tmpdir): + ds = create_test_data(dim_sizes=(20, 80, 10)) + ds = ds.drop_dims("dim3") + filepath = f"{tmpdir}/chunked_xarray.nc" + ds.to_netcdf( + filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} + ) + return filepath From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:48 -0600 Subject: [PATCH 47/55] Standardize xarray integration tests for hdf filters. 
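Note: the integration tests below now all share one roundtrip pattern. A condensed sketch of it (the input path is a placeholder for the filter-encoded fixture files used in the tests):

    import xarray as xr
    import xarray.testing as xrt
    import virtualizarr

    filepath = "filtered.nc"  # placeholder fixture path
    ds = xr.open_dataset(filepath)
    vds = virtualizarr.open_virtual_dataset(filepath)
    # Write kerchunk JSON references, then re-open them and compare.
    vds.virtualize.to_kerchunk("refs.json", format="json")
    roundtrip = xr.open_dataset("refs.json", engine="kerchunk")
    xrt.assert_allclose(ds, roundtrip)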
--- .../test_readers/test_hdf_integration.py | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dca34dbd..abc23df6 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,53 +1,34 @@ -import fsspec -import numpy import pytest import xarray as xr +import xarray.testing as xrt import virtualizarr from virtualizarr.kerchunk import FileType class TestIntegration: + @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - virtual_ds = virtualizarr.open_virtual_dataset( + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + vds = virtualizarr.open_virtual_dataset( filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - - ds = xr.open_dataset(m, engine="kerchunk") - assert isinstance(ds.air.values[0][0][0], numpy.float64) + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset( + kerchunk_file, engine="kerchunk", decode_times=False + ) + xrt.assert_allclose(ds, roundtrip) - @pytest.mark.skip( - reason="Issue with xr 'dim1' serialization and blosc availability" - ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): filepath = filter_encoded_xarray_netcdf4_file["filepath"] - compressor = filter_encoded_xarray_netcdf4_file["compressor"] - virtual_ds = virtualizarr.open_virtual_dataset( - filepath, filetype=FileType("netcdf4") - ) + ds = xr.open_dataset(filepath) + vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - - expected_encoding = ds["var2"].encoding.copy() - compression = expected_encoding.pop("compression") - blosc_shuffle = expected_encoding.pop("blosc_shuffle") - if compression is not None: - if "blosc" in compression and blosc_shuffle: - expected_encoding["blosc"] = { - "compressor": compressor, - "shuffle": blosc_shuffle, - } - expected_encoding["shuffle"] = False - actual_encoding = ds["var2"].encoding - assert expected_encoding.items() <= actual_encoding.items() + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 20:00:56 -0600 Subject: [PATCH 48/55] Update reader selection logic for new filetype determination. 
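Note: with the new detection, both netCDF4 and plain HDF5 files should be routed to the HDF reader. A minimal sketch of the dispatch condition added below; it assumes FileType defines an hdf5 member alongside netcdf4 (added by the filetype-detection work, not shown in this patch):

    from virtualizarr.kerchunk import FileType

    def uses_hdf_reader(filetype: FileType) -> bool:
        # Equivalent to the branch in open_virtual_dataset after this change.
        return filetype.name.lower() in ("netcdf4", "hdf5")

    assert uses_hdf_reader(FileType("netcdf4"))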
--- virtualizarr/xarray.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1a795e56..9671264d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -136,8 +136,7 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": + if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5": virtual_vars = virtual_vars_from_hdf( path=filepath, drop_variables=drop_variables, From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:07:17 -0600 Subject: [PATCH 49/55] Use decode_times for integration test. --- .../tests/test_readers/test_hdf_integration.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index abc23df6..882dea31 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -7,19 +6,18 @@ class TestIntegration: - @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, + loadable_variables=["time"], + cftime_variables=["time"], ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset( - kerchunk_file, engine="kerchunk", decode_times=False - ) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:38:46 -0600 Subject: [PATCH 50/55] Standardize fixture names for hdf5 vs netcdf4 file types. 
--- virtualizarr/tests/test_readers/conftest.py | 36 +++++---- virtualizarr/tests/test_readers/test_hdf.py | 78 +++++++++---------- .../tests/test_readers/test_hdf_filters.py | 26 +++---- .../test_readers/test_hdf_integration.py | 10 +-- 4 files changed, 76 insertions(+), 74 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 5fbec00e..539b2fbb 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -9,7 +9,7 @@ @pytest.fixture -def empty_chunks_netcdf4_file(tmpdir): +def empty_chunks_hdf5_file(tmpdir): ds = xr.Dataset({"data": []}) filepath = f"{tmpdir}/empty_chunks.nc" ds.to_netcdf(filepath, engine="h5netcdf") @@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir): @pytest.fixture -def empty_dataset_netcdf4_file(tmpdir): +def empty_dataset_hdf5_file(tmpdir): filepath = f"{tmpdir}/empty_dataset.nc" f = h5py.File(filepath, "w") f.create_dataset("data", shape=(0,), dtype="f") @@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir): @pytest.fixture -def no_chunks_netcdf4_file(tmpdir): +def no_chunks_hdf5_file(tmpdir): filepath = f"{tmpdir}/no_chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir): @pytest.fixture -def chunked_netcdf4_file(tmpdir): +def chunked_hdf5_file(tmpdir): filepath = f"{tmpdir}/chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) @@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir): @pytest.fixture -def single_dimension_scale_netcdf4_file(tmpdir): +def single_dimension_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/single_dimension_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir): @pytest.fixture -def is_scale_netcdf4_file(tmpdir): +def is_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/is_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir): @pytest.fixture -def multiple_dimension_scales_netcdf4_file(tmpdir): +def multiple_dimension_scales_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_dimension_scales.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attributes_netcdf4_file(tmpdir): +def string_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir): @pytest.fixture -def root_attributes_netcdf4_file(tmpdir): +def root_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/root_attributes.nc" f = h5py.File(filepath, "w") f.attrs["attribute_name"] = "attribute_name" @@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir): @pytest.fixture -def group_netcdf4_file(tmpdir): +def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") f.create_group("group") @@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir): @pytest.fixture -def multiple_datasets_netcdf4_file(tmpdir): +def multiple_datasets_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_datasets.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -138,7 +138,7 @@ def np_uncompressed(): @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) -def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): +def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = 
f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": @@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): @pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_h5netcdf_file(tmpdir, request): +def filter_encoded_roundtrip_hdf5_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": @@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version(): @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): +def filter_encoded_roundtrip_netcdf4_file( + tmpdir, request, skip_test_for_libhdf5_version +): if skip_test_for_libhdf5_version: pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) @@ -215,7 +217,7 @@ def offset(): @pytest.fixture -def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): +def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") data = np_uncompressed_int16 - offset @@ -230,7 +232,7 @@ def scale_factor(): @pytest.fixture -def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): +def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): filepath = f"{tmpdir}/scale_offset.nc" f = h5py.File(filepath, "w") data = (np_uncompressed_int16 - offset) / scale_factor @@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f @pytest.fixture() -def chunked_roundtrip(tmpdir): +def chunked_roundtrip_hdf5_file(tmpdir): ds = create_test_data(dim_sizes=(20, 80, 10)) ds = ds.drop_dims("dim3") filepath = f"{tmpdir}/chunked_xarray.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 25caab93..1fb0f6ee 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -11,59 +11,59 @@ class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_netcdf4_file): - f = h5py.File(empty_chunks_netcdf4_file) + def test_empty_chunks(self, empty_chunks_hdf5_file): + f = h5py.File(empty_chunks_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="chunked but contains no chunks"): - _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds) @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") - def test_empty_dataset(self, empty_dataset_netcdf4_file): - f = h5py.File(empty_dataset_netcdf4_file) + def test_empty_dataset(self, empty_dataset_hdf5_file): + f = h5py.File(empty_dataset_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="no space allocated in the file"): - _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds) - def test_no_chunking(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_chunking(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (1, 1) - def test_chunked(self, chunked_netcdf4_file): - f = h5py.File(chunked_netcdf4_file) + def test_chunked(self, chunked_hdf5_file): + f = h5py.File(chunked_hdf5_file) ds = f["data"] - 
manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 2) - def test_chunked_roundtrip(self, chunked_roundtrip): - f = h5py.File(chunked_roundtrip) + def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): + f = h5py.File(chunked_roundtrip_hdf5_file) ds = f["var2"] - manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "x" - def test_is_dimension_scale(self, is_scale_netcdf4_file): - f = h5py.File(is_scale_netcdf4_file) + def test_is_dimension_scale(self, is_scale_hdf5_file): + f = h5py.File(is_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "data" - def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): - f = h5py.File(multiple_dimension_scales_netcdf4_file) + def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): + f = h5py.File(multiple_dimension_scales_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="dimension scales attached"): _dataset_dims(ds) - def test_no_dimension_scales(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_dimension_scales(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims == ["phony_dim_0", "phony_dim_1"] @@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) assert var.chunks == (50, 50) - def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] - var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_dataset_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] - var = _dataset_to_variable(string_attributes_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_hdf5_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_string_attribute(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" - def test_root_attribute(self, root_attributes_netcdf4_file): - f = h5py.File(root_attributes_netcdf4_file) + def test_root_attribute(self, root_attributes_hdf5_file): + f = h5py.File(root_attributes_hdf5_file) attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" - def 
test_multiple_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_multiple_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert len(attrs.keys()) == 2 @@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_netcdf4_file): + def test_groups_not_implemented(self, group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file) + virtual_vars_from_hdf(group_hdf5_file) - def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) + def test_drop_variables(self, multiple_datasets_hdf5_file): + variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 960bcf2c..99b3af48 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -39,12 +39,12 @@ def test_zstd(self): class TestCodecsFromDataSet: - def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, "rb") as file: + with open(filter_encoded_hdf5_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) @@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): class TestCFCodecFromDataset: - def test_no_cf_convention(self, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_no_cf_convention(self, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] cf_codec = cfcodec_from_dataset(ds) assert cf_codec is None @@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file): assert cf_codec["codec"].dtype == " Date: Sun, 30 Jun 2024 22:14:26 -0600 Subject: [PATCH 51/55] Handle array add_offset property for compressed data. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 1a3c2220..5b35d8ff 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: else: mapping["scale_factor"] = 1 if "add_offset" in attributes: - mapping["add_offset"] = attributes["add_offset"] + try: + offset = attributes["add_offset"][0] + except IndexError: + offset = attributes["add_offset"] + mapping["add_offset"] = offset else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 16:57:11 -0600 Subject: [PATCH 52/55] Include h5py shuffle filter. 
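Note: the shuffle filter's properties arrive as a tuple whose first element is treated as the element size in bytes, and that maps onto numcodecs' Shuffle codec. A small sketch mirroring the new unit test:

    import numcodecs

    # ("shuffle", (7,)) from the HDF5 filter pipeline becomes a numcodecs
    # Shuffle codec with elementsize=7.
    codec = numcodecs.Shuffle(elementsize=7)
    assert codec.get_config() == {"id": "shuffle", "elementsize": 7}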
--- virtualizarr/readers/hdf_filters.py | 18 ++++++++++++++---- .../tests/test_readers/test_hdf_filters.py | 11 ++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 5b35d8ff..a60dd56a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -36,6 +36,14 @@ class ZstdProperties(BaseModel): level: int +class ShuffleProperties(BaseModel): + elementsize: int + + +class ZlibProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -56,9 +64,13 @@ def _filter_to_codec( id = _non_standard_filters[id_str] else: id = id_str - conf["id"] = id # type: ignore[assignment] if id == "zlib": - conf["level"] = filter_properties # type: ignore[assignment] + zlib_props = ZlibProperties(level=filter_properties) + conf = zlib_props.model_dump() # type: ignore[assignment] + if id == "shuffle" and isinstance(filter_properties, tuple): + shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) + conf = shuffle_props.model_dump() # type: ignore[assignment] + conf["id"] = id # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name @@ -77,9 +89,7 @@ def _filter_to_codec( if id == "zstd" and isinstance(filter_properties, tuple): zstd_props = ZstdProperties(level=filter_properties[0]) conf = zstd_props.model_dump() # type: ignore[assignment] - conf["id"] = id - codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 99b3af48..efaad781 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -37,6 +37,12 @@ def test_zstd(self): expected_config = {"id": "zstd", "level": 5} assert codec.get_config() == expected_config + def test_shuffle(self): + codec = _filter_to_codec("shuffle", (7,)) + assert isinstance(codec, numcodecs.shuffle.Shuffle) + expected_config = {"id": "shuffle", "elementsize": 7} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): @@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) - assert decoded == np_uncompressed.tobytes() + if isinstance(decoded, np.ndarray): + assert decoded.tobytes() == np_uncompressed.tobytes() + else: + assert decoded == np_uncompressed.tobytes() class TestCFCodecFromDataset: From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 17:03:46 -0600 Subject: [PATCH 53/55] Make ScaleAndOffset codec last in filters list. 
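Note: in the zarr v2 model, filters are applied in list order when encoding and in reverse order when decoding, so inserting the CF codec at index 0 means the compression filter is undone first on read and the scale/offset decode runs last (it is the last codec applied when decoding, even though it sits first in the list). A hedged sketch of that ordering with illustrative codec parameters, not the exact configuration the reader builds:

    import numcodecs
    import numpy as np

    scale_offset = numcodecs.FixedScaleOffset(
        offset=273.15, scale=10, dtype="<f8", astype="<i2"
    )
    zlib = numcodecs.Zlib(level=9)
    filters = [scale_offset, zlib]  # CF codec first, as insert(0, ...) produces

    data = np.array([273.15, 274.25])
    # Encode left-to-right, decode right-to-left (zlib first, scale/offset last).
    encoded = filters[1].encode(filters[0].encode(data))
    decoded = filters[0].decode(filters[1].decode(encoded))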
--- virtualizarr/readers/hdf.py | 2 +- virtualizarr/tests/test_readers/conftest.py | 36 ++++++++++++++++++- .../test_readers/test_hdf_integration.py | 10 ++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d683f693..f3337c04 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab cfcodec = cfcodec_from_dataset(dataset) attrs = _extract_attrs(dataset) if cfcodec: - codecs.append(cfcodec["codec"]) + codecs.insert(0, cfcodec["codec"]) dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 539b2fbb..afc0beea 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) if request.param == "zstd": f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + if request.param == "shuffle": + f.create_dataset(name="data", data=np_uncompressed, shuffle=True) return filepath @@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir): filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} ) return filepath + + +@pytest.fixture(params=["gzip", "zlib"]) +def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): + x = np.arange(100) + y = np.arange(100) + temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + ds = xr.Dataset( + {"temperature": (["x", "y"], temperature)}, + coords={"x": np.arange(100), "y": np.arange(100)}, + ) + encoding = { + "temperature": { + "dtype": "int16", + "scale_factor": 0.1, + "add_offset": 273.15, + } + } + if request.param == "gzip": + encoding["temperature"]["compression"] = "gzip" + encoding["temperature"]["compression_opts"] = 7 + + if request.param == "zlib": + encoding["temperature"]["zlib"] = True + encoding["temperature"]["complevel"] = 9 + + from random import randint + + filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 4fc7bd3e..dd8d6c3b 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,3 +1,4 @@ +import pytest import xarray as xr import xarray.testing as xrt @@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip( vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) + + @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") + def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = 
virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_allclose(ds, roundtrip) From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 13:23:23 -0600 Subject: [PATCH 54/55] Apply ScaleAndOffset codec to _FillValue since its value is now downstream. --- virtualizarr/readers/hdf.py | 4 +++- virtualizarr/tests/test_readers/conftest.py | 7 ++++++- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index f3337c04..6197067f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) + fill_value = cfcodec["codec"].decode(dataset.fillvalue) else: dtype = dataset.dtype + fill_value = dataset.fillvalue filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, dtype=dtype, - fill_value=dataset.fillvalue, + fill_value=fill_value, filters=filters, order="C", shape=dataset.shape, diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index afc0beea..ec4132ba 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir): def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): x = np.arange(100) y = np.arange(100) + fill_value = np.int16(-9999) temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + temperature[0][0] = fill_value ds = xr.Dataset( {"temperature": (["x", "y"], temperature)}, coords={"x": np.arange(100), "y": np.arange(100)}, ) @@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): "dtype": "int16", "scale_factor": 0.1, "add_offset": 273.15, - } + "_FillValue": fill_value, + }, + "x": {"_FillValue": fill_value}, + "y": {"_FillValue": fill_value}, } if request.param == "gzip": encoding["temperature"]["compression"] = "gzip" encoding["temperature"]["compression_opts"] = 7 diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dd8d6c3b..5cf3f79d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip( roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_allclose(ds, roundtrip) From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 15:12:04 -0600 Subject: [PATCH 55/55] Coerce scale and add_offset values to native float for JSON serialization.
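Note: numpy scalar attribute values (for example a float32 scale_factor) are not natively JSON serializable, which breaks writing the kerchunk references, so the codec mapping values are coerced to built-in floats. A tiny sketch of the failure mode with the stdlib encoder, assuming a float32 attribute:

    import json

    import numpy as np

    scale_factor = np.float32(0.1)
    try:
        json.dumps({"scale_factor": scale_factor})  # TypeError: float32 is not JSON serializable
    except TypeError:
        pass
    json.dumps({"scale_factor": float(1 / scale_factor)})  # plain float serializes fine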
--- virtualizarr/readers/hdf_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a60dd56a..ae232fec 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: scale_factor = attributes["scale_factor"][0] except IndexError: scale_factor = attributes["scale_factor"] - mapping["scale_factor"] = 1 / scale_factor + mapping["scale_factor"] = float(1 / scale_factor) else: mapping["scale_factor"] = 1 if "add_offset" in attributes: @@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: offset = attributes["add_offset"][0] except IndexError: offset = attributes["add_offset"] - mapping["add_offset"] = offset + mapping["add_offset"] = float(offset) else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: