From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 13:25:28 -0600
Subject: [PATCH 01/79] Generate chunk manifest backed variable from HDF5
 dataset.

---
 pyproject.toml                              |   1 +
 virtualizarr/readers/hdf.py                 | 135 ++++++++++++++++++++
 virtualizarr/tests/test_readers/__init__.py |   0
 virtualizarr/tests/test_readers/conftest.py |  91 +++++++++++++
 virtualizarr/tests/test_readers/test_hdf.py |  71 ++++++++++
 5 files changed, 298 insertions(+)
 create mode 100644 virtualizarr/readers/hdf.py
 create mode 100644 virtualizarr/tests/test_readers/__init__.py
 create mode 100644 virtualizarr/tests/test_readers/conftest.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf.py

diff --git a/pyproject.toml b/pyproject.toml
index c7505bca..7994c929 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "kerchunk==0.2.2",
     "pydantic",
     "packaging",
+    "h5netcdf",
 ]
 
 [project.optional-dependencies]
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
new file mode 100644
index 00000000..a34ae341
--- /dev/null
+++ b/virtualizarr/readers/hdf.py
@@ -0,0 +1,135 @@
+from typing import List
+
+import h5py
+import xarray as xr
+
+from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.zarr import ZArray
+
+
+def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
+    """
+    Generate ChunkManifest for HDF5 dataset.
+
+    Parameters
+    ----------
+    path: str
+        The path the HDF5 container file
+     dset : h5py.Dataset
+        HDF5 dataset for which to create a ChunkManifest
+
+    Returns
+    -------
+    ChunkManifest
+        A Virtualizarr ChunkManifest
+    """
+    dsid = dataset.id
+
+    if dataset.chunks is None:
+        if dsid.get_offset() is None:
+            raise ValueError("Dataset has no space allocated in the file")
+        else:
+            key_list = [0] * (len(dataset.shape) or 1)
+            key = ".".join(map(str, key_list))
+            chunk_entry = ChunkEntry(
+                path=path,
+                offset=dsid.get_offset(),
+                length=dsid.get_storage_size()
+            )
+            chunk_entries = {key: chunk_entry}
+            chunk_manifest = ChunkManifest(
+                entries=chunk_entries
+            )
+            return chunk_manifest
+    else:
+        num_chunks = dsid.get_num_chunks()
+        if num_chunks == 0:
+            raise ValueError("The dataset is chunked but contains no chunks")
+
+        chunk_entries = dict()
+
+        def get_key(blob):
+            key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+            key = ".".join(map(str, key_list))
+            return key
+
+        def store_chunk_entry(blob):
+            chunk_entries[get_key(blob)] = ChunkEntry(
+                path=path,
+                offset=blob.byte_offset,
+                length=blob.size
+            )
+
+        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+        if has_chunk_iter:
+            dsid.chunk_iter(store_chunk_entry)
+        else:
+            for index in range(num_chunks):
+                store_chunk_entry(dsid.get_chunk_info(index))
+
+        chunk_manifest = ChunkManifest(
+            entries=chunk_entries
+        )
+        return chunk_manifest
+
+def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
+    """
+    Get a list of dimension scale names attached to input HDF5 dataset.
+
+    This is required by the xarray package to work with Zarr arrays. Only
+    one dimension scale per dataset dimension is allowed. If dataset is
+    dimension scale, it will be considered as the dimension to itself.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        HDF5 dataset.
+
+    Returns
+    -------
+    list
+        List with HDF5 path names of dimension scales attached to input
+        dataset.
+    """
+    dims = list()
+    rank = len(dataset.shape)
+    if rank:
+        for n in range(rank):
+            num_scales = len(dataset.dims[n])
+            if num_scales == 1:
+                dims.append(dataset.dims[n][0].name[1:])
+            elif h5py.h5ds.is_scale(dataset.id):
+                dims.append(dataset.name[1:])
+            elif num_scales > 1:
+                raise ValueError(
+                    f"{dataset.name}: {len(dataset.dims[n])} "
+                    f"dimension scales attached to dimension #{n}"
+                )
+            elif num_scales == 0:
+                # Some HDF5 files do not have dimension scales.
+                # If this is the case, `num_scales` will be 0.
+                # In this case, we mimic netCDF4 and assign phony dimension names.
+                # See https://github.com/fsspec/kerchunk/issues/41
+                dims.append(f"phony_dim_{n}")
+        return dims
+
+
+def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
+    # This chunk determination logic mirrors zarr-python's create
+    # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
+    chunks = dataset.chunks if dataset.chunks else dataset.shape
+    zarray = ZArray(
+        chunks=chunks,
+        compressor=dataset.compression,
+        dtype=dataset.dtype,
+        fill_value=dataset.fillvalue,
+        filters=None,
+        order="C",
+        shape=dataset.shape,
+        zarr_format=2,
+    )
+    manifest = _dataset_chunk_manifest(path, dataset)
+    marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
+    dims = _dataset_dims(dataset)
+    variable = xr.Variable(data=marray, dims=dims)
+    return variable
diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
new file mode 100644
index 00000000..b4504839
--- /dev/null
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -0,0 +1,91 @@
+import h5py
+import numpy as np
+import pytest
+import xarray as xr
+
+
+@pytest.fixture
+def empty_chunks_netcdf4_file(tmpdir):
+    ds = xr.Dataset({"data": []})
+    filepath = f"{tmpdir}/empty_chunks.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf")
+    return filepath
+
+
+@pytest.fixture
+def empty_dataset_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/empty_dataset.nc"
+    f = h5py.File(filepath, "w")
+    f.create_dataset("data", shape=(0,), dtype="f")
+    return filepath
+
+
+@pytest.fixture
+def no_chunks_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/no_chunks.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    return filepath
+
+
+@pytest.fixture
+def chunked_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/chunks.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((100, 100))
+    f.create_dataset(name="data", data=data, chunks=(50, 50))
+    return filepath
+
+
+@pytest.fixture
+def single_dimension_scale_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/single_dimension_scale.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    x = [0, 1]
+    f.create_dataset(name="data", data=data)
+    f.create_dataset(name="x", data=x)
+    f["x"].make_scale()
+    f["data"].dims[0].attach_scale(f["x"])
+    return filepath
+
+
+@pytest.fixture
+def is_scale_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/is_scale.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    f.create_dataset(name="data", data=data)
+    f["data"].make_scale()
+    return filepath
+
+
+@pytest.fixture
+def multiple_dimension_scales_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/multiple_dimension_scales.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    f.create_dataset(name="data", data=data)
+    f.create_dataset(name="x", data=[0, 1])
+    f.create_dataset(name="y", data=[0, 1])
+    f["x"].make_scale()
+    f["y"].make_scale()
+    f["data"].dims[0].attach_scale(f["x"])
+    f["data"].dims[0].attach_scale(f["y"])
+    return filepath
+
+
+@pytest.fixture
+def chunked_dimensions_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/chunks_dimension.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((100, 100))
+    x = np.random.random((100, 100))
+    y = np.random.random((100, 100))
+    f.create_dataset(name="data", data=data, chunks=(50, 50))
+    f.create_dataset(name="x", data=x, chunks=(50, 50))
+    f.create_dataset(name="y", data=y, chunks=(50, 50))
+    f["data"].dims[0].attach_scale(f["x"])
+    f["data"].dims[1].attach_scale(f["y"])
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
new file mode 100644
index 00000000..b6b78c11
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -0,0 +1,71 @@
+import h5py
+import pytest
+
+from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
+                                      _dataset_to_variable)
+
+
+class TestDatasetChunkManifest:
+    def test_empty_chunks(self, empty_chunks_netcdf4_file):
+        f = h5py.File(empty_chunks_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="chunked but contains no chunks"):
+            _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
+
+    def test_empty_dataset(self, empty_dataset_netcdf4_file):
+        f = h5py.File(empty_dataset_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="no space allocated in the file"):
+            _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds)
+
+    def test_no_chunking(self, no_chunks_netcdf4_file):
+        f = h5py.File(no_chunks_netcdf4_file)
+        ds = f["data"]
+        manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
+        assert len(manifest.entries) == 1
+
+    def test_chunked(self, chunked_netcdf4_file):
+        f = h5py.File(chunked_netcdf4_file)
+        ds = f["data"]
+        manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
+        assert len(manifest.entries) == 4
+
+
+class TestDatasetDims:
+    def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file):
+        f = h5py.File(single_dimension_scale_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims[0] == "x"
+
+    def test_is_dimension_scale(self, is_scale_netcdf4_file):
+        f = h5py.File(is_scale_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims[0] == "data"
+
+    def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file):
+        f = h5py.File(multiple_dimension_scales_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="dimension scales attached"):
+            _dataset_dims(ds)
+
+    def test_no_dimension_scales(self, no_chunks_netcdf4_file):
+        f = h5py.File(no_chunks_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims == ["phony_dim_0", "phony_dim_1"]
+
+
+class TestDatasetToVariable:
+    def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
+        f = h5py.File(chunked_dimensions_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds)
+        assert var.chunks == (50, 50)
+
+    def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
+        f = h5py.File(single_dimension_scale_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
+        assert var.chunks == (2,)

From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 14:20:38 -0600
Subject: [PATCH 02/79] Transfer dataset attrs to variable.

---
 virtualizarr/readers/hdf.py                 | 50 ++++++++++++++++++++-
 virtualizarr/tests/test_readers/conftest.py | 10 +++++
 virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++-
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index a34ae341..d6518a30 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,6 +1,7 @@
 from typing import List
 
 import h5py
+import numpy as np
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
@@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
         return dims
 
 
+def _extract_attrs(dataset: h5py.Dataset):
+    """
+    Extract attributes from an HDF5 dataset.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        An HDF5 dataset.
+    """
+    _HIDDEN_ATTRS = {
+        "REFERENCE_LIST",
+        "CLASS",
+        "DIMENSION_LIST",
+        "NAME",
+        "_Netcdf4Dimid",
+        "_Netcdf4Coordinates",
+        "_nc3_strict",
+        "_NCProperties",
+    }
+    attrs = {}
+    for n, v in dataset.attrs.items():
+        if n in _HIDDEN_ATTRS:
+            continue
+        # Fix some attribute values to avoid JSON encoding exceptions...
+        if isinstance(v, bytes):
+            v = v.decode("utf-8") or " "
+        elif isinstance(v, (np.ndarray, np.number, np.bool_)):
+            if v.dtype.kind == "S":
+                v = v.astype(str)
+            if n == "_FillValue":
+                continue
+            elif v.size == 1:
+                v = v.flatten()[0]
+                if isinstance(v, (np.ndarray, np.number, np.bool_)):
+                    v = v.tolist()
+            else:
+                v = v.tolist()
+        elif isinstance(v, h5py._hl.base.Empty):
+            v = ""
+        if v == "DIMENSION_SCALE":
+            continue
+
+        attrs[n] = v
+        return attrs
+
+
 def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
@@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     manifest = _dataset_chunk_manifest(path, dataset)
     marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
     dims = _dataset_dims(dataset)
-    variable = xr.Variable(data=marray, dims=dims)
+    attrs = _extract_attrs(dataset)
+    variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index b4504839..2c40fe17 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir):
     f["data"].dims[0].attach_scale(f["x"])
     f["data"].dims[1].attach_scale(f["y"])
     return filepath
+
+
+@pytest.fixture
+def string_attribute_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/attributes.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f["data"].attrs["attribute_name"] = "attribute_name"
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index b6b78c11..495b7de0 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -2,7 +2,7 @@
 import pytest
 
 from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable)
+                                      _dataset_to_variable, _extract_attrs)
 
 
 class TestDatasetChunkManifest:
@@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
         ds = f["data"]
         var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
         assert var.chunks == (2,)
+
+    def test_dataset_attributes(self, string_attribute_netcdf4_file):
+        f = h5py.File(string_attribute_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(string_attribute_netcdf4_file, ds)
+        assert var.attrs["attribute_name"] == "attribute_name"
+
+
+class TestExtractAttributes:
+    def test_string_attribute(self, string_attribute_netcdf4_file):
+        f = h5py.File(string_attribute_netcdf4_file)
+        ds = f["data"]
+        attrs = _extract_attrs(ds)
+        assert attrs["attribute_name"] == "attribute_name"

From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 15:26:58 -0600
Subject: [PATCH 03/79] Get virtual variables dict from HDF5 file.

---
 virtualizarr/readers/hdf.py                 | 14 +++++++++++++-
 virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++----
 virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++-
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d6518a30..9c3ebf44 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Mapping, List
 
 import h5py
 import numpy as np
@@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     attrs = _extract_attrs(dataset)
     variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
+
+
+def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]:
+    variables = {}
+    for key in f.keys():
+        if isinstance(f[key], h5py.Dataset):
+            variable = _dataset_to_variable(path, f[key])
+            variables[key] = variable
+        else:
+            raise NotImplementedError("Nested groups are not yet supported")
+
+    return variables
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 2c40fe17..735e922a 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/chunks_dimension.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((100, 100))
-    x = np.random.random((100, 100))
-    y = np.random.random((100, 100))
+    x = np.random.random((100))
+    y = np.random.random((100))
     f.create_dataset(name="data", data=data, chunks=(50, 50))
-    f.create_dataset(name="x", data=x, chunks=(50, 50))
-    f.create_dataset(name="y", data=y, chunks=(50, 50))
+    f.create_dataset(name="x", data=x)
+    f.create_dataset(name="y", data=y)
     f["data"].dims[0].attach_scale(f["x"])
     f["data"].dims[1].attach_scale(f["y"])
     return filepath
@@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir):
     f.create_dataset(name="data", data=data, chunks=None)
     f["data"].attrs["attribute_name"] = "attribute_name"
     return filepath
+
+
+@pytest.fixture
+def group_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/group.nc"
+    f = h5py.File(filepath, "w")
+    f.create_group("group")
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 495b7de0..da331ed9 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -2,7 +2,8 @@
 import pytest
 
 from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable, _extract_attrs)
+                                      _dataset_to_variable, _extract_attrs,
+                                      virtual_vars_from_hdf)
 
 
 class TestDatasetChunkManifest:
@@ -83,3 +84,15 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
+
+
+class TestVirtualVarsFromHDF:
+    def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
+        f = h5py.File(chunked_dimensions_netcdf4_file)
+        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f)
+        assert len(variables) == 3
+
+    def test_groups_not_implemented(self, group_netcdf4_file):
+        f = h5py.File(group_netcdf4_file)
+        with pytest.raises(NotImplementedError):
+            virtual_vars_from_hdf(group_netcdf4_file, f)

From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 12:33:24 -0600
Subject: [PATCH 04/79] Update virtual_vars_from_hdf to use fsspec and
 drop_variables arg.

---
 pyproject.toml                              |  2 +-
 virtualizarr/readers/hdf.py                 | 25 +++++++++++++++------
 virtualizarr/tests/test_readers/conftest.py | 10 +++++++++
 virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++----
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7994c929..d08621e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,6 @@ dependencies = [
     "kerchunk==0.2.2",
     "pydantic",
     "packaging",
-    "h5netcdf",
 ]
 
 [project.optional-dependencies]
@@ -35,6 +34,7 @@ test = [
     "pytest",
     "scipy",
     "pooch",
+    "h5netcdf",
 ]
 
 
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 9c3ebf44..c4ab2927 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,5 +1,6 @@
-from typing import Mapping, List
+from typing import List, Mapping, Optional
 
+import fsspec
 import h5py
 import numpy as np
 import xarray as xr
@@ -73,6 +74,7 @@ def store_chunk_entry(blob):
         )
         return chunk_manifest
 
+
 def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
     """
     Get a list of dimension scale names attached to input HDF5 dataset.
@@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     return variable
 
 
-def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]:
+def virtual_vars_from_hdf(
+    path: str,
+    drop_variables: Optional[List[str]] = None,
+) -> Mapping[str, xr.Variable]:
+    if drop_variables is None:
+        drop_variables = []
+    fs, file_path = fsspec.core.url_to_fs(path)
+    open_file = fs.open(path, "rb")
+    f = h5py.File(open_file, mode="r")
     variables = {}
     for key in f.keys():
-        if isinstance(f[key], h5py.Dataset):
-            variable = _dataset_to_variable(path, f[key])
-            variables[key] = variable
-        else:
-            raise NotImplementedError("Nested groups are not yet supported")
+        if key not in drop_variables:
+            if isinstance(f[key], h5py.Dataset):
+                variable = _dataset_to_variable(path, f[key])
+                variables[key] = variable
+            else:
+                raise NotImplementedError("Nested groups are not yet supported")
 
     return variables
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 735e922a..aa2b0fe0 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir):
     f = h5py.File(filepath, "w")
     f.create_group("group")
     return filepath
+
+
+@pytest.fixture
+def multiple_datasets_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/multiple_datasets.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f.create_dataset(name="data2", data=data, chunks=None)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index da331ed9..36f7bc77 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
-        f = h5py.File(chunked_dimensions_netcdf4_file)
-        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f)
+        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
         assert len(variables) == 3
 
     def test_groups_not_implemented(self, group_netcdf4_file):
-        f = h5py.File(group_netcdf4_file)
         with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_netcdf4_file, f)
+            virtual_vars_from_hdf(group_netcdf4_file)
+
+    def test_drop_variables(self, multiple_datasets_netcdf4_file):
+        variables = virtual_vars_from_hdf(
+            multiple_datasets_netcdf4_file,
+            ["data2"]
+        )
+        assert "data2" not in variables.keys()

From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 13:02:03 -0600
Subject: [PATCH 05/79] mypy fix to use ChunkKey and empty dimensions list.

---
 virtualizarr/readers/hdf.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index c4ab2927..fdb9a77d 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,4 +1,4 @@
-from typing import List, Mapping, Optional
+from typing import List, Mapping, Optional, Union
 
 import fsspec
 import h5py
@@ -8,6 +8,8 @@
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
 from virtualizarr.zarr import ZArray
 
+from virtualizarr.types import ChunkKey
+
 
 def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
     """
@@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
                 offset=dsid.get_offset(),
                 length=dsid.get_storage_size()
             )
-            chunk_entries = {key: chunk_entry}
+            chunk_key = ChunkKey(key)
+            chunk_entries = {chunk_key: chunk_entry}
             chunk_manifest = ChunkManifest(
                 entries=chunk_entries
             )
@@ -75,7 +78,7 @@ def store_chunk_entry(blob):
         return chunk_manifest
 
 
-def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
+def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
     """
     Get a list of dimension scale names attached to input HDF5 dataset.
 
@@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
                 # In this case, we mimic netCDF4 and assign phony dimension names.
                 # See https://github.com/fsspec/kerchunk/issues/41
                 dims.append(f"phony_dim_{n}")
-        return dims
+    return dims
 
 
 def _extract_attrs(dataset: h5py.Dataset):

From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 13:40:52 -0600
Subject: [PATCH 06/79] Extract attributes from hdf5 root group.

---
 virtualizarr/readers/hdf.py                 | 18 +++++++++++++-----
 virtualizarr/tests/test_readers/conftest.py |  8 ++++++++
 virtualizarr/tests/test_readers/test_hdf.py |  5 +++++
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index fdb9a77d..e02d03e7 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
     return dims
 
 
-def _extract_attrs(dataset: h5py.Dataset):
+def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
     """
-    Extract attributes from an HDF5 dataset.
+    Extract attributes from an HDF5 group or dataset.
 
     Parameters
     ----------
-    dataset : h5py.Dataset
-        An HDF5 dataset.
+    h5obj : h5py.Group or h5py.Dataset
+        An HDF5 group or dataset.
     """
     _HIDDEN_ATTRS = {
         "REFERENCE_LIST",
@@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset):
         "_NCProperties",
     }
     attrs = {}
-    for n, v in dataset.attrs.items():
+    for n, v in h5obj.attrs.items():
         if n in _HIDDEN_ATTRS:
             continue
         # Fix some attribute values to avoid JSON encoding exceptions...
@@ -207,3 +207,11 @@ def virtual_vars_from_hdf(
                 raise NotImplementedError("Nested groups are not yet supported")
 
     return variables
+
+
+def attrs_from_root_group(path: str):
+    fs, file_path = fsspec.core.url_to_fs(path)
+    open_file = fs.open(path, "rb")
+    f = h5py.File(open_file, mode="r")
+    attrs = _extract_attrs(f)
+    return attrs
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index aa2b0fe0..46ac7b2e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir):
     return filepath
 
 
+@pytest.fixture
+def root_attributes_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/root_attributes.nc"
+    f = h5py.File(filepath, "w")
+    f.attrs["attribute_name"] = "attribute_name"
+    return filepath
+
+
 @pytest.fixture
 def group_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/group.nc"
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 36f7bc77..a24e36ab 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
 
+    def test_root_attribute(self, root_attributes_netcdf4_file):
+        f = h5py.File(root_attributes_netcdf4_file)
+        attrs = _extract_attrs(f)
+        assert attrs["attribute_name"] == "attribute_name"
+
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):

From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 14:19:17 -0600
Subject: [PATCH 07/79] Use hdf reader for netcdf4 files.

---
 virtualizarr/xarray.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 5c3c8548..415b0a05 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -8,7 +8,8 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
-from virtualizarr.kerchunk import KerchunkStoreRefs, FileType
+from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
+from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 
 
@@ -76,18 +77,28 @@ def open_virtual_dataset(
     if common:
         raise ValueError(f"Cannot both load and drop variables {common}")
 
+    if filetype is None:
+        filetype = _automatically_determine_filetype(filepath)
+    filetype = FileType(filetype)
+    if filetype.name.lower() == "netcdf4":
+        virtual_vars = virtual_vars_from_hdf(
+            path=filepath,
+            drop_variables=drop_variables
+        )
+        ds_attrs = attrs_from_root_group(path=filepath)
     # this is the only place we actually always need to use kerchunk directly
     # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
-    vds_refs = kerchunk.read_kerchunk_references_from_file(
-        filepath=filepath,
-        filetype=filetype,
-    )
-    virtual_vars = virtual_vars_from_kerchunk_refs(
-        vds_refs,
-        drop_variables=drop_variables + loadable_variables,
-        virtual_array_class=virtual_array_class,
-    )
-    ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
+    else:
+        vds_refs = kerchunk.read_kerchunk_references_from_file(
+            filepath=filepath,
+            filetype=filetype,
+        )
+        virtual_vars = virtual_vars_from_kerchunk_refs(
+            vds_refs,
+            drop_variables=drop_variables + loadable_variables,
+            virtual_array_class=virtual_array_class,
+        )
+        ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
 
     if indexes is None or len(loadable_variables) > 0:
         # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...

From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Apr 2024 21:57:39 +0000
Subject: [PATCH 08/79] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 virtualizarr/xarray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 415b0a05..2213ffa9 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -9,7 +9,7 @@
 
 import virtualizarr.kerchunk as kerchunk
 from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
-from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group 
+from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 
 

From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 8 May 2024 18:03:04 -0600
Subject: [PATCH 09/79] Fix ruff complaints.

---
 virtualizarr/readers/hdf.py                 |  3 +--
 virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++---
 virtualizarr/xarray.py                      |  8 ++++++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index e02d03e7..af25c029 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -6,9 +6,8 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.zarr import ZArray
-
 from virtualizarr.types import ChunkKey
+from virtualizarr.zarr import ZArray
 
 
 def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a24e36ab..0d5a16db 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -1,9 +1,13 @@
 import h5py
 import pytest
 
-from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable, _extract_attrs,
-                                      virtual_vars_from_hdf)
+from virtualizarr.readers.hdf import (
+    _dataset_chunk_manifest,
+    _dataset_dims,
+    _dataset_to_variable,
+    _extract_attrs,
+    virtual_vars_from_hdf,
+)
 
 
 class TestDatasetChunkManifest:
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index fbf6136f..9629a344 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -18,9 +18,13 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
-from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
-from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group
+from virtualizarr.kerchunk import (
+    FileType,
+    KerchunkStoreRefs,
+    _automatically_determine_filetype,
+)
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,

From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 10 May 2024 15:10:30 -0600
Subject: [PATCH 10/79] First steps for handling HDF5 filters.

---
 pyproject.toml                                |  1 +
 virtualizarr/readers/hdf.py                   |  7 +-
 virtualizarr/readers/hdf_filters.py           | 34 +++++++++
 virtualizarr/tests/test_readers/conftest.py   | 26 +++++++
 .../tests/test_readers/test_hdf_filters.py    | 31 ++++++++
 .../test_readers/test_hdf_integration.py      | 21 ++++++
 virtualizarr/xarray.py                        | 71 +++++++++----------
 7 files changed, 153 insertions(+), 38 deletions(-)
 create mode 100644 virtualizarr/readers/hdf_filters.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py

diff --git a/pyproject.toml b/pyproject.toml
index 79a50789..4818b5f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "numpy",
     "ujson",
     "packaging",
+    "hdf5plugin",
 ]
 
 [project.optional-dependencies]
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index af25c029..7d95d996 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -6,6 +6,7 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf_filters import codecs_from_dataset
 from virtualizarr.types import ChunkKey
 from virtualizarr.zarr import ZArray
 
@@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
     chunks = dataset.chunks if dataset.chunks else dataset.shape
+    codecs = codecs_from_dataset(dataset)
+    filters = [codec.get_config() for codec in codecs]
     zarray = ZArray(
         chunks=chunks,
-        compressor=dataset.compression,
+        compressor=None,
         dtype=dataset.dtype,
         fill_value=dataset.fillvalue,
-        filters=None,
+        filters=filters,
         order="C",
         shape=dataset.shape,
         zarr_format=2,
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
new file mode 100644
index 00000000..6070fc17
--- /dev/null
+++ b/virtualizarr/readers/hdf_filters.py
@@ -0,0 +1,34 @@
+from typing import List, Tuple, Union
+
+import h5py
+import numcodecs.registry as registry
+from numcodecs.abc import Codec
+
+_non_standard_filters = {
+    "gzip": "zlib"
+}
+
+
+def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+    try:
+        id = int(filter_id)
+    except ValueError:
+        id = filter_id
+
+    if isinstance(id, str):
+        if id in _non_standard_filters.keys():
+            id = _non_standard_filters[id]
+        conf = {"id": id}
+        if id == "zlib":
+            conf["level"] = filter_properties
+
+    codec = registry.get_codec(conf)
+    return codec
+
+
+def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
+    codecs = []
+    for filter_id, filter_properties in dataset._filters.items():
+        codec = _filter_to_codec(filter_id, filter_properties)
+        codecs.append(codec)
+    return codecs
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 46ac7b2e..4f0d4fce 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir):
     f.create_dataset(name="data", data=data, chunks=None)
     f.create_dataset(name="data2", data=data, chunks=None)
     return filepath
+
+
+@pytest.fixture
+def np_uncompressed():
+    return np.arange(100)
+
+
+@pytest.fixture
+def gzip_filter_netcdf4_file(tmpdir, np_uncompressed):
+    filepath = f"{tmpdir}/gzip.nc"
+    f = h5py.File(filepath, "w")
+    f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    return filepath
+
+
+@pytest.fixture
+def gzip_filter_xarray_netcdf4_file(tmpdir):
+    ds = xr.tutorial.open_dataset("air_temperature")
+    encoding = {}
+    for var_name in ds.variables:
+        #  encoding[var_name] = {"zlib": True, "compression_opts": 1}
+        encoding[var_name] = {"compression": "gzip", "compression_opts": 1}
+
+    filepath = f"{tmpdir}/gzip_xarray.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
new file mode 100644
index 00000000..50a5d08c
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -0,0 +1,31 @@
+import h5py
+import numcodecs
+import pytest
+
+from virtualizarr.readers.hdf_filters import (
+    _filter_to_codec,
+    codecs_from_dataset,
+)
+
+
+class TestFilterToCodec:
+    def test_gzip_uses_zlib_nomcodec(self):
+        codec = _filter_to_codec("gzip", 1)
+        assert isinstance(codec, numcodecs.zlib.Zlib)
+
+    def test_lzf_not_available(self):
+        with pytest.raises(ValueError, match="codec not available"):
+            _filter_to_codec("lzf")
+
+
+class TestCodecsFromDataSet:
+    def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file):
+        f = h5py.File(gzip_filter_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        codecs = codecs_from_dataset(ds)
+        with open(gzip_filter_netcdf4_file, 'rb') as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = codecs[0].decode(bytes_read)
+            assert decoded == np_uncompressed.tobytes()
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
new file mode 100644
index 00000000..45bfadcd
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -0,0 +1,21 @@
+import fsspec
+import numpy
+import xarray as xr
+
+import virtualizarr
+from virtualizarr.kerchunk import FileType
+
+
+class TestIntegration:
+    def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file):
+        virtual_ds = virtualizarr.open_virtual_dataset(
+            gzip_filter_xarray_netcdf4_file,
+            filetype=FileType("netcdf4")
+        )
+        kerchunk_file = f"{tmpdir}/gzip_kerchunk.json"
+        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        fs = fsspec.filesystem("reference", fo=kerchunk_file)
+        m = fs.get_mapper("")
+
+        ds = xr.open_dataset(m, engine="kerchunk")
+        assert isinstance(ds.air.values[0][0][0], numpy.float64)
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 9629a344..24ba973a 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -128,48 +128,47 @@ def open_virtual_dataset(
         )
         ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
 
-        if indexes is None or len(loadable_variables) > 0:
-            # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-            # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-            # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-            ds = xr.open_dataset(filepath, drop_variables=drop_variables)
-
-            if indexes is None:
-                # add default indexes by reading data from file
-                indexes = {name: index for name, index in ds.xindexes.items()}
-            elif indexes != {}:
-                # TODO allow manual specification of index objects
-                raise NotImplementedError()
-            else:
-                indexes = dict(**indexes)  # for type hinting: to allow mutation
-
-            loadable_vars = {
-                name: var
-                for name, var in ds.variables.items()
-                if name in loadable_variables
-            }
-
-            # if we only read the indexes we can just close the file right away as nothing is lazy
-            if loadable_vars == {}:
-                ds.close()
+    if indexes is None or len(loadable_variables) > 0:
+        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+        ds = xr.open_dataset(filepath, drop_variables=drop_variables)
+
+        if indexes is None:
+            # add default indexes by reading data from file
+            indexes = {name: index for name, index in ds.xindexes.items()}
+        elif indexes != {}:
+            # TODO allow manual specification of index objects
+            raise NotImplementedError()
         else:
-            loadable_vars = {}
-            indexes = {}
+            indexes = dict(**indexes)  # for type hinting: to allow mutation
 
-        vars = {**virtual_vars, **loadable_vars}
+        loadable_vars = {
+            name: var
+            for name, var in ds.variables.items()
+            if name in loadable_variables
+        }
 
-        data_vars, coords = separate_coords(vars, indexes)
+        # if we only read the indexes we can just close the file right away as nothing is lazy
+        if loadable_vars == {}:
+            ds.close()
+    else:
+        loadable_vars = {}
+        indexes = {}
 
-        vds = xr.Dataset(
-            data_vars,
-            coords=coords,
-            # indexes={},  # TODO should be added in a later version of xarray
-            attrs=ds_attrs,
-        )
+    vars = {**virtual_vars, **loadable_vars}
+
+    data_vars, coords = separate_coords(vars, indexes)
+    vds = xr.Dataset(
+        data_vars,
+        coords=coords,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=ds_attrs,
+    )
 
-        # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
 
-        return vds
+    return vds
 
 
 def open_virtual_dataset_from_v3_store(

From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 13 May 2024 12:36:29 -0600
Subject: [PATCH 11/79] Initial step for hdf5plugin supported codecs.

---
 virtualizarr/readers/hdf_filters.py           | 25 +++++++++++++++
 virtualizarr/tests/test_readers/conftest.py   | 31 +++++++++++++------
 .../tests/test_readers/test_hdf_filters.py    | 20 +++++++++---
 .../test_readers/test_hdf_integration.py      |  7 +++--
 4 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 6070fc17..75f06bdc 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,14 +1,30 @@
 from typing import List, Tuple, Union
 
 import h5py
+import hdf5plugin
 import numcodecs.registry as registry
 from numcodecs.abc import Codec
+from pydantic import BaseModel, validator
 
 _non_standard_filters = {
     "gzip": "zlib"
 }
 
 
+class BloscProperties(BaseModel):
+    blocksize: int
+    clevel: int
+    shuffle: int
+    cname: str
+
+    @validator("cname", pre=True)
+    def get_cname_from_code(cls, v):
+        blosc_compressor_codes = {
+            value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
+        }
+        return blosc_compressor_codes[v]
+
+
 def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
     try:
         id = int(filter_id)
@@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None
         conf = {"id": id}
         if id == "zlib":
             conf["level"] = filter_properties
+    elif isinstance(id, int):
+        filter = hdf5plugin.get_filters(id)[0]
+        id = filter.filter_name
+        if id == "blosc":
+            blosc_props = BloscProperties(**{k: v for k, v in
+                                             zip(BloscProperties.__fields__.keys(),
+                                                 filter_properties[-4:])})
+            conf = blosc_props.model_dump()
+            conf["id"] = id
 
     codec = registry.get_codec(conf)
     return codec
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 4f0d4fce..cc9331e1 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -1,4 +1,5 @@
 import h5py
+import hdf5plugin
 import numpy as np
 import pytest
 import xarray as xr
@@ -132,22 +133,32 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture
-def gzip_filter_netcdf4_file(tmpdir, np_uncompressed):
-    filepath = f"{tmpdir}/gzip.nc"
+@pytest.fixture(params=["gzip", "blosc"])
+def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
+    filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
-    f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    if request.param == "gzip":
+        f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    if request.param == "blosc":
+        f.create_dataset(name="data", data=np_uncompressed,
+                         **hdf5plugin.Blosc(
+                             cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE
+                         ))
     return filepath
 
 
-@pytest.fixture
-def gzip_filter_xarray_netcdf4_file(tmpdir):
+@pytest.fixture(params=["gzip"])
+def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
+    if request.param == "gzip":
+        encoding_config = {
+            "zlib": True,
+            "complevel": 1
+        }
     for var_name in ds.variables:
-        #  encoding[var_name] = {"zlib": True, "compression_opts": 1}
-        encoding[var_name] = {"compression": "gzip", "compression_opts": 1}
+        encoding[var_name] = encoding_config
 
-    filepath = f"{tmpdir}/gzip_xarray.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 50a5d08c..8094d4cf 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -9,7 +9,7 @@
 
 
 class TestFilterToCodec:
-    def test_gzip_uses_zlib_nomcodec(self):
+    def test_gzip_uses_zlib_numcodec(self):
         codec = _filter_to_codec("gzip", 1)
         assert isinstance(codec, numcodecs.zlib.Zlib)
 
@@ -17,14 +17,26 @@ def test_lzf_not_available(self):
         with pytest.raises(ValueError, match="codec not available"):
             _filter_to_codec("lzf")
 
+    def test_blosc(self):
+        codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1))
+        assert isinstance(codec, numcodecs.blosc.Blosc)
+        expected_config = {
+            "id": "blosc",
+            "blocksize": 800,
+            "clevel": 9,
+            "shuffle": 2,
+            "cname": "lz4",
+        }
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
-    def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file):
-        f = h5py.File(gzip_filter_netcdf4_file)
+    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
+        f = h5py.File(filter_encoded_netcdf4_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(gzip_filter_netcdf4_file, 'rb') as file:
+        with open(filter_encoded_netcdf4_file, 'rb') as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 45bfadcd..94fc0c1c 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,12 +7,13 @@
 
 
 class TestIntegration:
-    def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file):
+    def test_filters_end_to_end(self, tmpdir,
+                                    filter_encoded_xarray_netcdf4_files):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            gzip_filter_xarray_netcdf4_file,
+            filter_encoded_xarray_netcdf4_files,
             filetype=FileType("netcdf4")
         )
-        kerchunk_file = f"{tmpdir}/gzip_kerchunk.json"
+        kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
         fs = fsspec.filesystem("reference", fo=kerchunk_file)
         m = fs.get_mapper("")

From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 16 May 2024 16:24:57 -0400
Subject: [PATCH 12/79] Small commit to check compression support in CI
 environment.

---
 pyproject.toml                              | 1 +
 virtualizarr/tests/test_readers/conftest.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4818b5f1..bba695eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ test = [
     "scipy",
     "pooch",
     "ruff",
+    "netcdf4",
 ]
 
 
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index cc9331e1..8dc82c33 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip"])
+@pytest.fixture(params=["gzip", "blosc_lz"])
 def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
@@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
             "zlib": True,
             "complevel": 1
         }
+    if request.param == "blosc_lz":
+        encoding_config = {
+            "compression": "blosc_lz",
+        }
+
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
     return filepath

From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 19 May 2024 21:34:26 +0000
Subject: [PATCH 13/79] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 virtualizarr/readers/hdf.py                   | 16 ++++----------
 virtualizarr/readers/hdf_filters.py           | 22 ++++++++++++-------
 virtualizarr/tests/test_readers/conftest.py   | 18 +++++++--------
 virtualizarr/tests/test_readers/test_hdf.py   |  5 +----
 .../tests/test_readers/test_hdf_filters.py    |  2 +-
 .../test_readers/test_hdf_integration.py      |  6 ++---
 virtualizarr/xarray.py                        |  5 ++---
 7 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 7d95d996..78e718e4 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
             key_list = [0] * (len(dataset.shape) or 1)
             key = ".".join(map(str, key_list))
             chunk_entry = ChunkEntry(
-                path=path,
-                offset=dsid.get_offset(),
-                length=dsid.get_storage_size()
+                path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
             )
             chunk_key = ChunkKey(key)
             chunk_entries = {chunk_key: chunk_entry}
-            chunk_manifest = ChunkManifest(
-                entries=chunk_entries
-            )
+            chunk_manifest = ChunkManifest(entries=chunk_entries)
             return chunk_manifest
     else:
         num_chunks = dsid.get_num_chunks()
@@ -60,9 +56,7 @@ def get_key(blob):
 
         def store_chunk_entry(blob):
             chunk_entries[get_key(blob)] = ChunkEntry(
-                path=path,
-                offset=blob.byte_offset,
-                length=blob.size
+                path=path, offset=blob.byte_offset, length=blob.size
             )
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
@@ -72,9 +66,7 @@ def store_chunk_entry(blob):
             for index in range(num_chunks):
                 store_chunk_entry(dsid.get_chunk_info(index))
 
-        chunk_manifest = ChunkManifest(
-            entries=chunk_entries
-        )
+        chunk_manifest = ChunkManifest(entries=chunk_entries)
         return chunk_manifest
 
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 75f06bdc..77e7037e 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -6,9 +6,7 @@
 from numcodecs.abc import Codec
 from pydantic import BaseModel, validator
 
-_non_standard_filters = {
-    "gzip": "zlib"
-}
+_non_standard_filters = {"gzip": "zlib"}
 
 
 class BloscProperties(BaseModel):
@@ -20,12 +18,15 @@ class BloscProperties(BaseModel):
     @validator("cname", pre=True)
     def get_cname_from_code(cls, v):
         blosc_compressor_codes = {
-            value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
+            value: key
+            for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
         }
         return blosc_compressor_codes[v]
 
 
-def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+def _filter_to_codec(
+    filter_id: str, filter_properties: Union[int, Tuple] = None
+) -> Codec:
     try:
         id = int(filter_id)
     except ValueError:
@@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None
         filter = hdf5plugin.get_filters(id)[0]
         id = filter.filter_name
         if id == "blosc":
-            blosc_props = BloscProperties(**{k: v for k, v in
-                                             zip(BloscProperties.__fields__.keys(),
-                                                 filter_properties[-4:])})
+            blosc_props = BloscProperties(
+                **{
+                    k: v
+                    for k, v in zip(
+                        BloscProperties.__fields__.keys(), filter_properties[-4:]
+                    )
+                }
+            )
             conf = blosc_props.model_dump()
             conf["id"] = id
 
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index aa66f933..53c9630e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
     if request.param == "gzip":
-        f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+        f.create_dataset(
+            name="data", data=np_uncompressed, compression="gzip", compression_opts=1
+        )
     if request.param == "blosc":
-        f.create_dataset(name="data", data=np_uncompressed,
-                         **hdf5plugin.Blosc(
-                             cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE
-                         ))
+        f.create_dataset(
+            name="data",
+            data=np_uncompressed,
+            **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
+        )
     return filepath
 
 
@@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
-        encoding_config = {
-            "zlib": True,
-            "complevel": 1
-        }
+        encoding_config = {"zlib": True, "complevel": 1}
 
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 0d5a16db..a83bfc39 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file):
             virtual_vars_from_hdf(group_netcdf4_file)
 
     def test_drop_variables(self, multiple_datasets_netcdf4_file):
-        variables = virtual_vars_from_hdf(
-            multiple_datasets_netcdf4_file,
-            ["data2"]
-        )
+        variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"])
         assert "data2" not in variables.keys()
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 8094d4cf..28b5d69f 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -36,7 +36,7 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_netcdf4_file, 'rb') as file:
+        with open(filter_encoded_netcdf4_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 94fc0c1c..b31289c0 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,11 +7,9 @@
 
 
 class TestIntegration:
-    def test_filters_end_to_end(self, tmpdir,
-                                    filter_encoded_xarray_netcdf4_files):
+    def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_files,
-            filetype=FileType("netcdf4")
+            filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 72645654..d8b6a080 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -20,8 +20,8 @@
     _automatically_determine_filetype,
 )
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,
@@ -109,8 +109,7 @@ def open_virtual_dataset(
     if filetype.name.lower() == "netcdf4":
         print("wat")
         virtual_vars = virtual_vars_from_hdf(
-            path=filepath,
-            drop_variables=drop_variables
+            path=filepath, drop_variables=drop_variables
         )
         ds_attrs = attrs_from_root_group(path=filepath)
     if filetype == "zarr_v3":

From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 19 May 2024 16:12:50 -0600
Subject: [PATCH 14/79] Fix mypy complaints for hdf_filters.

---
 virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 75f06bdc..7a8bcc81 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Union
+from typing import List, Optional, Tuple, TypedDict, Union
 
 import h5py
 import hdf5plugin
@@ -25,26 +25,30 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
-def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec:
+    id_int = None
+    id_str = None
     try:
-        id = int(filter_id)
+        id_int = int(filter_id)
     except ValueError:
-        id = filter_id
+        id_str = filter_id
 
-    if isinstance(id, str):
-        if id in _non_standard_filters.keys():
-            id = _non_standard_filters[id]
+    if id_str:
+        if id_str in _non_standard_filters.keys():
+            id = _non_standard_filters[id_str]
+        else:
+            id = id_str
         conf = {"id": id}
         if id == "zlib":
-            conf["level"] = filter_properties
-    elif isinstance(id, int):
-        filter = hdf5plugin.get_filters(id)[0]
+            conf["level"] = filter_properties # type: ignore[assignment]
+    if id_int:
+        filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
-        if id == "blosc":
+        if id == "blosc" and isinstance(filter_properties, tuple):
             blosc_props = BloscProperties(**{k: v for k, v in
                                              zip(BloscProperties.__fields__.keys(),
                                                  filter_properties[-4:])})
-            conf = blosc_props.model_dump()
+            conf = blosc_props.model_dump() # type: ignore[assignment]
             conf["id"] = id
 
     codec = registry.get_codec(conf)

From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 19 May 2024 16:22:39 -0600
Subject: [PATCH 15/79] Local pre-commit fix for hdf_filters.

---
 virtualizarr/readers/hdf_filters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index a3868ebd..dfe1c1f3 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, TypedDict, Union
+from typing import List, Tuple, Union
 
 import h5py
 import hdf5plugin

From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 20 May 2024 20:13:44 -0600
Subject: [PATCH 16/79] Use fsspec reader_options introduced in #37.

---
 virtualizarr/readers/hdf.py | 22 ++++++++++++++++------
 virtualizarr/xarray.py      |  7 ++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 78e718e4..19d99b3f 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,6 +1,5 @@
 from typing import List, Mapping, Optional, Union
 
-import fsspec
 import h5py
 import numpy as np
 import xarray as xr
@@ -8,6 +7,7 @@
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
 from virtualizarr.readers.hdf_filters import codecs_from_dataset
 from virtualizarr.types import ChunkKey
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import ZArray
 
 
@@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
 def virtual_vars_from_hdf(
     path: str,
     drop_variables: Optional[List[str]] = None,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
 ) -> Mapping[str, xr.Variable]:
     if drop_variables is None:
         drop_variables = []
-    fs, file_path = fsspec.core.url_to_fs(path)
-    open_file = fs.open(path, "rb")
+    open_file = _fsspec_openfile_from_filepath(
+        filepath=path, reader_options=reader_options
+    )
     f = h5py.File(open_file, mode="r")
     variables = {}
     for key in f.keys():
@@ -203,9 +207,15 @@ def virtual_vars_from_hdf(
     return variables
 
 
-def attrs_from_root_group(path: str):
-    fs, file_path = fsspec.core.url_to_fs(path)
-    open_file = fs.open(path, "rb")
+def attrs_from_root_group(
+    path: str,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
+):
+    open_file = _fsspec_openfile_from_filepath(
+        filepath=path, reader_options=reader_options
+    )
     f = h5py.File(open_file, mode="r")
     attrs = _extract_attrs(f)
     return attrs
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index d8b6a080..8f810ee1 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -107,11 +107,12 @@ def open_virtual_dataset(
     filetype = FileType(filetype)
 
     if filetype.name.lower() == "netcdf4":
-        print("wat")
         virtual_vars = virtual_vars_from_hdf(
-            path=filepath, drop_variables=drop_variables
+            path=filepath,
+            drop_variables=drop_variables,
+            reader_options=reader_options,
         )
-        ds_attrs = attrs_from_root_group(path=filepath)
+        ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options)
     if filetype == "zarr_v3":
         # TODO is there a neat way of auto-detecting this?
         return open_virtual_dataset_from_v3_store(

From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 20 May 2024 20:40:38 -0600
Subject: [PATCH 17/79] Fix incorrect zarr_v3 if block position from merge
 commit ef0d7a8.

---
 virtualizarr/xarray.py | 128 +++++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 8f810ee1..d76e2a67 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -101,82 +101,86 @@ def open_virtual_dataset(
 
     if virtual_array_class is not ManifestArray:
         raise NotImplementedError()
-
-    if filetype is None:
-        filetype = _automatically_determine_filetype(filepath=filepath)
-    filetype = FileType(filetype)
-
-    if filetype.name.lower() == "netcdf4":
-        virtual_vars = virtual_vars_from_hdf(
-            path=filepath,
-            drop_variables=drop_variables,
-            reader_options=reader_options,
-        )
-        ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options)
     if filetype == "zarr_v3":
         # TODO is there a neat way of auto-detecting this?
         return open_virtual_dataset_from_v3_store(
             storepath=filepath, drop_variables=drop_variables, indexes=indexes
         )
     else:
-        # this is the only place we actually always need to use kerchunk directly
-        # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
-        vds_refs = kerchunk.read_kerchunk_references_from_file(
-            filepath=filepath,
-            filetype=filetype,
-        )
-        virtual_vars = virtual_vars_from_kerchunk_refs(
-            vds_refs,
-            drop_variables=drop_variables + loadable_variables,
-            virtual_array_class=virtual_array_class,
-        )
-        ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
-
-    if indexes is None or len(loadable_variables) > 0:
-        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-        fpath = _fsspec_openfile_from_filepath(
-            filepath=filepath, reader_options=reader_options
-        )
+        if filetype is None:
+            filetype = _automatically_determine_filetype(filepath=filepath)
+        filetype = FileType(filetype)
+
+        if filetype.name.lower() == "netcdf4":
+            virtual_vars = virtual_vars_from_hdf(
+                path=filepath,
+                drop_variables=drop_variables,
+                reader_options=reader_options,
+            )
+            ds_attrs = attrs_from_root_group(
+                path=filepath, reader_options=reader_options
+            )
+        else:
+            # this is the only place we actually always need to use kerchunk directly
+            # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
+            vds_refs = kerchunk.read_kerchunk_references_from_file(
+                filepath=filepath,
+                filetype=filetype,
+            )
+            virtual_vars = virtual_vars_from_kerchunk_refs(
+                vds_refs,
+                drop_variables=drop_variables + loadable_variables,
+                virtual_array_class=virtual_array_class,
+            )
+            ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(
+                ".zattrs", {}
+            )
 
-        ds = xr.open_dataset(fpath, drop_variables=drop_variables)
+        if indexes is None or len(loadable_variables) > 0:
+            # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+            # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+            # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+            fpath = _fsspec_openfile_from_filepath(
+                filepath=filepath, reader_options=reader_options
+            )
 
-        if indexes is None:
-            # add default indexes by reading data from file
-            indexes = {name: index for name, index in ds.xindexes.items()}
-        elif indexes != {}:
-            # TODO allow manual specification of index objects
-            raise NotImplementedError()
-        else:
-            indexes = dict(**indexes)  # for type hinting: to allow mutation
+            ds = xr.open_dataset(fpath, drop_variables=drop_variables)
 
-        loadable_vars = {
-            name: var
-            for name, var in ds.variables.items()
-            if name in loadable_variables
-        }
+            if indexes is None:
+                # add default indexes by reading data from file
+                indexes = {name: index for name, index in ds.xindexes.items()}
+            elif indexes != {}:
+                # TODO allow manual specification of index objects
+                raise NotImplementedError()
+            else:
+                indexes = dict(**indexes)  # for type hinting: to allow mutation
 
-        # if we only read the indexes we can just close the file right away as nothing is lazy
-        if loadable_vars == {}:
-            ds.close()
-    else:
-        loadable_vars = {}
-        indexes = {}
+            loadable_vars = {
+                name: var
+                for name, var in ds.variables.items()
+                if name in loadable_variables
+            }
 
-    vars = {**virtual_vars, **loadable_vars}
+            # if we only read the indexes we can just close the file right away as nothing is lazy
+            if loadable_vars == {}:
+                ds.close()
+        else:
+            loadable_vars = {}
+            indexes = {}
 
-    data_vars, coords = separate_coords(vars, indexes)
-    vds = xr.Dataset(
-        data_vars,
-        coords=coords,
-        # indexes={},  # TODO should be added in a later version of xarray
-        attrs=ds_attrs,
-    )
+        vars = {**virtual_vars, **loadable_vars}
 
-    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+        data_vars, coords = separate_coords(vars, indexes)
+        vds = xr.Dataset(
+            data_vars,
+            coords=coords,
+            # indexes={},  # TODO should be added in a later version of xarray
+            attrs=ds_attrs,
+        )
 
-    return vds
+        # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+
+        return vds
 
 
 def open_virtual_dataset_from_v3_store(

From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 21 May 2024 08:48:05 -0600
Subject: [PATCH 18/79] Fix early return from hdf _extract_attrs.

---
 virtualizarr/readers/hdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 19d99b3f..be93237f 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
             continue
 
         attrs[n] = v
-        return attrs
+    return attrs
 
 
 def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:

From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 21 May 2024 09:23:50 -0600
Subject: [PATCH 19/79] Test that _extract_attrs correctly handles multiple
 attributes.

---
 virtualizarr/tests/test_readers/conftest.py |  3 ++-
 virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 53c9630e..fe2ec889 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def string_attribute_netcdf4_file(tmpdir):
+def string_attributes_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/attributes.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
     f.create_dataset(name="data", data=data, chunks=None)
     f["data"].attrs["attribute_name"] = "attribute_name"
+    f["data"].attrs["attribute_name2"] = "attribute_name2"
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a83bfc39..a67352e6 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
         var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
         assert var.chunks == (2,)
 
-    def test_dataset_attributes(self, string_attribute_netcdf4_file):
-        f = h5py.File(string_attribute_netcdf4_file)
+    def test_dataset_attributes(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
         ds = f["data"]
-        var = _dataset_to_variable(string_attribute_netcdf4_file, ds)
+        var = _dataset_to_variable(string_attributes_netcdf4_file, ds)
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
 class TestExtractAttributes:
-    def test_string_attribute(self, string_attribute_netcdf4_file):
-        f = h5py.File(string_attribute_netcdf4_file)
+    def test_string_attribute(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
@@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file):
         attrs = _extract_attrs(f)
         assert attrs["attribute_name"] == "attribute_name"
 
+    def test_multiple_attributes(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
+        ds = f["data"]
+        attrs = _extract_attrs(ds)
+        assert len(attrs.keys()) == 2
+
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):

From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 22 May 2024 14:16:12 -0600
Subject: [PATCH 20/79] Initial attempt at scale and offset via numcodecs.

---
 virtualizarr/readers/hdf.py         | 14 ++++++++---
 virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index be93237f..c251866b 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -5,7 +5,7 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.readers.hdf_filters import codecs_from_dataset
+from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
 from virtualizarr.types import ChunkKey
 from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import ZArray
@@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
     chunks = dataset.chunks if dataset.chunks else dataset.shape
     codecs = codecs_from_dataset(dataset)
+    cfcodec = cfcodec_from_dataset(dataset)
+    attrs = _extract_attrs(dataset)
+    if cfcodec:
+        codecs.append(cfcodec["codec"])
+        dtype = cfcodec["target_dtype"]
+        attrs.pop("scale_factor", None)
+        attrs.pop("add_offset", None)
+    else:
+        dtype = dataset.dtype
     filters = [codec.get_config() for codec in codecs]
     zarray = ZArray(
         chunks=chunks,
         compressor=None,
-        dtype=dataset.dtype,
+        dtype=dtype,
         fill_value=dataset.fillvalue,
         filters=filters,
         order="C",
@@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     manifest = _dataset_chunk_manifest(path, dataset)
     marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
     dims = _dataset_dims(dataset)
-    attrs = _extract_attrs(dataset)
     variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index dfe1c1f3..169eab97 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,10 +1,13 @@
-from typing import List, Tuple, Union
+from typing import List, Tuple, TypedDict, Union
 
 import h5py
 import hdf5plugin
 import numcodecs.registry as registry
+import numpy as np
 from numcodecs.abc import Codec
+from numcodecs.fixedscaleoffset import FixedScaleOffset
 from pydantic import BaseModel, validator
+from xarray.coding.variables import _choose_float_dtype
 
 _non_standard_filters = {"gzip": "zlib"}
 
@@ -24,6 +27,11 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
+class CFCodec(TypedDict):
+    target_dtype: np.dtype
+    codec: Codec
+
+
 def _filter_to_codec(
     filter_id: str, filter_properties: Union[int, None, Tuple] = None
 ) -> Codec:
@@ -61,6 +69,32 @@ def _filter_to_codec(
     return codec
 
 
+def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
+    attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
+    mapping = {}
+    if "scale_factor" in attributes:
+        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+    else:
+        mapping["scale_factor"] = 1
+    if "add_offset" in attributes:
+        mapping["add_offset"] = attributes["add_offset"]
+    else:
+        mapping["add_offset"] = 0
+    if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:
+        float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping)
+        target_dtype = np.dtype(float_dtype)
+        codec = FixedScaleOffset(
+            offset=mapping["add_offset"],
+            scale=mapping["scale_factor"],
+            dtype=target_dtype,
+            astype=dataset.dtype,
+        )
+        cfcodec = CFCodec(target_dtype=target_dtype, codec=codec)
+        return cfcodec
+    else:
+        return None
+
+
 def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
     codecs = []
     for filter_id, filter_properties in dataset._filters.items():

From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 23 May 2024 10:54:48 -0600
Subject: [PATCH 21/79] Tests for cfcodec_from_dataset.

---
 virtualizarr/tests/test_readers/conftest.py   | 10 +++++++
 .../tests/test_readers/test_hdf_filters.py    | 29 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index fe2ec889..202cdd9c 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
     return filepath
+
+
+@pytest.fixture
+def add_offset_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/offset.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f["data"].attrs.create(name="add_offset", data=5)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 28b5d69f..dca9f40d 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,9 +1,11 @@
 import h5py
 import numcodecs
+import numpy as np
 import pytest
 
 from virtualizarr.readers.hdf_filters import (
     _filter_to_codec,
+    cfcodec_from_dataset,
     codecs_from_dataset,
 )
 
@@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
             assert decoded == np_uncompressed.tobytes()
+
+
+class TestCFCodecFromDataset:
+    def test_no_cf_convention(self, filter_encoded_netcdf4_file):
+        f = h5py.File(filter_encoded_netcdf4_file)
+        ds = f["data"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec is None
+
+    def test_cf_scale_factor(self, netcdf4_file):
+        f = h5py.File(netcdf4_file)
+        ds = f["air"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec["target_dtype"] == np.dtype(np.float64)
+        assert cf_codec["codec"].scale == 100.0
+        assert cf_codec["codec"].offset == 0
+        assert cf_codec["codec"].dtype == "<f8"
+        assert cf_codec["codec"].astype == "<i2"
+
+    def test_cf_add_offset(self, add_offset_netcdf4_file):
+        f = h5py.File(add_offset_netcdf4_file)
+        ds = f["data"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec["target_dtype"] == np.dtype(np.float64)
+        assert cf_codec["codec"].scale == 1
+        assert cf_codec["codec"].offset == 5
+        assert cf_codec["codec"].dtype == "<f8"

From 0df332d04d438f291abc7b952a15e8851e6e9777 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 12:47:12 -0600
Subject: [PATCH 22/79] Temporarily relax integration tests to assert_allclose.

---
 virtualizarr/tests/test_integration.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 064968b3..1b9aad83 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
         roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
 
         # assert equal to original dataset
-        xrt.assert_equal(roundtrip, ds)
+        xrt.assert_allclose(roundtrip, ds)
 
     def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         # set up example xarray dataset
@@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
 
         # assert equal to original dataset
-        xrt.assert_equal(roundtrip, ds)
+        xrt.assert_allclose(roundtrip, ds)
 
 
 def test_open_scalar_variable(tmpdir):

From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 13:50:49 -0600
Subject: [PATCH 23/79] Add blosc_lz4 fixture parameterization to confirm
 libnetcdf environment.

---
 virtualizarr/tests/test_readers/conftest.py         | 13 +++++++++----
 .../tests/test_readers/test_hdf_integration.py      |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 202cdd9c..20d5433e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -134,7 +134,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc"])
+@pytest.fixture(params=["gzip", "blosc_lz4"])
 def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
         f.create_dataset(
             name="data", data=np_uncompressed, compression="gzip", compression_opts=1
         )
-    if request.param == "blosc":
+    if request.param == "blosc_lz4":
         f.create_dataset(
             name="data",
             data=np_uncompressed,
@@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip"])
-def filter_encoded_xarray_netcdf4_files(tmpdir, request):
+@pytest.fixture(params=["gzip", "blosc_zlib"])
+def filter_encoded_xarray_netcdf4_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
         encoding_config = {"zlib": True, "complevel": 1}
+    if "blosc" in request.param:
+        encoding_config = {
+            "compression": request.param,
+        }
 
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    #  ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index b31289c0..ade8e7ce 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,9 +7,9 @@
 
 
 class TestIntegration:
-    def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files):
+    def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4")
+            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")

From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 14:05:21 -0600
Subject: [PATCH 24/79] Check for compatability with netcdf4 engine.

---
 virtualizarr/tests/test_readers/conftest.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 20d5433e..cb1212f0 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
-    #  ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
 
 

From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 12:58:48 -0600
Subject: [PATCH 25/79] Use separate fixtures for h5netcdf and netcdf4
 compression styles.

---
 virtualizarr/tests/test_readers/conftest.py   | 27 ++++++++++++++-----
 .../test_readers/test_hdf_integration.py      | 20 ++++++++++++--
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index cb1212f0..a4fafed3 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from xarray.tests.test_dataset import create_test_data
 
 
 @pytest.fixture
@@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip", "blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+@pytest.fixture(params=["gzip"])
+def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
         encoding_config = {"zlib": True, "complevel": 1}
+
+    for var_name in ds.variables:
+        encoding[var_name] = encoding_config
+
+    filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    return filepath
+
+
+@pytest.fixture(params=["blosc_zlib"])
+def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+    ds = create_test_data(dim_sizes=(20, 80, 10))
     if "blosc" in request.param:
         encoding_config = {
             "compression": request.param,
+            "chunksizes": (20, 40),
+            "original_shape": ds.var2.shape,
+            "blosc_shuffle": 1,
+            "fletcher32": False,
         }
 
-    for var_name in ds.variables:
-        encoding[var_name] = encoding_config
-
+    ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index ade8e7ce..d6ecf2f1 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,9 +7,11 @@
 
 
 class TestIntegration:
-    def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
+    def test_filters_h5netcdf_roundtrip(
+        self, tmpdir, filter_encoded_xarray_h5netcdf_file
+    ):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+            filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
@@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
 
         ds = xr.open_dataset(m, engine="kerchunk")
         assert isinstance(ds.air.values[0][0][0], numpy.float64)
+
+    def test_filters_netcdf4_roundtrip(
+        self, tmpdir, filter_encoded_xarray_netcdf4_file
+    ):
+        virtual_ds = virtualizarr.open_virtual_dataset(
+            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+        )
+        kerchunk_file = f"{tmpdir}/kerchunk.json"
+        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        fs = fsspec.filesystem("reference", fo=kerchunk_file)
+        m = fs.get_mapper("")
+
+        ds = xr.open_dataset(m, engine="kerchunk")
+        print(ds["var2"].encoding)

From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 15:49:22 -0600
Subject: [PATCH 26/79] Print libhdf5 and libnetcdf4 versions to confirm
 compiled environment.

---
 virtualizarr/tests/test_readers/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index a4fafed3..8904dd38 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -4,6 +4,7 @@
 import pytest
 import xarray as xr
 from xarray.tests.test_dataset import create_test_data
+from xarray.util.print_versions import netcdf_and_hdf5_versions
 
 
 @pytest.fixture
@@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
 
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    print(netcdf_and_hdf5_versions())
     ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
 

From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 17:07:51 -0600
Subject: [PATCH 27/79] Skip netcdf4 style compression tests when libhdf5 <
 1.14.

---
 virtualizarr/tests/test_readers/conftest.py   | 15 ++++++++++++---
 .../test_readers/test_hdf_integration.py      | 19 ++++++++++++++++---
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 8904dd38..0ddb2a01 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from packaging.version import Version
 from xarray.tests.test_dataset import create_test_data
 from xarray.util.print_versions import netcdf_and_hdf5_versions
 
@@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
     return filepath
 
 
+@pytest.fixture()
+def skip_test_for_libhdf5_version():
+    versions = netcdf_and_hdf5_versions()
+    libhdf5_version = Version(versions[0][1])
+    return libhdf5_version < Version("1.14")
+
+
 @pytest.fixture(params=["blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version):
+    if skip_test_for_libhdf5_version:
+        pytest.skip("Requires libhdf5 >= 1.14")
     ds = create_test_data(dim_sizes=(20, 80, 10))
     if "blosc" in request.param:
         encoding_config = {
@@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
 
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    print(netcdf_and_hdf5_versions())
     ds.to_netcdf(filepath, engine="netcdf4")
-    return filepath
+    return {"filepath": filepath, "compressor": request.param}
 
 
 @pytest.fixture
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index d6ecf2f1..f51ebd45 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip(
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):
+        filepath = filter_encoded_xarray_netcdf4_file["filepath"]
+        compressor = filter_encoded_xarray_netcdf4_file["compressor"]
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+            filepath, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
         fs = fsspec.filesystem("reference", fo=kerchunk_file)
         m = fs.get_mapper("")
-
         ds = xr.open_dataset(m, engine="kerchunk")
-        print(ds["var2"].encoding)
+
+        expected_encoding = ds["var2"].encoding.copy()
+        compression = expected_encoding.pop("compression")
+        blosc_shuffle = expected_encoding.pop("blosc_shuffle")
+        if compression is not None:
+            if "blosc" in compression and blosc_shuffle:
+                expected_encoding["blosc"] = {
+                    "compressor": compressor,
+                    "shuffle": blosc_shuffle,
+                }
+                expected_encoding["shuffle"] = False
+        actual_encoding = ds["var2"].encoding
+        assert expected_encoding.items() <= actual_encoding.items()

From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 11 Jun 2024 13:57:51 -0600
Subject: [PATCH 28/79] Include imagecodecs.numcodecs to support HDF5 lzf
 filters.

---
 pyproject.toml                                      | 1 +
 virtualizarr/readers/hdf_filters.py                 | 2 +-
 virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++----
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f0563f09..773cccc2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ test = [
     "fsspec",
     "s3fs",
     "fastparquet",
+    "imagecodecs-numcodecs",
 ]
 
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 169eab97..08a3bba4 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, validator
 from xarray.coding.variables import _choose_float_dtype
 
-_non_standard_filters = {"gzip": "zlib"}
+_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"}
 
 
 class BloscProperties(BaseModel):
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index dca9f40d..b5b04047 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,7 +1,7 @@
 import h5py
+import imagecodecs
 import numcodecs
 import numpy as np
-import pytest
 
 from virtualizarr.readers.hdf_filters import (
     _filter_to_codec,
@@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self):
         codec = _filter_to_codec("gzip", 1)
         assert isinstance(codec, numcodecs.zlib.Zlib)
 
-    def test_lzf_not_available(self):
-        with pytest.raises(ValueError, match="codec not available"):
-            _filter_to_codec("lzf")
+    def test_lzf(self):
+        codec = _filter_to_codec("lzf")
+        assert isinstance(codec, imagecodecs.numcodecs.Lzf)
 
     def test_blosc(self):
         codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1))

From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 11 Jun 2024 16:24:30 -0600
Subject: [PATCH 29/79] Remove test that verifies call to
 read_kerchunk_references_from_file.

---
 virtualizarr/tests/test_xarray.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 695759bd..d145550e 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -1,5 +1,4 @@
 from collections.abc import Mapping
-from unittest.mock import patch
 
 import numpy as np
 import pytest
@@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file):
         for name in full_ds.variables:
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
-
-    @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
-    def test_open_virtual_dataset_passes_expected_args(
-        self, mock_read_kerchunk, netcdf4_file
-    ):
-        reader_options = {"option1": "value1", "option2": "value2"}
-        open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
-        args = {
-            "filepath": netcdf4_file,
-            "filetype": None,
-            "reader_options": reader_options,
-        }
-        mock_read_kerchunk.assert_called_once_with(**args)

From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 12 Jun 2024 15:05:34 -0600
Subject: [PATCH 30/79] Add additional codec support structures for imagecodecs
 and numcodecs.

---
 virtualizarr/readers/hdf_filters.py         | 23 +++++++++++++++++----
 virtualizarr/tests/test_readers/conftest.py |  9 +++++++-
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 08a3bba4..667ff09a 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -9,7 +9,12 @@
 from pydantic import BaseModel, validator
 from xarray.coding.variables import _choose_float_dtype
 
-_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"}
+_non_standard_filters = {
+    "gzip": "zlib",
+    "lzf": "imagecodecs_lzf",
+}
+
+_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"}
 
 
 class BloscProperties(BaseModel):
@@ -27,6 +32,10 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
+class ZstdProperties(BaseModel):
+    level: int
+
+
 class CFCodec(TypedDict):
     target_dtype: np.dtype
     codec: Codec
@@ -41,18 +50,20 @@ def _filter_to_codec(
         id_int = int(filter_id)
     except ValueError:
         id_str = filter_id
-
+    conf = {}
     if id_str:
         if id_str in _non_standard_filters.keys():
             id = _non_standard_filters[id_str]
         else:
             id = id_str
-        conf = {"id": id}
+        conf["id"] = id  # type: ignore[assignment]
         if id == "zlib":
             conf["level"] = filter_properties  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
+        if id in _hdf5plugin_imagecodecs.keys():
+            id = _hdf5plugin_imagecodecs[id]
         if id == "blosc" and isinstance(filter_properties, tuple):
             blosc_props = BloscProperties(
                 **{
@@ -63,7 +74,11 @@ def _filter_to_codec(
                 }
             )
             conf = blosc_props.model_dump()  # type: ignore[assignment]
-            conf["id"] = id
+        if id == "zstd" and isinstance(filter_properties, tuple):
+            zstd_props = ZstdProperties(level=filter_properties[0])
+            conf = zstd_props.model_dump()  # type: ignore[assignment]
+
+        conf["id"] = id
 
     codec = registry.get_codec(conf)
     return codec
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 0ddb2a01..3e6f9c3f 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -137,7 +137,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc_lz4"])
+@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
 def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
             data=np_uncompressed,
             **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
         )
+    if request.param == "lz4":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0))
+    if request.param == "bzip2":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2())
+    if request.param == "zstd":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2))
+
     return filepath
 
 

From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 12 Jun 2024 16:17:04 -0600
Subject: [PATCH 31/79] Add codec config test for Zstd.

---
 virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index b5b04047..4d23a756 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -31,6 +31,12 @@ def test_blosc(self):
         }
         assert codec.get_config() == expected_config
 
+    def test_zstd(self):
+        codec = _filter_to_codec("32015", (5,))
+        assert isinstance(codec, numcodecs.zstd.Zstd)
+        expected_config = {"id": "zstd", "level": 5}
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
     def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):

From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 18:03:26 -0600
Subject: [PATCH 32/79] Include initial cf decoding tests.

---
 virtualizarr/readers/hdf_filters.py           |  3 +-
 virtualizarr/tests/test_readers/conftest.py   | 34 ++++++++++++++++---
 .../tests/test_readers/test_hdf_filters.py    | 28 +++++++++++++++
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 667ff09a..f4e2dcfa 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+        mapping["scale_factor"] = 1 / attributes["scale_factor"]
+        # mapping["scale_factor"] =attributes["scale_factor"][0]
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 3e6f9c3f..e1a53c5e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve
 
 
 @pytest.fixture
-def add_offset_netcdf4_file(tmpdir):
+def np_uncompressed_int16():
+    return np.arange(100, dtype=np.int16)
+
+
+@pytest.fixture
+def offset():
+    return np.float32(5.0)
+
+
+@pytest.fixture
+def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset):
     filepath = f"{tmpdir}/offset.nc"
     f = h5py.File(filepath, "w")
-    data = np.random.random((10, 10))
-    f.create_dataset(name="data", data=data, chunks=None)
-    f["data"].attrs.create(name="add_offset", data=5)
+    data = np_uncompressed_int16 - offset
+    f.create_dataset(name="data", data=data, chunks=True)
+    f["data"].attrs.create(name="add_offset", data=offset)
+    return filepath
+
+
+@pytest.fixture
+def scale_factor():
+    return 0.01
+
+
+@pytest.fixture
+def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
+    filepath = f"{tmpdir}/scale_offset.nc"
+    f = h5py.File(filepath, "w")
+    data = (np_uncompressed_int16 - offset) / scale_factor
+    f.create_dataset(name="data", data=data, chunks=True)
+    f["data"].attrs.create(name="add_offset", data=offset)
+    f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor]))
     return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 4d23a756..960bcf2c 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file):
         assert cf_codec["codec"].scale == 1
         assert cf_codec["codec"].offset == 5
         assert cf_codec["codec"].dtype == "<f8"
+
+    def test_cf_codec_decoding_offset(
+        self, add_offset_netcdf4_file, np_uncompressed_int16
+    ):
+        f = h5py.File(add_offset_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        cfcodec = cfcodec_from_dataset(ds)
+        with open(add_offset_netcdf4_file, "rb") as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = cfcodec["codec"].decode(bytes_read)
+            assert np.array_equal(decoded, np_uncompressed_int16)
+            assert decoded.dtype == np.float64
+
+    def test_cf_codec_decoding_scale_offset(
+        self, scale_add_offset_netcdf4_file, np_uncompressed_int16
+    ):
+        f = h5py.File(scale_add_offset_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        cfcodec = cfcodec_from_dataset(ds)
+        with open(scale_add_offset_netcdf4_file, "rb") as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = cfcodec["codec"].decode(bytes_read)
+            assert np.allclose(decoded, np_uncompressed_int16)
+            assert decoded.dtype == np.float64

From b052f8c8f88e287bfdc684db0b595f32dfa88b15 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 19:49:54 -0600
Subject: [PATCH 33/79] Revert typo for scale_factor retrieval.

---
 virtualizarr/readers/hdf_filters.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index f4e2dcfa..667ff09a 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"]
-        # mapping["scale_factor"] =attributes["scale_factor"][0]
+        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:

From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 20:12:44 -0600
Subject: [PATCH 34/79] Update reader to use new numpy manifest representation.

---
 virtualizarr/readers/hdf.py                 | 29 ++++++++++-----------
 virtualizarr/tests/test_readers/test_hdf.py |  4 +--
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index c251866b..b96bdff7 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
                 path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
             )
             chunk_key = ChunkKey(key)
-            chunk_entries = {chunk_key: chunk_entry}
+            chunk_entries = {chunk_key: chunk_entry.dict()}
             chunk_manifest = ChunkManifest(entries=chunk_entries)
             return chunk_manifest
     else:
         num_chunks = dsid.get_num_chunks()
         if num_chunks == 0:
             raise ValueError("The dataset is chunked but contains no chunks")
+        paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType)  # type: ignore
+        offsets = np.empty((num_chunks), dtype=np.int32)
+        lengths = np.empty((num_chunks), dtype=np.int32)
 
-        chunk_entries = dict()
-
-        def get_key(blob):
-            key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
-            key = ".".join(map(str, key_list))
-            return key
-
-        def store_chunk_entry(blob):
-            chunk_entries[get_key(blob)] = ChunkEntry(
-                path=path, offset=blob.byte_offset, length=blob.size
-            )
+        def add_chunk_info(blob, chunk_index):
+            offsets[chunk_index] = blob.byte_offset
+            lengths[chunk_index] = blob.size
+            chunk_index += 1
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
         if has_chunk_iter:
-            dsid.chunk_iter(store_chunk_entry)
+            chunk_index = 0
+            dsid.chunk_iter(add_chunk_info, chunk_index)
         else:
             for index in range(num_chunks):
-                store_chunk_entry(dsid.get_chunk_info(index))
+                add_chunk_info(dsid.get_chunk_info(index), index)
 
-        chunk_manifest = ChunkManifest(entries=chunk_entries)
+        chunk_manifest = ChunkManifest.from_arrays(
+            paths=paths, offsets=offsets, lengths=lengths
+        )
         return chunk_manifest
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a67352e6..8c5a40a7 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file):
         f = h5py.File(no_chunks_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
-        assert len(manifest.entries) == 1
+        assert len(manifest) == 1
 
     def test_chunked(self, chunked_netcdf4_file):
         f = h5py.File(chunked_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
-        assert len(manifest.entries) == 4
+        assert len(manifest) == 4
 
 
 class TestDatasetDims:

From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:05:01 -0600
Subject: [PATCH 35/79] Temporarily skip test until blosc netcdf4 issue is
 solved.

---
 virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index f51ebd45..dca34dbd 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,5 +1,6 @@
 import fsspec
 import numpy
+import pytest
 import xarray as xr
 
 import virtualizarr
@@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip(
         ds = xr.open_dataset(m, engine="kerchunk")
         assert isinstance(ds.air.values[0][0][0], numpy.float64)
 
+    @pytest.mark.skip(
+        reason="Issue with xr 'dim1' serialization and blosc availability"
+    )
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):

From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:24:07 -0600
Subject: [PATCH 36/79] Fix Pydantic 2 migration warnings.

---
 virtualizarr/readers/hdf_filters.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 667ff09a..cc8e810e 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -6,7 +6,7 @@
 import numpy as np
 from numcodecs.abc import Codec
 from numcodecs.fixedscaleoffset import FixedScaleOffset
-from pydantic import BaseModel, validator
+from pydantic import BaseModel, field_validator
 from xarray.coding.variables import _choose_float_dtype
 
 _non_standard_filters = {
@@ -23,7 +23,7 @@ class BloscProperties(BaseModel):
     shuffle: int
     cname: str
 
-    @validator("cname", pre=True)
+    @field_validator("cname", mode="before")
     def get_cname_from_code(cls, v):
         blosc_compressor_codes = {
             value: key
@@ -69,7 +69,7 @@ def _filter_to_codec(
                 **{
                     k: v
                     for k, v in zip(
-                        BloscProperties.__fields__.keys(), filter_properties[-4:]
+                        BloscProperties.model_fields.keys(), filter_properties[-4:]
                     )
                 }
             )

From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:30:55 -0600
Subject: [PATCH 37/79] Include hdf5plugin and imagecodecs-numcodecs in mamba
 test environment.

---
 ci/environment.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/environment.yml b/ci/environment.yml
index 0385ea5a..e909beec 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -14,6 +14,7 @@ dependencies:
   - ujson
   - packaging
   - universal_pathlib
+  - hdf5plugin
   # Testing
   - codecov
   - pre-commit
@@ -26,3 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
+  - imagecodecs-numcodecs

From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:35:24 -0600
Subject: [PATCH 38/79] Mamba attempt with imagecodecs rather than
 imagecodecs-numcodecs.

---
 ci/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index e909beec..20784a6e 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -27,4 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  - imagecodecs-numcodecs
+  - imagecodecs

From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:41:14 -0600
Subject: [PATCH 39/79] Mamba attempt with latest imagecodecs release.

---
 ci/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index 20784a6e..fb967bcd 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -27,4 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  - imagecodecs
+  - imagecodecs>=2024.6.1

From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 25 Jun 2024 19:34:35 -0600
Subject: [PATCH 40/79] Use correct iter_chunks callback function signtature.

---
 virtualizarr/readers/hdf.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index b96bdff7..d082b717 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
         def add_chunk_info(blob, chunk_index):
             offsets[chunk_index] = blob.byte_offset
             lengths[chunk_index] = blob.size
-            chunk_index += 1
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
         if has_chunk_iter:
-            chunk_index = 0
-            dsid.chunk_iter(add_chunk_info, chunk_index)
+
+            def create_callback(initial=0):
+                value = initial
+
+                def callback(blob):
+                    nonlocal value
+                    add_chunk_info(blob, chunk_index=value)
+                    value += 1
+
+                return callback
+
+            callback = create_callback()
+            dsid.chunk_iter(callback)
         else:
             for index in range(num_chunks):
                 add_chunk_info(dsid.get_chunk_info(index), index)

From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 25 Jun 2024 19:35:40 -0600
Subject: [PATCH 41/79] Include pip based imagecodecs-numcodecs until
 conda-forge availability.

---
 ci/environment.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/environment.yml b/ci/environment.yml
index fb967bcd..e2f5a865 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -28,3 +28,5 @@ dependencies:
   - s3fs
   - fastparquet
   - imagecodecs>=2024.6.1
+  - pip:
+    - imagecodecs-numcodecs

From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 27 Jun 2024 15:43:10 -0600
Subject: [PATCH 42/79] Handle non-coordinate dims which are serialized to hdf
 as empty dataset.

---
 virtualizarr/readers/hdf.py                 | 65 ++++++++++++---------
 virtualizarr/tests/test_integration.py      | 18 +++++-
 virtualizarr/tests/test_readers/test_hdf.py |  1 +
 virtualizarr/xarray.py                      |  2 +-
 4 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d082b717..cbbe824f 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -11,7 +11,9 @@
 from virtualizarr.zarr import ZArray
 
 
-def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
+def _dataset_chunk_manifest(
+    path: str, dataset: h5py.Dataset
+) -> Optional[ChunkManifest]:
     """
     Generate ChunkManifest for HDF5 dataset.
 
@@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
 
     if dataset.chunks is None:
         if dsid.get_offset() is None:
-            raise ValueError("Dataset has no space allocated in the file")
+            return None
         else:
             key_list = [0] * (len(dataset.shape) or 1)
             key = ".".join(map(str, key_list))
@@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
     return attrs
 
 
-def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
+def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
-    chunks = dataset.chunks if dataset.chunks else dataset.shape
-    codecs = codecs_from_dataset(dataset)
-    cfcodec = cfcodec_from_dataset(dataset)
-    attrs = _extract_attrs(dataset)
-    if cfcodec:
-        codecs.append(cfcodec["codec"])
-        dtype = cfcodec["target_dtype"]
-        attrs.pop("scale_factor", None)
-        attrs.pop("add_offset", None)
-    else:
-        dtype = dataset.dtype
-    filters = [codec.get_config() for codec in codecs]
-    zarray = ZArray(
-        chunks=chunks,
-        compressor=None,
-        dtype=dtype,
-        fill_value=dataset.fillvalue,
-        filters=filters,
-        order="C",
-        shape=dataset.shape,
-        zarr_format=2,
-    )
+
     manifest = _dataset_chunk_manifest(path, dataset)
-    marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
-    dims = _dataset_dims(dataset)
-    variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
+    if manifest:
+        chunks = dataset.chunks if dataset.chunks else dataset.shape
+        codecs = codecs_from_dataset(dataset)
+        cfcodec = cfcodec_from_dataset(dataset)
+        attrs = _extract_attrs(dataset)
+        if cfcodec:
+            codecs.append(cfcodec["codec"])
+            dtype = cfcodec["target_dtype"]
+            attrs.pop("scale_factor", None)
+            attrs.pop("add_offset", None)
+        else:
+            dtype = dataset.dtype
+        filters = [codec.get_config() for codec in codecs]
+        zarray = ZArray(
+            chunks=chunks,
+            compressor=None,
+            dtype=dtype,
+            fill_value=dataset.fillvalue,
+            filters=filters,
+            order="C",
+            shape=dataset.shape,
+            zarr_format=2,
+        )
+        marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
+        dims = _dataset_dims(dataset)
+        variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
+    else:
+        variable = None
     return variable
 
 
@@ -217,7 +223,8 @@ def virtual_vars_from_hdf(
         if key not in drop_variables:
             if isinstance(f[key], h5py.Dataset):
                 variable = _dataset_to_variable(path, f[key])
-                variables[key] = variable
+                if variable is not None:
+                    variables[key] = variable
             else:
                 raise NotImplementedError("Nested groups are not yet supported")
 
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 451862c6..6a1f91ef 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
             )
 
-        # assert identical to original dataset
+        # assert all_close to original dataset
         xrt.assert_allclose(roundtrip, ds)
 
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+
     def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format):
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
             )
 
-        # assert identical to original dataset
-        xrt.assert_identical(roundtrip, ds)
+        # assert all_close to original dataset
+        xrt.assert_allclose(roundtrip, ds)
+
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
 
     def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
@@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # assert equal to original dataset
         xrt.assert_allclose(roundtrip, ds)
 
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+
 
 def test_open_scalar_variable(tmpdir):
     # regression test for GH issue #100
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 8c5a40a7..c744cd68 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file):
         with pytest.raises(ValueError, match="chunked but contains no chunks"):
             _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
 
+    @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
     def test_empty_dataset(self, empty_dataset_netcdf4_file):
         f = h5py.File(empty_dataset_netcdf4_file)
         ds = f["data"]
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 39bd0671..a8a23693 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -121,7 +121,7 @@ def open_virtual_dataset(
             ds_attrs = attrs_from_root_group(
                 path=filepath, reader_options=reader_options
             )
-            coord_names = None
+            coord_names = ds_attrs.pop("coordinates", [])
         else:
             # this is the only place we actually always need to use kerchunk directly
             # TODO avoid even reading byte ranges for variables that will be dropped later anyway?

From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 27 Jun 2024 16:23:43 -0600
Subject: [PATCH 43/79] Use reader_options for filetype check and update
 failing kerchunk call.

---
 virtualizarr/tests/test_xarray.py | 18 +++++++++++++-----
 virtualizarr/xarray.py            |  4 +++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index e55583bf..282d4ad1 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -8,6 +8,7 @@
 from xarray.core.indexes import Index
 
 from virtualizarr import open_virtual_dataset
+from virtualizarr.kerchunk import FileType
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.tests import network, requires_s3fs
 from virtualizarr.zarr import ZArray
@@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file):
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
 
-    @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
+    @patch("virtualizarr.xarray._automatically_determine_filetype")
+    @patch("virtualizarr.xarray.virtual_vars_from_hdf")
     def test_open_virtual_dataset_passes_expected_args(
-        self, mock_read_kerchunk, netcdf4_file
+        self, mock_reader, mock_determine_filetype, netcdf4_file
     ):
         reader_options = {"option1": "value1", "option2": "value2"}
+        mock_determine_filetype.return_value = FileType.netcdf4
         open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
-        args = {
+        reader_args = {
+            "path": netcdf4_file,
+            "drop_variables": [],
+            "reader_options": reader_options,
+        }
+        mock_reader.assert_called_once_with(**reader_args)
+        filetype_args = {
             "filepath": netcdf4_file,
-            "filetype": None,
             "reader_options": reader_options,
         }
-        mock_read_kerchunk.assert_called_once_with(**args)
+        mock_determine_filetype.assert_called_once_with(**filetype_args)
 
 
 class TestRenamePaths:
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index a8a23693..86a59c8d 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -109,7 +109,9 @@ def open_virtual_dataset(
         )
     else:
         if filetype is None:
-            filetype = _automatically_determine_filetype(filepath=filepath)
+            filetype = _automatically_determine_filetype(
+                filepath=filepath, reader_options=reader_options
+            )
         filetype = FileType(filetype)
 
         if filetype.name.lower() == "netcdf4":

From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:20:06 -0600
Subject: [PATCH 44/79] Fix chunkmanifest shaping for chunked datasets.

---
 virtualizarr/readers/hdf.py                 | 36 +++++++++------------
 virtualizarr/tests/test_readers/test_hdf.py | 10 ++++--
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index cbbe824f..d683f693 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Mapping, Optional, Union
 
 import h5py
@@ -48,32 +49,27 @@ def _dataset_chunk_manifest(
         num_chunks = dsid.get_num_chunks()
         if num_chunks == 0:
             raise ValueError("The dataset is chunked but contains no chunks")
-        paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType)  # type: ignore
-        offsets = np.empty((num_chunks), dtype=np.int32)
-        lengths = np.empty((num_chunks), dtype=np.int32)
 
-        def add_chunk_info(blob, chunk_index):
-            offsets[chunk_index] = blob.byte_offset
-            lengths[chunk_index] = blob.size
+        shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
+        paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
+        offsets = np.empty(shape, dtype=np.int32)
+        lengths = np.empty(shape, dtype=np.int32)
 
-        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-        if has_chunk_iter:
-
-            def create_callback(initial=0):
-                value = initial
+        def get_key(blob):
+            return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)])
 
-                def callback(blob):
-                    nonlocal value
-                    add_chunk_info(blob, chunk_index=value)
-                    value += 1
+        def add_chunk_info(blob):
+            key = get_key(blob)
+            paths[key] = path
+            offsets[key] = blob.byte_offset
+            lengths[key] = blob.size
 
-                return callback
-
-            callback = create_callback()
-            dsid.chunk_iter(callback)
+        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+        if has_chunk_iter:
+            dsid.chunk_iter(add_chunk_info)
         else:
             for index in range(num_chunks):
-                add_chunk_info(dsid.get_chunk_info(index), index)
+                add_chunk_info(dsid.get_chunk_info(index))
 
         chunk_manifest = ChunkManifest.from_arrays(
             paths=paths, offsets=offsets, lengths=lengths
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index c744cd68..25caab93 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file):
         f = h5py.File(no_chunks_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
-        assert len(manifest) == 1
+        assert manifest.shape_chunk_grid == (1, 1)
 
     def test_chunked(self, chunked_netcdf4_file):
         f = h5py.File(chunked_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
-        assert len(manifest) == 4
+        assert manifest.shape_chunk_grid == (2, 2)
+
+    def test_chunked_roundtrip(self, chunked_roundtrip):
+        f = h5py.File(chunked_roundtrip)
+        ds = f["var2"]
+        manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds)
+        assert manifest.shape_chunk_grid == (2, 8)
 
 
 class TestDatasetDims:

From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:22:05 -0600
Subject: [PATCH 45/79] Handle scale_factor attribute serialization for
 compressed files.

---
 virtualizarr/readers/hdf_filters.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index cc8e810e..1a3c2220 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+        try:
+            scale_factor = attributes["scale_factor"][0]
+        except IndexError:
+            scale_factor = attributes["scale_factor"]
+        mapping["scale_factor"] = 1 / scale_factor
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:

From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:24:13 -0600
Subject: [PATCH 46/79] Include chunked roundtrip fixture.

---
 virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index e1a53c5e..5fbec00e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve
             "blosc_shuffle": 1,
             "fletcher32": False,
         }
-
+    #  Check on how handle scalar dim.
+    ds = ds.drop_dims("dim3")
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="netcdf4")
@@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f
     f["data"].attrs.create(name="add_offset", data=offset)
     f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor]))
     return filepath
+
+
+@pytest.fixture()
+def chunked_roundtrip(tmpdir):
+    ds = create_test_data(dim_sizes=(20, 80, 10))
+    ds = ds.drop_dims("dim3")
+    filepath = f"{tmpdir}/chunked_xarray.nc"
+    ds.to_netcdf(
+        filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}}
+    )
+    return filepath

From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:24:48 -0600
Subject: [PATCH 47/79] Standardize xarray integration tests for hdf filters.

---
 .../test_readers/test_hdf_integration.py      | 47 ++++++-------------
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index dca34dbd..abc23df6 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,53 +1,34 @@
-import fsspec
-import numpy
 import pytest
 import xarray as xr
+import xarray.testing as xrt
 
 import virtualizarr
 from virtualizarr.kerchunk import FileType
 
 
 class TestIntegration:
+    @pytest.mark.xfail(reason="Investigate initial time value decoding issue")
     def test_filters_h5netcdf_roundtrip(
         self, tmpdir, filter_encoded_xarray_h5netcdf_file
     ):
-        virtual_ds = virtualizarr.open_virtual_dataset(
+        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False)
+        vds = virtualizarr.open_virtual_dataset(
             filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
-        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        fs = fsspec.filesystem("reference", fo=kerchunk_file)
-        m = fs.get_mapper("")
-
-        ds = xr.open_dataset(m, engine="kerchunk")
-        assert isinstance(ds.air.values[0][0][0], numpy.float64)
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(
+            kerchunk_file, engine="kerchunk", decode_times=False
+        )
+        xrt.assert_allclose(ds, roundtrip)
 
-    @pytest.mark.skip(
-        reason="Issue with xr 'dim1' serialization and blosc availability"
-    )
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):
         filepath = filter_encoded_xarray_netcdf4_file["filepath"]
-        compressor = filter_encoded_xarray_netcdf4_file["compressor"]
-        virtual_ds = virtualizarr.open_virtual_dataset(
-            filepath, filetype=FileType("netcdf4")
-        )
+        ds = xr.open_dataset(filepath)
+        vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
         kerchunk_file = f"{tmpdir}/kerchunk.json"
-        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        fs = fsspec.filesystem("reference", fo=kerchunk_file)
-        m = fs.get_mapper("")
-        ds = xr.open_dataset(m, engine="kerchunk")
-
-        expected_encoding = ds["var2"].encoding.copy()
-        compression = expected_encoding.pop("compression")
-        blosc_shuffle = expected_encoding.pop("blosc_shuffle")
-        if compression is not None:
-            if "blosc" in compression and blosc_shuffle:
-                expected_encoding["blosc"] = {
-                    "compressor": compressor,
-                    "shuffle": blosc_shuffle,
-                }
-                expected_encoding["shuffle"] = False
-        actual_encoding = ds["var2"].encoding
-        assert expected_encoding.items() <= actual_encoding.items()
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
+        xrt.assert_equal(ds, roundtrip)

From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 20:00:56 -0600
Subject: [PATCH 48/79] Update reader selection logic for new filetype
 determination.

---
 virtualizarr/xarray.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 1a795e56..9671264d 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -136,8 +136,7 @@ def open_virtual_dataset(
                 filepath=filepath, reader_options=reader_options
             )
         filetype = FileType(filetype)
-
-        if filetype.name.lower() == "netcdf4":
+        if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5":
             virtual_vars = virtual_vars_from_hdf(
                 path=filepath,
                 drop_variables=drop_variables,

From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 13:07:17 -0600
Subject: [PATCH 49/79] Use decode_times for integration test.

---
 .../tests/test_readers/test_hdf_integration.py       | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index abc23df6..882dea31 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,4 +1,3 @@
-import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -7,19 +6,18 @@
 
 
 class TestIntegration:
-    @pytest.mark.xfail(reason="Investigate initial time value decoding issue")
     def test_filters_h5netcdf_roundtrip(
         self, tmpdir, filter_encoded_xarray_h5netcdf_file
     ):
-        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False)
+        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
+            filter_encoded_xarray_h5netcdf_file,
+            loadable_variables=["time"],
+            cftime_variables=["time"],
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(
-            kerchunk_file, engine="kerchunk", decode_times=False
-        )
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
         xrt.assert_allclose(ds, roundtrip)
 
     def test_filters_netcdf4_roundtrip(

From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 13:38:46 -0600
Subject: [PATCH 50/79] Standardize fixture names for hdf5 vs netcdf4 file
 types.

---
 virtualizarr/tests/test_readers/conftest.py   | 36 +++++----
 virtualizarr/tests/test_readers/test_hdf.py   | 78 +++++++++----------
 .../tests/test_readers/test_hdf_filters.py    | 26 +++----
 .../test_readers/test_hdf_integration.py      | 10 +--
 4 files changed, 76 insertions(+), 74 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 5fbec00e..539b2fbb 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -9,7 +9,7 @@
 
 
 @pytest.fixture
-def empty_chunks_netcdf4_file(tmpdir):
+def empty_chunks_hdf5_file(tmpdir):
     ds = xr.Dataset({"data": []})
     filepath = f"{tmpdir}/empty_chunks.nc"
     ds.to_netcdf(filepath, engine="h5netcdf")
@@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def empty_dataset_netcdf4_file(tmpdir):
+def empty_dataset_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/empty_dataset.nc"
     f = h5py.File(filepath, "w")
     f.create_dataset("data", shape=(0,), dtype="f")
@@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def no_chunks_netcdf4_file(tmpdir):
+def no_chunks_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/no_chunks.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def chunked_netcdf4_file(tmpdir):
+def chunked_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/chunks.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((100, 100))
@@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def single_dimension_scale_netcdf4_file(tmpdir):
+def single_dimension_scale_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/single_dimension_scale.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def is_scale_netcdf4_file(tmpdir):
+def is_scale_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/is_scale.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def multiple_dimension_scales_netcdf4_file(tmpdir):
+def multiple_dimension_scales_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/multiple_dimension_scales.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def string_attributes_netcdf4_file(tmpdir):
+def string_attributes_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/attributes.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def root_attributes_netcdf4_file(tmpdir):
+def root_attributes_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/root_attributes.nc"
     f = h5py.File(filepath, "w")
     f.attrs["attribute_name"] = "attribute_name"
@@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def group_netcdf4_file(tmpdir):
+def group_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/group.nc"
     f = h5py.File(filepath, "w")
     f.create_group("group")
@@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def multiple_datasets_netcdf4_file(tmpdir):
+def multiple_datasets_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/multiple_datasets.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -138,7 +138,7 @@ def np_uncompressed():
 
 
 @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
-def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
+def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
     if request.param == "gzip":
@@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
 
 
 @pytest.fixture(params=["gzip"])
-def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
+def filter_encoded_roundtrip_hdf5_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
@@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version():
 
 
 @pytest.fixture(params=["blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version):
+def filter_encoded_roundtrip_netcdf4_file(
+    tmpdir, request, skip_test_for_libhdf5_version
+):
     if skip_test_for_libhdf5_version:
         pytest.skip("Requires libhdf5 >= 1.14")
     ds = create_test_data(dim_sizes=(20, 80, 10))
@@ -215,7 +217,7 @@ def offset():
 
 
 @pytest.fixture
-def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset):
+def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset):
     filepath = f"{tmpdir}/offset.nc"
     f = h5py.File(filepath, "w")
     data = np_uncompressed_int16 - offset
@@ -230,7 +232,7 @@ def scale_factor():
 
 
 @pytest.fixture
-def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
+def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
     filepath = f"{tmpdir}/scale_offset.nc"
     f = h5py.File(filepath, "w")
     data = (np_uncompressed_int16 - offset) / scale_factor
@@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f
 
 
 @pytest.fixture()
-def chunked_roundtrip(tmpdir):
+def chunked_roundtrip_hdf5_file(tmpdir):
     ds = create_test_data(dim_sizes=(20, 80, 10))
     ds = ds.drop_dims("dim3")
     filepath = f"{tmpdir}/chunked_xarray.nc"
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 25caab93..1fb0f6ee 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -11,59 +11,59 @@
 
 
 class TestDatasetChunkManifest:
-    def test_empty_chunks(self, empty_chunks_netcdf4_file):
-        f = h5py.File(empty_chunks_netcdf4_file)
+    def test_empty_chunks(self, empty_chunks_hdf5_file):
+        f = h5py.File(empty_chunks_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="chunked but contains no chunks"):
-            _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
+            _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds)
 
     @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
-    def test_empty_dataset(self, empty_dataset_netcdf4_file):
-        f = h5py.File(empty_dataset_netcdf4_file)
+    def test_empty_dataset(self, empty_dataset_hdf5_file):
+        f = h5py.File(empty_dataset_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="no space allocated in the file"):
-            _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds)
+            _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds)
 
-    def test_no_chunking(self, no_chunks_netcdf4_file):
-        f = h5py.File(no_chunks_netcdf4_file)
+    def test_no_chunking(self, no_chunks_hdf5_file):
+        f = h5py.File(no_chunks_hdf5_file)
         ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (1, 1)
 
-    def test_chunked(self, chunked_netcdf4_file):
-        f = h5py.File(chunked_netcdf4_file)
+    def test_chunked(self, chunked_hdf5_file):
+        f = h5py.File(chunked_hdf5_file)
         ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (2, 2)
 
-    def test_chunked_roundtrip(self, chunked_roundtrip):
-        f = h5py.File(chunked_roundtrip)
+    def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file):
+        f = h5py.File(chunked_roundtrip_hdf5_file)
         ds = f["var2"]
-        manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (2, 8)
 
 
 class TestDatasetDims:
-    def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file):
-        f = h5py.File(single_dimension_scale_netcdf4_file)
+    def test_single_dimension_scale(self, single_dimension_scale_hdf5_file):
+        f = h5py.File(single_dimension_scale_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims[0] == "x"
 
-    def test_is_dimension_scale(self, is_scale_netcdf4_file):
-        f = h5py.File(is_scale_netcdf4_file)
+    def test_is_dimension_scale(self, is_scale_hdf5_file):
+        f = h5py.File(is_scale_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims[0] == "data"
 
-    def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file):
-        f = h5py.File(multiple_dimension_scales_netcdf4_file)
+    def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file):
+        f = h5py.File(multiple_dimension_scales_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="dimension scales attached"):
             _dataset_dims(ds)
 
-    def test_no_dimension_scales(self, no_chunks_netcdf4_file):
-        f = h5py.File(no_chunks_netcdf4_file)
+    def test_no_dimension_scales(self, no_chunks_hdf5_file):
+        f = h5py.File(no_chunks_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims == ["phony_dim_0", "phony_dim_1"]
@@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
         var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds)
         assert var.chunks == (50, 50)
 
-    def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
-        f = h5py.File(single_dimension_scale_netcdf4_file)
+    def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
+        f = h5py.File(single_dimension_scale_hdf5_file)
         ds = f["data"]
-        var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
+        var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds)
         assert var.chunks == (2,)
 
-    def test_dataset_attributes(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_dataset_attributes(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
-        var = _dataset_to_variable(string_attributes_netcdf4_file, ds)
+        var = _dataset_to_variable(string_attributes_hdf5_file, ds)
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
 class TestExtractAttributes:
-    def test_string_attribute(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_string_attribute(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
 
-    def test_root_attribute(self, root_attributes_netcdf4_file):
-        f = h5py.File(root_attributes_netcdf4_file)
+    def test_root_attribute(self, root_attributes_hdf5_file):
+        f = h5py.File(root_attributes_hdf5_file)
         attrs = _extract_attrs(f)
         assert attrs["attribute_name"] == "attribute_name"
 
-    def test_multiple_attributes(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_multiple_attributes(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert len(attrs.keys()) == 2
@@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
         variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
         assert len(variables) == 3
 
-    def test_groups_not_implemented(self, group_netcdf4_file):
+    def test_groups_not_implemented(self, group_hdf5_file):
         with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_netcdf4_file)
+            virtual_vars_from_hdf(group_hdf5_file)
 
-    def test_drop_variables(self, multiple_datasets_netcdf4_file):
-        variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"])
+    def test_drop_variables(self, multiple_datasets_hdf5_file):
+        variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"])
         assert "data2" not in variables.keys()
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 960bcf2c..99b3af48 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -39,12 +39,12 @@ def test_zstd(self):
 
 
 class TestCodecsFromDataSet:
-    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
-        f = h5py.File(filter_encoded_netcdf4_file)
+    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
+        f = h5py.File(filter_encoded_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_netcdf4_file, "rb") as file:
+        with open(filter_encoded_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
@@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
 
 
 class TestCFCodecFromDataset:
-    def test_no_cf_convention(self, filter_encoded_netcdf4_file):
-        f = h5py.File(filter_encoded_netcdf4_file)
+    def test_no_cf_convention(self, filter_encoded_hdf5_file):
+        f = h5py.File(filter_encoded_hdf5_file)
         ds = f["data"]
         cf_codec = cfcodec_from_dataset(ds)
         assert cf_codec is None
@@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file):
         assert cf_codec["codec"].dtype == "<f8"
         assert cf_codec["codec"].astype == "<i2"
 
-    def test_cf_add_offset(self, add_offset_netcdf4_file):
-        f = h5py.File(add_offset_netcdf4_file)
+    def test_cf_add_offset(self, add_offset_hdf5_file):
+        f = h5py.File(add_offset_hdf5_file)
         ds = f["data"]
         cf_codec = cfcodec_from_dataset(ds)
         assert cf_codec["target_dtype"] == np.dtype(np.float64)
@@ -78,13 +78,13 @@ def test_cf_add_offset(self, add_offset_netcdf4_file):
         assert cf_codec["codec"].dtype == "<f8"
 
     def test_cf_codec_decoding_offset(
-        self, add_offset_netcdf4_file, np_uncompressed_int16
+        self, add_offset_hdf5_file, np_uncompressed_int16
     ):
-        f = h5py.File(add_offset_netcdf4_file)
+        f = h5py.File(add_offset_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         cfcodec = cfcodec_from_dataset(ds)
-        with open(add_offset_netcdf4_file, "rb") as file:
+        with open(add_offset_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = cfcodec["codec"].decode(bytes_read)
@@ -92,13 +92,13 @@ def test_cf_codec_decoding_offset(
             assert decoded.dtype == np.float64
 
     def test_cf_codec_decoding_scale_offset(
-        self, scale_add_offset_netcdf4_file, np_uncompressed_int16
+        self, scale_add_offset_hdf5_file, np_uncompressed_int16
     ):
-        f = h5py.File(scale_add_offset_netcdf4_file)
+        f = h5py.File(scale_add_offset_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         cfcodec = cfcodec_from_dataset(ds)
-        with open(scale_add_offset_netcdf4_file, "rb") as file:
+        with open(scale_add_offset_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = cfcodec["codec"].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 882dea31..4fc7bd3e 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,11 +7,11 @@
 
 class TestIntegration:
     def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_xarray_h5netcdf_file
+        self, tmpdir, filter_encoded_roundtrip_hdf5_file
     ):
-        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True)
+        ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_h5netcdf_file,
+            filter_encoded_roundtrip_hdf5_file,
             loadable_variables=["time"],
             cftime_variables=["time"],
         )
@@ -21,9 +21,9 @@ def test_filters_h5netcdf_roundtrip(
         xrt.assert_allclose(ds, roundtrip)
 
     def test_filters_netcdf4_roundtrip(
-        self, tmpdir, filter_encoded_xarray_netcdf4_file
+        self, tmpdir, filter_encoded_roundtrip_netcdf4_file
     ):
-        filepath = filter_encoded_xarray_netcdf4_file["filepath"]
+        filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
         ds = xr.open_dataset(filepath)
         vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
         kerchunk_file = f"{tmpdir}/kerchunk.json"

From c9dd0d9cbcc638a4f9d116e3b36a86de997140b6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 22:14:26 -0600
Subject: [PATCH 51/79] Handle array add_offset property for compressed data.

---
 virtualizarr/readers/hdf_filters.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 1a3c2220..5b35d8ff 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
-        mapping["add_offset"] = attributes["add_offset"]
+        try:
+            offset = attributes["add_offset"][0]
+        except IndexError:
+            offset = attributes["add_offset"]
+        mapping["add_offset"] = offset
     else:
         mapping["add_offset"] = 0
     if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:

From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 1 Jul 2024 16:57:11 -0600
Subject: [PATCH 52/79] Include h5py shuffle filter.

---
 virtualizarr/readers/hdf_filters.py            | 18 ++++++++++++++----
 .../tests/test_readers/test_hdf_filters.py     | 11 ++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 5b35d8ff..a60dd56a 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -36,6 +36,14 @@ class ZstdProperties(BaseModel):
     level: int
 
 
+class ShuffleProperties(BaseModel):
+    elementsize: int
+
+
+class ZlibProperties(BaseModel):
+    level: int
+
+
 class CFCodec(TypedDict):
     target_dtype: np.dtype
     codec: Codec
@@ -56,9 +64,13 @@ def _filter_to_codec(
             id = _non_standard_filters[id_str]
         else:
             id = id_str
-        conf["id"] = id  # type: ignore[assignment]
         if id == "zlib":
-            conf["level"] = filter_properties  # type: ignore[assignment]
+            zlib_props = ZlibProperties(level=filter_properties)
+            conf = zlib_props.model_dump()  # type: ignore[assignment]
+        if id == "shuffle" and isinstance(filter_properties, tuple):
+            shuffle_props = ShuffleProperties(elementsize=filter_properties[0])
+            conf = shuffle_props.model_dump()  # type: ignore[assignment]
+        conf["id"] = id  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
@@ -77,9 +89,7 @@ def _filter_to_codec(
         if id == "zstd" and isinstance(filter_properties, tuple):
             zstd_props = ZstdProperties(level=filter_properties[0])
             conf = zstd_props.model_dump()  # type: ignore[assignment]
-
         conf["id"] = id
-
     codec = registry.get_codec(conf)
     return codec
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 99b3af48..efaad781 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -37,6 +37,12 @@ def test_zstd(self):
         expected_config = {"id": "zstd", "level": 5}
         assert codec.get_config() == expected_config
 
+    def test_shuffle(self):
+        codec = _filter_to_codec("shuffle", (7,))
+        assert isinstance(codec, numcodecs.shuffle.Shuffle)
+        expected_config = {"id": "shuffle", "elementsize": 7}
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
     def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
@@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
-            assert decoded == np_uncompressed.tobytes()
+            if isinstance(decoded, np.ndarray):
+                assert decoded.tobytes() == np_uncompressed.tobytes()
+            else:
+                assert decoded == np_uncompressed.tobytes()
 
 
 class TestCFCodecFromDataset:

From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 1 Jul 2024 17:03:46 -0600
Subject: [PATCH 53/79] Make ScaleAndOffset codec last in filters list.

---
 virtualizarr/readers/hdf.py                   |  2 +-
 virtualizarr/tests/test_readers/conftest.py   | 36 ++++++++++++++++++-
 .../test_readers/test_hdf_integration.py      | 10 ++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d683f693..f3337c04 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
         cfcodec = cfcodec_from_dataset(dataset)
         attrs = _extract_attrs(dataset)
         if cfcodec:
-            codecs.append(cfcodec["codec"])
+            codecs.insert(0, cfcodec["codec"])
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 539b2fbb..afc0beea 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -137,7 +137,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
+@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"])
 def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
         f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2())
     if request.param == "zstd":
         f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2))
+    if request.param == "shuffle":
+        f.create_dataset(name="data", data=np_uncompressed, shuffle=True)
 
     return filepath
 
@@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir):
         filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}}
     )
     return filepath
+
+
+@pytest.fixture(params=["gzip", "zlib"])
+def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
+    x = np.arange(100)
+    y = np.arange(100)
+    temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
+    ds = xr.Dataset(
+        {"temperature": (["x", "y"], temperature)},
+        coords={"x": np.arange(100), "y": np.arange(100)},
+    )
+    encoding = {
+        "temperature": {
+            "dtype": "int16",
+            "scale_factor": 0.1,
+            "add_offset": 273.15,
+        }
+    }
+    if request.param == "gzip":
+        encoding["temperature"]["compression"] = "gzip"
+        encoding["temperature"]["compression_opts"] = 7
+
+    if request.param == "zlib":
+        encoding["temperature"]["zlib"] = True
+        encoding["temperature"]["complevel"] = 9
+
+    from random import randint
+
+    filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 4fc7bd3e..dd8d6c3b 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,3 +1,4 @@
+import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip(
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
+
+    @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic")
+    def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
+        ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
+        vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)
+        kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
+        xrt.assert_allclose(ds, roundtrip)

From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 2 Jul 2024 13:23:23 -0600
Subject: [PATCH 54/79] Apply ScaleAndOffset codec to _FillValue since it's
 value is now downstream.

---
 virtualizarr/readers/hdf.py                             | 4 +++-
 virtualizarr/tests/test_readers/conftest.py             | 7 ++++++-
 virtualizarr/tests/test_readers/test_hdf_integration.py | 2 --
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index f3337c04..6197067f 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
+            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
         else:
             dtype = dataset.dtype
+            fill_value = dataset.fillvalue
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,
             compressor=None,
             dtype=dtype,
-            fill_value=dataset.fillvalue,
+            fill_value=fill_value,
             filters=filters,
             order="C",
             shape=dataset.shape,
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index afc0beea..ec4132ba 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir):
 def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
     x = np.arange(100)
     y = np.arange(100)
+    fill_value = np.int16(-9999)
     temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
+    temperature[0][0] = fill_value
     ds = xr.Dataset(
         {"temperature": (["x", "y"], temperature)},
         coords={"x": np.arange(100), "y": np.arange(100)},
@@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
             "dtype": "int16",
             "scale_factor": 0.1,
             "add_offset": 273.15,
-        }
+            "_FillValue": fill_value,
+        },
+        "x": {"_FillValue": fill_value},
+        "y": {"_FillValue": fill_value},
     }
     if request.param == "gzip":
         encoding["temperature"]["compression"] = "gzip"
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index dd8d6c3b..5cf3f79d 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,4 +1,3 @@
-import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip(
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
 
-    @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic")
     def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
         ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
         vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)

From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 2 Jul 2024 15:12:04 -0600
Subject: [PATCH 55/79] Coerce scale and add_offset values to native float for
 JSON serialization.

---
 virtualizarr/readers/hdf_filters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index a60dd56a..ae232fec 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
             scale_factor = attributes["scale_factor"][0]
         except IndexError:
             scale_factor = attributes["scale_factor"]
-        mapping["scale_factor"] = 1 / scale_factor
+        mapping["scale_factor"] = float(1 / scale_factor)
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
@@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
             offset = attributes["add_offset"][0]
         except IndexError:
             offset = attributes["add_offset"]
-        mapping["add_offset"] = offset
+        mapping["add_offset"] = float(offset)
     else:
         mapping["add_offset"] = 0
     if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:

From f4f9c8f643c34cbabe71faa6b439853499d4464a Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 8 Aug 2024 19:36:39 -0600
Subject: [PATCH 56/79] Temporarily xfail integration tests for main

---
 virtualizarr/tests/test_readers/test_hdf_integration.py | 2 ++
 virtualizarr/tests/test_xarray.py                       | 1 +
 2 files changed, 3 insertions(+)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 5cf3f79d..9d5d2a26 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,3 +1,4 @@
+import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -6,6 +7,7 @@
 
 
 class TestIntegration:
+    @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue")
     def test_filters_h5netcdf_roundtrip(
         self, tmpdir, filter_encoded_roundtrip_hdf5_file
     ):
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index d5b5f360..8942f4ad 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -427,6 +427,7 @@ def test_open_virtual_dataset_passes_expected_args(
         }
         mock_determine_filetype.assert_called_once_with(**filetype_args)
 
+    @pytest.mark.xfail(reason="Requires discussion on validity of empty datasets")
     def test_open_dataset_with_empty(self, hdf5_empty, tmpdir):
         vds = open_virtual_dataset(hdf5_empty)
         assert vds.empty.dims == ()

From a9e59f2bff085bf23d3d99849eaf68adca49fb80 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 8 Oct 2024 15:13:37 -0400
Subject: [PATCH 57/79] Remove pydantic dependency as per pull/210.

---
 virtualizarr/readers/hdf_filters.py           | 35 +++++++++----------
 .../tests/test_readers/test_hdf_filters.py    |  2 +-
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index ae232fec..349d10dc 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,3 +1,4 @@
+import dataclasses
 from typing import List, Tuple, TypedDict, Union
 
 import h5py
@@ -6,7 +7,6 @@
 import numpy as np
 from numcodecs.abc import Codec
 from numcodecs.fixedscaleoffset import FixedScaleOffset
-from pydantic import BaseModel, field_validator
 from xarray.coding.variables import _choose_float_dtype
 
 _non_standard_filters = {
@@ -17,30 +17,33 @@
 _hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"}
 
 
-class BloscProperties(BaseModel):
+@dataclasses.dataclass
+class BloscProperties:
     blocksize: int
     clevel: int
     shuffle: int
     cname: str
 
-    @field_validator("cname", mode="before")
-    def get_cname_from_code(cls, v):
+    def __post_init__(self):
         blosc_compressor_codes = {
             value: key
             for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
         }
-        return blosc_compressor_codes[v]
+        self.cname = blosc_compressor_codes[self.cname]
 
 
-class ZstdProperties(BaseModel):
+@dataclasses.dataclass
+class ZstdProperties:
     level: int
 
 
-class ShuffleProperties(BaseModel):
+@dataclasses.dataclass
+class ShuffleProperties:
     elementsize: int
 
 
-class ZlibProperties(BaseModel):
+@dataclasses.dataclass
+class ZlibProperties:
     level: int
 
 
@@ -66,10 +69,10 @@ def _filter_to_codec(
             id = id_str
         if id == "zlib":
             zlib_props = ZlibProperties(level=filter_properties)
-            conf = zlib_props.model_dump()  # type: ignore[assignment]
+            conf = dataclasses.asdict(zlib_props)
         if id == "shuffle" and isinstance(filter_properties, tuple):
             shuffle_props = ShuffleProperties(elementsize=filter_properties[0])
-            conf = shuffle_props.model_dump()  # type: ignore[assignment]
+            conf = dataclasses.asdict(shuffle_props)
         conf["id"] = id  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
@@ -77,18 +80,14 @@ def _filter_to_codec(
         if id in _hdf5plugin_imagecodecs.keys():
             id = _hdf5plugin_imagecodecs[id]
         if id == "blosc" and isinstance(filter_properties, tuple):
+            blosc_fields = [field.name for field in dataclasses.fields(BloscProperties)]
             blosc_props = BloscProperties(
-                **{
-                    k: v
-                    for k, v in zip(
-                        BloscProperties.model_fields.keys(), filter_properties[-4:]
-                    )
-                }
+                **{k: v for k, v in zip(blosc_fields, filter_properties[-4:])}
             )
-            conf = blosc_props.model_dump()  # type: ignore[assignment]
+            conf = dataclasses.asdict(blosc_props)
         if id == "zstd" and isinstance(filter_properties, tuple):
             zstd_props = ZstdProperties(level=filter_properties[0])
-            conf = zstd_props.model_dump()  # type: ignore[assignment]
+            conf = dataclasses.asdict(zstd_props)
         conf["id"] = id
     codec = registry.get_codec(conf)
     return codec
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index efaad781..b2581c58 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -34,7 +34,7 @@ def test_blosc(self):
     def test_zstd(self):
         codec = _filter_to_codec("32015", (5,))
         assert isinstance(codec, numcodecs.zstd.Zstd)
-        expected_config = {"id": "zstd", "level": 5}
+        expected_config = {"id": "zstd", "level": 5, "checksum": False}
         assert codec.get_config() == expected_config
 
     def test_shuffle(self):

From 2b33bc2a46c3742e886151eb064b439efde2e8f2 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 8 Oct 2024 15:15:12 -0400
Subject: [PATCH 58/79] Update test for new kerchunk reader module location.

---
 virtualizarr/tests/test_readers/test_hdf_integration.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 9d5d2a26..5973a8b9 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -3,7 +3,7 @@
 import xarray.testing as xrt
 
 import virtualizarr
-from virtualizarr.kerchunk import FileType
+from virtualizarr.readers.kerchunk import FileType
 
 
 class TestIntegration:

From a57ae9e7578c3e6167f8fc423af6a2c11891a8ab Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 11:14:20 -0400
Subject: [PATCH 59/79] Fix branch typing errors.

---
 virtualizarr/readers/hdf.py                         | 6 +++---
 virtualizarr/readers/hdf_filters.py                 | 6 +++---
 virtualizarr/tests/test_readers/conftest.py         | 4 ++--
 virtualizarr/tests/test_readers/test_hdf.py         | 2 +-
 virtualizarr/tests/test_readers/test_hdf_filters.py | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 42127ba3..bcf896a8 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,7 +1,7 @@
 import math
-from typing import List, Mapping, Optional, Union
+from typing import Dict, List, Optional, Union
 
-import h5py
+import h5py  # type: ignore
 import numpy as np
 import xarray as xr
 
@@ -209,7 +209,7 @@ def virtual_vars_from_hdf(
     reader_options: Optional[dict] = {
         "storage_options": {"key": "", "secret": "", "anon": True}
     },
-) -> Mapping[str, xr.Variable]:
+) -> Dict[str, xr.Variable]:
     if drop_variables is None:
         drop_variables = []
     open_file = _fsspec_openfile_from_filepath(
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 349d10dc..aedf89b3 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,8 +1,8 @@
 import dataclasses
 from typing import List, Tuple, TypedDict, Union
 
-import h5py
-import hdf5plugin
+import h5py  # type: ignore
+import hdf5plugin  # type: ignore
 import numcodecs.registry as registry
 import numpy as np
 from numcodecs.abc import Codec
@@ -68,7 +68,7 @@ def _filter_to_codec(
         else:
             id = id_str
         if id == "zlib":
-            zlib_props = ZlibProperties(level=filter_properties)
+            zlib_props = ZlibProperties(level=filter_properties)  # type: ignore
             conf = dataclasses.asdict(zlib_props)
         if id == "shuffle" and isinstance(filter_properties, tuple):
             shuffle_props = ShuffleProperties(elementsize=filter_properties[0])
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index ec4132ba..c47c26c9 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -1,5 +1,5 @@
-import h5py
-import hdf5plugin
+import h5py  # type: ignore
+import hdf5plugin  # type: ignore
 import numpy as np
 import pytest
 import xarray as xr
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 1fb0f6ee..32970a33 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -1,4 +1,4 @@
-import h5py
+import h5py  # type: ignore
 import pytest
 
 from virtualizarr.readers.hdf import (
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index b2581c58..c05a7eeb 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,4 +1,4 @@
-import h5py
+import h5py  # type: ignore
 import imagecodecs
 import numcodecs
 import numpy as np

From e21fc6976d8c42633c8c6ee413855aee9ddb997f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 12:56:45 -0400
Subject: [PATCH 60/79] Re-include automatic file type determination.

---
 virtualizarr/backend.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index fb9a452b..61fa5b96 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -162,6 +162,13 @@ def open_virtual_dataset(
         if reader_options is None:
             reader_options = {}
 
+        from virtualizarr.readers.kerchunk import _automatically_determine_filetype
+
+        if filetype is None:
+            filetype = _automatically_determine_filetype(
+                filepath=filepath, reader_options=reader_options
+            )
+        filetype = FileType(filetype)
         if filetype == FileType.hdf5:
             from virtualizarr.readers.hdf import (
                 attrs_from_root_group,

From df69a12db513df051d82cde43fb3ce958b0d02ba Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 16:27:56 -0400
Subject: [PATCH 61/79] Handle various hdf flavors of _FillValue storage.

---
 virtualizarr/readers/hdf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index bcf896a8..9a0d6307 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -31,7 +31,6 @@ def _dataset_chunk_manifest(
         A Virtualizarr ChunkManifest
     """
     dsid = dataset.id
-
     if dataset.chunks is None:
         if dsid.get_offset() is None:
             return None
@@ -49,7 +48,6 @@ def _dataset_chunk_manifest(
         num_chunks = dsid.get_num_chunks()
         if num_chunks == 0:
             raise ValueError("The dataset is chunked but contains no chunks")
-
         shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
         paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
         offsets = np.empty(shape, dtype=np.uint64)
@@ -184,6 +182,10 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
         else:
             dtype = dataset.dtype
             fill_value = dataset.fillvalue
+        if isinstance(fill_value, np.ndarray):
+            fill_value = fill_value[0]
+        if np.isnan(fill_value):
+            fill_value = float("nan")
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,

From 169337c7d4c2bd5764f10fd038d63a4bd4d5fb94 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 16:28:57 -0400
Subject: [PATCH 62/79] Include loadable variables in drop variables list.

---
 virtualizarr/backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index 61fa5b96..0cc7b14a 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -177,7 +177,7 @@ def open_virtual_dataset(
 
             virtual_vars = virtual_vars_from_hdf(
                 path=filepath,
-                drop_variables=drop_variables,
+                drop_variables=drop_variables + loadable_variables,
                 reader_options=reader_options,
             )
             ds_attrs = attrs_from_root_group(

From bdcbfbf70bada38cfa196e748113e49e6b74f3e9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 16:31:04 -0400
Subject: [PATCH 63/79] Mock readers.hdf.virtual_vars_from_hdf to verify option
 passing.

---
 virtualizarr/tests/test_backend.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index e42ad9ac..bb68c186 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -313,19 +313,18 @@ def test_group_kwarg(self, hdf5_groups_file):
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
 
-    @patch("virtualizarr.readers.kerchunk.read_kerchunk_references_from_file")
+    @patch("virtualizarr.readers.hdf.virtual_vars_from_hdf")
     def test_open_virtual_dataset_passes_expected_args(
-        self, mock_read_kerchunk, netcdf4_file
+        self, mock_read_hdf, netcdf4_file
     ):
         reader_options = {"option1": "value1", "option2": "value2"}
         open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
         args = {
-            "filepath": netcdf4_file,
-            "filetype": None,
-            "group": None,
+            "path": netcdf4_file,
+            "drop_variables": [],
             "reader_options": reader_options,
         }
-        mock_read_kerchunk.assert_called_once_with(**args)
+        mock_read_hdf.assert_called_once_with(**args)
 
     def test_open_dataset_with_empty(self, hdf5_empty, tmpdir):
         vds = open_virtual_dataset(hdf5_empty)

From 77f1689aee1e9288a518ae78d6066b9a7435e62f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 9 Oct 2024 16:41:10 -0400
Subject: [PATCH 64/79] Convert numpy _FillValue to native Python for
 serialization support.

---
 virtualizarr/readers/hdf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 9a0d6307..8d2c44ce 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -186,6 +186,8 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
             fill_value = fill_value[0]
         if np.isnan(fill_value):
             fill_value = float("nan")
+        if isinstance(fill_value, np.generic):
+            fill_value = fill_value.item()
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,

From 42c653ad0c0d098d1b652c65df242a51363e9867 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 10 Oct 2024 19:03:43 -0400
Subject: [PATCH 65/79] Support groups with HDF5 reader.

---
 virtualizarr/backend.py                     |  3 ++-
 virtualizarr/readers/hdf.py                 | 22 +++++++++++++++++----
 virtualizarr/tests/test_backend.py          |  5 +++--
 virtualizarr/tests/test_readers/conftest.py | 15 +++++++++++++-
 virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++++++++---
 5 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index 0cc7b14a..076fc559 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -177,11 +177,12 @@ def open_virtual_dataset(
 
             virtual_vars = virtual_vars_from_hdf(
                 path=filepath,
+                group=group,
                 drop_variables=drop_variables + loadable_variables,
                 reader_options=reader_options,
             )
             ds_attrs = attrs_from_root_group(
-                path=filepath, reader_options=reader_options
+                path=filepath, reader_options=reader_options, group=group
             )
             coord_names = ds_attrs.pop("coordinates", [])
         # we currently read every other filetype using kerchunks various file format backends
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 8d2c44ce..8db6d781 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -209,6 +209,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
 
 def virtual_vars_from_hdf(
     path: str,
+    group: Optional[str] = None,
     drop_variables: Optional[List[str]] = None,
     reader_options: Optional[dict] = {
         "storage_options": {"key": "", "secret": "", "anon": True}
@@ -220,11 +221,17 @@ def virtual_vars_from_hdf(
         filepath=path, reader_options=reader_options
     )
     f = h5py.File(open_file, mode="r")
+    if group:
+        g = f[group]
+        if not isinstance(g, h5py.Group):
+            raise ValueError("The provided group is not an HDF group")
+    else:
+        g = f
     variables = {}
-    for key in f.keys():
+    for key in g.keys():
         if key not in drop_variables:
-            if isinstance(f[key], h5py.Dataset):
-                variable = _dataset_to_variable(path, f[key])
+            if isinstance(g[key], h5py.Dataset):
+                variable = _dataset_to_variable(path, g[key])
                 if variable is not None:
                     variables[key] = variable
             else:
@@ -235,6 +242,7 @@ def virtual_vars_from_hdf(
 
 def attrs_from_root_group(
     path: str,
+    group: Optional[str] = None,
     reader_options: Optional[dict] = {
         "storage_options": {"key": "", "secret": "", "anon": True}
     },
@@ -243,5 +251,11 @@ def attrs_from_root_group(
         filepath=path, reader_options=reader_options
     )
     f = h5py.File(open_file, mode="r")
-    attrs = _extract_attrs(f)
+    if group:
+        g = f[group]
+        if not isinstance(g, h5py.Group):
+            raise ValueError("The provided group is not an HDF group")
+    else:
+        g = f
+    attrs = _extract_attrs(g)
     return attrs
diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index bb68c186..3feab262 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -293,9 +293,9 @@ def test_explicit_filetype(self, netcdf4_file):
             open_virtual_dataset(netcdf4_file, filetype="grib")
 
     def test_group_kwarg(self, hdf5_groups_file):
-        with pytest.raises(ValueError, match="Multiple HDF Groups found"):
+        with pytest.raises(NotImplementedError, match="Nested groups"):
             open_virtual_dataset(hdf5_groups_file)
-        with pytest.raises(ValueError, match="not found in"):
+        with pytest.raises(KeyError, match="doesn't exist"):
             open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
 
         vars_to_load = ["air", "time"]
@@ -321,6 +321,7 @@ def test_open_virtual_dataset_passes_expected_args(
         open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
         args = {
             "path": netcdf4_file,
+            "group": None,
             "drop_variables": [],
             "reader_options": reader_options,
         }
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index c47c26c9..b0b7c41f 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -118,7 +118,20 @@ def root_attributes_hdf5_file(tmpdir):
 def group_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/group.nc"
     f = h5py.File(filepath, "w")
-    f.create_group("group")
+    g = f.create_group("group")
+    data = np.random.random((10, 10))
+    g.create_dataset("data", data=data)
+    return filepath
+
+
+@pytest.fixture
+def nested_group_hdf5_file(tmpdir):
+    filepath = f"{tmpdir}/nested_group.nc"
+    f = h5py.File(filepath, "w")
+    g = f.create_group("group")
+    data = np.random.random((10, 10))
+    g.create_dataset("data", data=data)
+    g.create_group("nested_group")
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 32970a33..cc9e2dff 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -113,10 +113,20 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
         variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
         assert len(variables) == 3
 
-    def test_groups_not_implemented(self, group_hdf5_file):
+    def test_nested_groups_not_implemented(self, nested_group_hdf5_file):
         with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_hdf5_file)
+            virtual_vars_from_hdf(path=nested_group_hdf5_file, group="group")
 
     def test_drop_variables(self, multiple_datasets_hdf5_file):
-        variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"])
+        variables = virtual_vars_from_hdf(
+            path=multiple_datasets_hdf5_file, drop_variables=["data2"]
+        )
         assert "data2" not in variables.keys()
+
+    def test_dataset_in_group(self, group_hdf5_file):
+        variables = virtual_vars_from_hdf(path=group_hdf5_file, group="group")
+        assert len(variables) == 1
+
+    def test_non_group_error(self, group_hdf5_file):
+        with pytest.raises(ValueError):
+            virtual_vars_from_hdf(path=group_hdf5_file, group="group/data")

From 9c86e0d2c0f8135b0a53cbf5313bfa11cc2a222e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 17 Oct 2024 15:11:57 -0400
Subject: [PATCH 66/79] Handle empty variables with a shape.

---
 virtualizarr/readers/hdf.py | 64 ++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 8db6d781..65b97eeb 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -167,43 +167,43 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
 
+    chunks = dataset.chunks if dataset.chunks else dataset.shape
+    codecs = codecs_from_dataset(dataset)
+    cfcodec = cfcodec_from_dataset(dataset)
+    attrs = _extract_attrs(dataset)
+    if cfcodec:
+        codecs.insert(0, cfcodec["codec"])
+        dtype = cfcodec["target_dtype"]
+        attrs.pop("scale_factor", None)
+        attrs.pop("add_offset", None)
+        fill_value = cfcodec["codec"].decode(dataset.fillvalue)
+    else:
+        dtype = dataset.dtype
+        fill_value = dataset.fillvalue
+    if isinstance(fill_value, np.ndarray):
+        fill_value = fill_value[0]
+    if np.isnan(fill_value):
+        fill_value = float("nan")
+    if isinstance(fill_value, np.generic):
+        fill_value = fill_value.item()
+    filters = [codec.get_config() for codec in codecs]
+    zarray = ZArray(
+        chunks=chunks,
+        compressor=None,
+        dtype=dtype,
+        fill_value=fill_value,
+        filters=filters,
+        order="C",
+        shape=dataset.shape,
+        zarr_format=2,
+    )
+    dims = _dataset_dims(dataset)
     manifest = _dataset_chunk_manifest(path, dataset)
     if manifest:
-        chunks = dataset.chunks if dataset.chunks else dataset.shape
-        codecs = codecs_from_dataset(dataset)
-        cfcodec = cfcodec_from_dataset(dataset)
-        attrs = _extract_attrs(dataset)
-        if cfcodec:
-            codecs.insert(0, cfcodec["codec"])
-            dtype = cfcodec["target_dtype"]
-            attrs.pop("scale_factor", None)
-            attrs.pop("add_offset", None)
-            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
-        else:
-            dtype = dataset.dtype
-            fill_value = dataset.fillvalue
-        if isinstance(fill_value, np.ndarray):
-            fill_value = fill_value[0]
-        if np.isnan(fill_value):
-            fill_value = float("nan")
-        if isinstance(fill_value, np.generic):
-            fill_value = fill_value.item()
-        filters = [codec.get_config() for codec in codecs]
-        zarray = ZArray(
-            chunks=chunks,
-            compressor=None,
-            dtype=dtype,
-            fill_value=fill_value,
-            filters=filters,
-            order="C",
-            shape=dataset.shape,
-            zarr_format=2,
-        )
         marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
-        dims = _dataset_dims(dataset)
         variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     else:
-        variable = None
+        variable = xr.Variable(data=np.empty(dataset.shape), dims=dims, attrs=attrs)
     return variable
 
 

From 15897765e61454331dd1fa4a9d151c8673dbb179 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 18:31:10 -0400
Subject: [PATCH 67/79] Import top-level version of xarray classes.

---
 virtualizarr/readers/hdf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 30dd402f..dd67475e 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -3,8 +3,7 @@
 
 import h5py  # type: ignore
 import numpy as np
-from xarray import Dataset, Variable
-from xarray.core.indexes import Index
+from xarray import Dataset, Index, Variable
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
 from virtualizarr.readers.common import (

From 772c5800b30507dac37c083b0afa119442e995e3 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 20:59:35 -0400
Subject: [PATCH 68/79] Add option to explicitly specify use of an experimental
 hdf backend.

---
 virtualizarr/backend.py                       | 13 ++-
 virtualizarr/tests/test_backend.py            | 85 +++++++++++++------
 virtualizarr/tests/test_integration.py        | 30 +++++--
 .../test_readers/test_hdf_integration.py      | 20 +++--
 virtualizarr/tests/test_xarray.py             | 36 +++++---
 5 files changed, 126 insertions(+), 58 deletions(-)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index 19aebfdd..3ab76d1f 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -13,12 +13,13 @@
 from virtualizarr.readers import (
     DMRPPVirtualBackend,
     FITSVirtualBackend,
-    HDFVirtualBackend,
+    HDF5VirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
     ZarrV3VirtualBackend,
 )
+from virtualizarr.readers.common import VirtualBackend
 from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
 
 # TODO add entrypoint to allow external libraries to add to this mapping
@@ -26,9 +27,9 @@
     "kerchunk": KerchunkVirtualBackend,
     "zarr_v3": ZarrV3VirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
-    "hdf5": HDFVirtualBackend,
-    "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
+    "hdf5": HDF5VirtualBackend,
+    "netcdf4": HDF5VirtualBackend,  # note this is the same as for hdf5
     "netcdf3": NetCDF3VirtualBackend,
     "tiff": TIFFVirtualBackend,
     "fits": FITSVirtualBackend,
@@ -113,6 +114,7 @@ def open_virtual_dataset(
     indexes: Mapping[str, Index] | None = None,
     virtual_array_class=ManifestArray,
     reader_options: Optional[dict] = None,
+    backend: Optional[VirtualBackend] = None,
 ) -> Dataset:
     """
     Open a file or store as an xarray Dataset wrapping virtualized zarr arrays.
@@ -182,7 +184,10 @@ def open_virtual_dataset(
             filepath=filepath, reader_options=reader_options
         )
 
-    backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
+    if backend:
+        backend_cls = backend
+    else:
+        backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
 
     if backend_cls is None:
         raise NotImplementedError(f"Unsupported file type: {filetype.name}")
diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index 43a6bbd8..2368848a 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -11,6 +11,7 @@
 from virtualizarr import open_virtual_dataset
 from virtualizarr.backend import FileType, automatically_determine_filetype
 from virtualizarr.manifests import ManifestArray
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     has_astropy,
     has_tifffile,
@@ -82,14 +83,15 @@ def test_FileType():
 
 
 @requires_kerchunk
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestOpenVirtualDatasetIndexes:
-    def test_no_indexes(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_no_indexes(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert vds.indexes == {}
 
-    def test_create_default_indexes(self, netcdf4_file):
+    def test_create_default_indexes(self, netcdf4_file, hdf_backend):
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
-            vds = open_virtual_dataset(netcdf4_file, indexes=None)
+            vds = open_virtual_dataset(netcdf4_file, indexes=None, backend=hdf_backend)
         ds = open_dataset(netcdf4_file, decode_times=True)
 
         # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
@@ -113,7 +115,8 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
 
 
 @requires_kerchunk
-def test_cftime_index(tmpdir):
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+def test_cftime_index(tmpdir, hdf_backend):
     """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
     # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
     ds = xr.Dataset(
@@ -129,7 +132,10 @@ def test_cftime_index(tmpdir):
     )
     ds.to_netcdf(f"{tmpdir}/tmp.nc")
     vds = open_virtual_dataset(
-        f"{tmpdir}/tmp.nc", loadable_variables=["time", "lat", "lon"], indexes={}
+        f"{tmpdir}/tmp.nc",
+        loadable_variables=["time", "lat", "lon"],
+        indexes={},
+        backend=hdf_backend,
     )
     # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
     assert index_mappings_equal(vds.xindexes, ds.xindexes)
@@ -139,15 +145,16 @@ def test_cftime_index(tmpdir):
 
 
 @requires_kerchunk
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestOpenVirtualDatasetAttrs:
-    def test_drop_array_dimensions(self, netcdf4_file):
+    def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #150
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert "_ARRAY_DIMENSIONS" not in vds["air"].attrs
 
-    def test_coordinate_variable_attrs_preserved(self, netcdf4_file):
+    def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #155
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert vds["lat"].attrs == {
             "standard_name": "latitude",
             "long_name": "Latitude",
@@ -165,7 +172,8 @@ class TestReadFromS3:
     @pytest.mark.parametrize(
         "indexes", [None, {}], ids=["None index", "empty dict index"]
     )
-    def test_anon_read_s3(self, filetype, indexes):
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_anon_read_s3(self, filetype, indexes, hdf_backend):
         """Parameterized tests for empty vs supplied indexes and filetypes."""
         # TODO: Switch away from this s3 url after minIO is implemented.
         fpath = "s3://carbonplan-share/virtualizarr/local.nc"
@@ -174,6 +182,7 @@ def test_anon_read_s3(self, filetype, indexes):
             filetype=filetype,
             indexes=indexes,
             reader_options={"storage_options": {"anon": True}},
+            backend=hdf_backend,
         )
 
         assert vds.dims == {"time": 2920, "lat": 25, "lon": 53}
@@ -182,6 +191,7 @@ def test_anon_read_s3(self, filetype, indexes):
 
 
 @network
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestReadFromURL:
     @pytest.mark.parametrize(
         "filetype, url",
@@ -228,10 +238,15 @@ class TestReadFromURL:
             ),
         ],
     )
-    def test_read_from_url(self, filetype, url):
+    def test_read_from_url(self, hdf_backend, filetype, url):
         if filetype in ["grib", "jpg", "hdf4"]:
             with pytest.raises(NotImplementedError):
-                vds = open_virtual_dataset(url, reader_options={}, indexes={})
+                vds = open_virtual_dataset(
+                    url,
+                    reader_options={},
+                    indexes={},
+                    backend=hdf_backend,
+                )
         elif filetype == "hdf5":
             vds = open_virtual_dataset(
                 url,
@@ -239,13 +254,14 @@ def test_read_from_url(self, filetype, url):
                 drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
                 indexes={},
                 reader_options={},
+                backend=hdf_backend,
             )
             assert isinstance(vds, xr.Dataset)
         else:
-            vds = open_virtual_dataset(url, indexes={})
+            vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend)
             assert isinstance(vds, xr.Dataset)
 
-    def test_virtualizarr_vs_local_nisar(self):
+    def test_virtualizarr_vs_local_nisar(self, hdf_backend):
         import fsspec
 
         # Open group directly from locally cached file with xarray
@@ -268,6 +284,7 @@ def test_virtualizarr_vs_local_nisar(self):
             group=hdf_group,
             indexes={},
             drop_variables=["listOfCovarianceTerms", "listOfPolarizations"],
+            backend=hdf_backend,
         )
         tmpref = "/tmp/cmip6.json"
         vds.virtualize.to_kerchunk(tmpref, format="json")
@@ -279,10 +296,14 @@ def test_virtualizarr_vs_local_nisar(self):
 
 @requires_kerchunk
 class TestLoadVirtualDataset:
-    def test_loadable_variables(self, netcdf4_file):
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_loadable_variables(self, netcdf4_file, hdf_backend):
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
-            netcdf4_file, loadable_variables=vars_to_load, indexes={}
+            netcdf4_file,
+            loadable_variables=vars_to_load,
+            indexes={},
+            backend=hdf_backend,
         )
 
         for name in vds.variables:
@@ -304,11 +325,20 @@ def test_explicit_filetype(self, netcdf4_file):
         with pytest.raises(NotImplementedError):
             open_virtual_dataset(netcdf4_file, filetype="grib")
 
-    def test_group_kwarg(self, hdf5_groups_file):
-        with pytest.raises(ValueError, match="Multiple HDF Groups found"):
-            open_virtual_dataset(hdf5_groups_file)
-        with pytest.raises(ValueError, match="not found in"):
-            open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
+        if hdf_backend:
+            with pytest.raises(NotImplementedError, match="Nested groups"):
+                open_virtual_dataset(hdf5_groups_file, backend=hdf_backend)
+            with pytest.raises(KeyError, match="doesn't exist"):
+                open_virtual_dataset(
+                    hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
+                )
+        else:
+            with pytest.raises(ValueError, match="Multiple HDF Groups found"):
+                open_virtual_dataset(hdf5_groups_file)
+            with pytest.raises(ValueError, match="not found in"):
+                open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
 
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -316,6 +346,7 @@ def test_group_kwarg(self, hdf5_groups_file):
             group="test/group",
             loadable_variables=vars_to_load,
             indexes={},
+            backend=hdf_backend,
         )
         full_ds = xr.open_dataset(
             hdf5_groups_file,
@@ -340,13 +371,15 @@ def test_open_virtual_dataset_passes_expected_args(
         }
         mock_read_kerchunk.assert_called_once_with(**args)
 
-    def test_open_dataset_with_empty(self, hdf5_empty, tmpdir):
-        vds = open_virtual_dataset(hdf5_empty)
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_open_dataset_with_empty(self, hdf5_empty, tmpdir, hdf_backend):
+        vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
         assert vds.empty.dims == ()
         assert vds.empty.attrs == {"empty": "true"}
 
-    def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
-        vds = open_virtual_dataset(hdf5_scalar)
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir, hdf_backend):
+        vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
         assert vds.scalar.attrs == {"scalar": "true"}
 
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 63158777..0a39eb3d 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -5,6 +5,7 @@
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import requires_kerchunk
 from virtualizarr.translators.kerchunk import (
     dataset_from_kerchunk_refs,
@@ -63,8 +64,9 @@ def test_no_duplicates_find_var_names():
         ),
     ],
 )
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 def test_numpy_arrays_to_inlined_kerchunk_refs(
-    netcdf4_file, inline_threshold, vars_to_inline
+    netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
 ):
     from kerchunk.hdf import SingleHdf5ToZarr
 
@@ -75,7 +77,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 
     # loading the variables should produce same result as inlining them using kerchunk
     vds = open_virtual_dataset(
-        netcdf4_file, loadable_variables=vars_to_inline, indexes={}
+        netcdf4_file, loadable_variables=vars_to_inline, indexes={}, backend=hdf_backend
     )
     refs = vds.virtualize.to_kerchunk(format="dict")
 
@@ -90,7 +92,8 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 @requires_kerchunk
 @pytest.mark.parametrize("format", ["dict", "json", "parquet"])
 class TestKerchunkRoundtrip:
-    def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
 
@@ -98,7 +101,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
         ds.to_netcdf(f"{tmpdir}/air.nc")
 
         # use open_dataset_via_kerchunk to read it as references
-        vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})
+        vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={}, backend=hdf_backend)
 
         if format == "dict":
             # write those references to an in-memory kerchunk-formatted references dictionary
@@ -122,8 +125,11 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
         for coord in ds.coords:
             assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
 
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
-    def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
+    def test_kerchunk_roundtrip_concat(
+        self, tmpdir, format, hdf_backend, decode_times, time_vars
+    ):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)
 
@@ -139,11 +145,13 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
             f"{tmpdir}/air1.nc",
             indexes={},
             loadable_variables=time_vars,
+            backend=hdf_backend,
         )
         vds2 = open_virtual_dataset(
             f"{tmpdir}/air2.nc",
             indexes={},
             loadable_variables=time_vars,
+            backend=hdf_backend,
         )
 
         if decode_times is False:
@@ -187,7 +195,8 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
             assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
             assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]
 
-    def test_non_dimension_coordinates(self, tmpdir, format):
+    @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+    def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         # regression test for GH issue #105
 
         # set up example xarray dataset containing non-dimension coordinate variables
@@ -196,7 +205,9 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # save it to disk as netCDF (in temporary directory)
         ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
 
-        vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={})
+        vds = open_virtual_dataset(
+            f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend
+        )
 
         assert "lat" in vds.coords
         assert "coordinates" not in vds.attrs
@@ -269,11 +280,12 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
 
 
 @requires_kerchunk
-def test_open_scalar_variable(tmpdir):
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+def test_open_scalar_variable(tmpdir, hdf_backend):
     # regression test for GH issue #100
 
     ds = xr.Dataset(data_vars={"a": 0})
     ds.to_netcdf(f"{tmpdir}/scalar.nc")
 
-    vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={})
+    vds = open_virtual_dataset(f"{tmpdir}/scalar.nc", indexes={}, backend=hdf_backend)
     assert vds["a"].shape == ()
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index f73292ee..716d1f28 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -4,18 +4,21 @@
 
 import virtualizarr
 from virtualizarr.backend import FileType
+from virtualizarr.readers.hdf import HDFVirtualBackend
 
 
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestIntegration:
     @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue")
     def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_hdf5_file
+        self, tmpdir, filter_encoded_roundtrip_hdf5_file, hdf_backend
     ):
         ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
             filter_encoded_roundtrip_hdf5_file,
             loadable_variables=["time"],
             cftime_variables=["time"],
+            backend=hdf_backend,
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
@@ -23,19 +26,26 @@ def test_filters_h5netcdf_roundtrip(
         xrt.assert_allclose(ds, roundtrip)
 
     def test_filters_netcdf4_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_netcdf4_file
+        self, tmpdir, filter_encoded_roundtrip_netcdf4_file, hdf_backend
     ):
         filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
         ds = xr.open_dataset(filepath)
-        vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
+        vds = virtualizarr.open_virtual_dataset(
+            filepath, filetype=FileType("netcdf4"), backend=hdf_backend
+        )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
 
-    def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
+    def test_filter_and_cf_roundtrip(
+        self, tmpdir, filter_and_cf_roundtrip_hdf5_file, hdf_backend
+    ):
         ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
-        vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)
+        vds = virtualizarr.open_virtual_dataset(
+            filter_and_cf_roundtrip_hdf5_file,
+            backend=hdf_backend,
+        )
         kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 062eda5f..12f6fadf 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -4,6 +4,7 @@
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import requires_kerchunk
 from virtualizarr.zarr import ZArray
 
@@ -224,14 +225,15 @@ def test_concat_dim_coords_along_existing_dim(self):
 
 
 @requires_kerchunk
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestCombineUsingIndexes:
-    def test_combine_by_coords(self, netcdf4_files):
+    def test_combine_by_coords(self, netcdf4_files, hdf_backend):
         filepath1, filepath2 = netcdf4_files
 
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
-            vds1 = open_virtual_dataset(filepath1)
+            vds1 = open_virtual_dataset(filepath1, backend=hdf_backend)
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
-            vds2 = open_virtual_dataset(filepath2)
+            vds2 = open_virtual_dataset(filepath2, backend=hdf_backend)
 
         combined_vds = xr.combine_by_coords(
             [vds2, vds1],
@@ -240,13 +242,13 @@ def test_combine_by_coords(self, netcdf4_files):
         assert combined_vds.xindexes["time"].to_pandas_index().is_monotonic_increasing
 
     @pytest.mark.xfail(reason="Not yet implemented, see issue #18")
-    def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files):
+    def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backend):
         filepath1, filepath2 = netcdf4_files
 
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
-            vds1 = open_virtual_dataset(filepath1)
+            vds1 = open_virtual_dataset(filepath1, backend=hdf_backend)
         with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
-            vds2 = open_virtual_dataset(filepath2)
+            vds2 = open_virtual_dataset(filepath2, backend=hdf_backend)
 
         combined_vds = xr.combine_by_coords(
             [vds2, vds1],
@@ -258,17 +260,18 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files):
 
 
 @requires_kerchunk
+@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestRenamePaths:
-    def test_rename_to_str(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_rename_to_str(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc")
         assert (
             renamed_vds["air"].data.manifest.dict()["0.0.0"]["path"]
             == "s3://bucket/air.nc"
         )
 
-    def test_rename_using_function(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_rename_using_function(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
 
         def local_to_s3_url(old_local_path: str) -> str:
             from pathlib import Path
@@ -284,15 +287,20 @@ def local_to_s3_url(old_local_path: str) -> str:
             == "s3://bucket/air.nc"
         )
 
-    def test_invalid_type(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_invalid_type(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
 
         with pytest.raises(TypeError):
             vds.virtualize.rename_paths(["file1.nc", "file2.nc"])
 
-    def test_mixture_of_manifestarrays_and_numpy_arrays(self, netcdf4_file):
+    def test_mixture_of_manifestarrays_and_numpy_arrays(
+        self, netcdf4_file, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file, indexes={}, loadable_variables=["lat", "lon"]
+            netcdf4_file,
+            indexes={},
+            loadable_variables=["lat", "lon"],
+            backend=hdf_backend,
         )
         renamed_vds = vds.virtualize.rename_paths("s3://bucket/air.nc")
         assert (

From 3ab90c6d59c7dbd929ad317afbbaf6843097b7d6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 21:17:00 -0400
Subject: [PATCH 69/79] Include imagecodecs and hdf5plugin in all CI
 environments.

---
 ci/environment.yml | 2 ++
 ci/min-deps.yml    | 5 +++++
 ci/upstream.yml    | 4 ++++
 3 files changed, 11 insertions(+)

diff --git a/ci/environment.yml b/ci/environment.yml
index 25ac0bb1..1ff25449 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -14,6 +14,7 @@ dependencies:
   - packaging
   - universal_pathlib
   - hdf5plugin
+  - numcodecs
   # Testing
   - codecov
   - pre-commit
@@ -33,5 +34,6 @@ dependencies:
   - tifffile
   # for opening FITS files
   - astropy
+  - pip
   - pip:
     - imagecodecs-numcodecs
diff --git a/ci/min-deps.yml b/ci/min-deps.yml
index 7ca8c0b3..12086543 100644
--- a/ci/min-deps.yml
+++ b/ci/min-deps.yml
@@ -10,9 +10,11 @@ dependencies:
   - xarray>=2024.6.0
   - numpy>=2.0.0
   - numcodecs
+  - imagecodecs>=2024.6.1
   - packaging
   - ujson
   - universal_pathlib
+  - hdf5plugin
   # Testing
   - codecov
   - pre-commit
@@ -24,3 +26,6 @@ dependencies:
   - pytest
   - pooch
   - fsspec
+  - pip
+  - pip:
+    - imagecodecs-numcodecs
diff --git a/ci/upstream.yml b/ci/upstream.yml
index 2c2680bc..931e346c 100644
--- a/ci/upstream.yml
+++ b/ci/upstream.yml
@@ -11,6 +11,9 @@ dependencies:
   - packaging
   - ujson
   - universal_pathlib
+  - hdf5plugin
+  - numcodecs
+  - imagecodecs>=2024.6.1
   # Testing
   - codecov
   - pre-commit
@@ -27,4 +30,5 @@ dependencies:
     - icechunk # Installs zarr v3 as dependency
     - git+https://github.com/pydata/xarray@zarr-v3  # zarr-v3 compatibility branch
     - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs  # zarr-v3 compatibility branch
+    - imagecodecs-numcodecs
     # - git+https://github.com/fsspec/kerchunk@main  # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)

From 150d06d215ff50657c85eb197ff9d0cf4d3eeae5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 21:57:37 -0400
Subject: [PATCH 70/79] Add test_hdf_integration tests to be skipped for
 non-kerchunk env.

---
 virtualizarr/tests/test_readers/test_hdf_integration.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 716d1f28..f2d2367d 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -5,8 +5,10 @@
 import virtualizarr
 from virtualizarr.backend import FileType
 from virtualizarr.readers.hdf import HDFVirtualBackend
+from virtualizarr.tests import requires_kerchunk
 
 
+@requires_kerchunk
 @pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
 class TestIntegration:
     @pytest.mark.xfail(reason="0 time start is being interpreted as fillvalue")
@@ -43,8 +45,7 @@ def test_filter_and_cf_roundtrip(
     ):
         ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
         vds = virtualizarr.open_virtual_dataset(
-            filter_and_cf_roundtrip_hdf5_file,
-            backend=hdf_backend,
+            filter_and_cf_roundtrip_hdf5_file, backend=hdf_backend
         )
         kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")

From 8ccba34862950b32a1559af33cddbc6d657608bb Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 22:01:31 -0400
Subject: [PATCH 71/79] Include imagecodecs in dependencies.

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index df6c37be..0d0744b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "h5py",
     "hdf5plugin",
     "numcodecs",
+    "imagecodecs",
     "imagecodecs-numcodecs",
     "ujson",
 ]

From 81874e0488c5be595b2545fbc5ec66d802524fd7 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 23 Oct 2024 22:12:32 -0400
Subject: [PATCH 72/79] Diagnose imagecodecs-numcodecs installation failures in
 CI.

---
 ci/environment.yml | 2 +-
 ci/min-deps.yml    | 2 +-
 ci/upstream.yml    | 2 +-
 pyproject.toml     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index 1ff25449..70a4c9c3 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -36,4 +36,4 @@ dependencies:
   - astropy
   - pip
   - pip:
-    - imagecodecs-numcodecs
+    - imagecodecs-numcodecs==2024.6.1
diff --git a/ci/min-deps.yml b/ci/min-deps.yml
index 12086543..af4a732c 100644
--- a/ci/min-deps.yml
+++ b/ci/min-deps.yml
@@ -28,4 +28,4 @@ dependencies:
   - fsspec
   - pip
   - pip:
-    - imagecodecs-numcodecs
+    - imagecodecs-numcodecs==2024.6.1
diff --git a/ci/upstream.yml b/ci/upstream.yml
index 931e346c..f6c66df3 100644
--- a/ci/upstream.yml
+++ b/ci/upstream.yml
@@ -30,5 +30,5 @@ dependencies:
     - icechunk # Installs zarr v3 as dependency
     - git+https://github.com/pydata/xarray@zarr-v3  # zarr-v3 compatibility branch
     - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs  # zarr-v3 compatibility branch
-    - imagecodecs-numcodecs
     # - git+https://github.com/fsspec/kerchunk@main  # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
+    - imagecodecs-numcodecs==2024.6.1
diff --git a/pyproject.toml b/pyproject.toml
index 0d0744b5..672398f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "hdf5plugin",
     "numcodecs",
     "imagecodecs",
-    "imagecodecs-numcodecs",
+    "imagecodecs-numcodecs==2024.6.1",
     "ujson",
 ]
 

From f87abe2c0b6dd2c9074e566bb3083dbd8856b821 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 24 Oct 2024 10:59:35 -0400
Subject: [PATCH 73/79] Ignore mypy complaints for VirtualBackend.

---
 virtualizarr/backend.py     | 2 +-
 virtualizarr/readers/hdf.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
index 3ab76d1f..247657d0 100644
--- a/virtualizarr/backend.py
+++ b/virtualizarr/backend.py
@@ -187,7 +187,7 @@ def open_virtual_dataset(
     if backend:
         backend_cls = backend
     else:
-        backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())
+        backend_cls = VIRTUAL_BACKENDS.get(filetype.name.lower())  # type: ignore
 
     if backend_cls is None:
         raise NotImplementedError(f"Unsupported file type: {filetype.name}")
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index dd67475e..b4723ded 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -127,7 +127,9 @@ def add_chunk_info(blob):
                     add_chunk_info(dsid.get_chunk_info(index))
 
             chunk_manifest = ChunkManifest.from_arrays(
-                paths=paths, offsets=offsets, lengths=lengths
+                paths=paths,
+                offsets=offsets,
+                lengths=lengths,  # type: ignore
             )
             return chunk_manifest
 

From 70e7e29301527b96c91313c68f080e93ed0b79f5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 24 Oct 2024 11:08:47 -0400
Subject: [PATCH 74/79] Remove checksum assert which varies across different
 zstd versions.

---
 virtualizarr/tests/test_readers/test_hdf_filters.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index d0bde948..0dd8efa8 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -34,8 +34,9 @@ def test_blosc(self):
     def test_zstd(self):
         codec = _filter_to_codec("32015", (5,))
         assert isinstance(codec, numcodecs.zstd.Zstd)
-        expected_config = {"id": "zstd", "level": 5}
-        assert codec.get_config() == expected_config
+        config = codec.get_config()
+        assert config["id"] == "zstd"
+        assert config["level"] == 5
 
     def test_shuffle(self):
         codec = _filter_to_codec("shuffle", (7,))

From 43bc0e4ca03977c4e0d64bdd8308229e08407677 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 24 Oct 2024 11:21:34 -0400
Subject: [PATCH 75/79] Temporarily xfail integration tests with coordinate
 inconsistency.

---
 virtualizarr/tests/test_integration.py                  | 4 +++-
 virtualizarr/tests/test_readers/test_hdf_integration.py | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 0a39eb3d..3953e59e 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -199,6 +199,9 @@ def test_kerchunk_roundtrip_concat(
     def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         # regression test for GH issue #105
 
+        if hdf_backend:
+            pytest.xfail("To fix coordinate behavior with HDF reader")
+
         # set up example xarray dataset containing non-dimension coordinate variables
         ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})
 
@@ -208,7 +211,6 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         vds = open_virtual_dataset(
             f"{tmpdir}/non_dim_coords.nc", indexes={}, backend=hdf_backend
         )
-
         assert "lat" in vds.coords
         assert "coordinates" not in vds.attrs
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index f2d2367d..db452086 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -27,6 +27,7 @@ def test_filters_h5netcdf_roundtrip(
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
         xrt.assert_allclose(ds, roundtrip)
 
+    @pytest.mark.xfail(reason="Coordinate issue affecting kerchunk and HDF reader.")
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_roundtrip_netcdf4_file, hdf_backend
     ):

From 82a63214b599095dcbb152e60029f2c6cbb15151 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 24 Oct 2024 12:35:20 -0400
Subject: [PATCH 76/79] Remove backend arg for non-hdf network file tests.

---
 virtualizarr/tests/test_backend.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_backend.py b/virtualizarr/tests/test_backend.py
index 2368848a..7436abba 100644
--- a/virtualizarr/tests/test_backend.py
+++ b/virtualizarr/tests/test_backend.py
@@ -245,7 +245,6 @@ def test_read_from_url(self, hdf_backend, filetype, url):
                     url,
                     reader_options={},
                     indexes={},
-                    backend=hdf_backend,
                 )
         elif filetype == "hdf5":
             vds = open_virtual_dataset(
@@ -258,7 +257,7 @@ def test_read_from_url(self, hdf_backend, filetype, url):
             )
             assert isinstance(vds, xr.Dataset)
         else:
-            vds = open_virtual_dataset(url, indexes={}, backend=hdf_backend)
+            vds = open_virtual_dataset(url, indexes={})
             assert isinstance(vds, xr.Dataset)
 
     def test_virtualizarr_vs_local_nisar(self, hdf_backend):

From b34f260f6e260b1ef66f4907746374ebfc63e2b6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 24 Oct 2024 12:40:00 -0400
Subject: [PATCH 77/79] Fix mypy comment moved by ruff formatting.

---
 virtualizarr/readers/hdf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index b4723ded..c722d7af 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -127,9 +127,9 @@ def add_chunk_info(blob):
                     add_chunk_info(dsid.get_chunk_info(index))
 
             chunk_manifest = ChunkManifest.from_arrays(
-                paths=paths,
+                paths=paths,  # type: ignore
                 offsets=offsets,
-                lengths=lengths,  # type: ignore
+                lengths=lengths,
             )
             return chunk_manifest
 

From f9ead06fafef91bed37a3310bf8a32bb5df74c96 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 25 Oct 2024 10:48:35 -0400
Subject: [PATCH 78/79] Make HDR reader dependencies optional.

---
 ci/min-deps.yml | 6 ------
 pyproject.toml  | 7 ++-----
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/ci/min-deps.yml b/ci/min-deps.yml
index af4a732c..7debcf95 100644
--- a/ci/min-deps.yml
+++ b/ci/min-deps.yml
@@ -3,18 +3,15 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - h5netcdf
   - h5py
   - hdf5
   - netcdf4
   - xarray>=2024.6.0
   - numpy>=2.0.0
   - numcodecs
-  - imagecodecs>=2024.6.1
   - packaging
   - ujson
   - universal_pathlib
-  - hdf5plugin
   # Testing
   - codecov
   - pre-commit
@@ -26,6 +23,3 @@ dependencies:
   - pytest
   - pooch
   - fsspec
-  - pip
-  - pip:
-    - imagecodecs-numcodecs==2024.6.1
diff --git a/pyproject.toml b/pyproject.toml
index 672398f7..64453032 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,11 +25,7 @@ dependencies = [
     "numpy>=2.0.0",
     "packaging",
     "universal-pathlib",
-    "h5py",
-    "hdf5plugin",
     "numcodecs",
-    "imagecodecs",
-    "imagecodecs-numcodecs==2024.6.1",
     "ujson",
 ]
 
@@ -41,11 +37,12 @@ test = [
     "h5netcdf",
     "h5py",
     "hdf5plugin",
+    "imagecodecs",
+    "imagecodecs-numcodecs==2024.6.1",
     "kerchunk>=0.2.5",
     "mypy",
     "netcdf4",
     "numcodecs",
-    "imagecodecs-numcodecs",
     "pandas-stubs",
     "pooch",
     "pre-commit",

From 560829266f9036951a6f5f39a016339402422bc0 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 25 Oct 2024 14:31:50 -0400
Subject: [PATCH 79/79] Handle optional imagecodecs and hdf5plugin dependency
 imports for tests.

---
 pyproject.toml                                |  7 ++++++-
 virtualizarr/readers/hdf_filters.py           | 13 +++++++++++-
 virtualizarr/tests/__init__.py                |  2 ++
 virtualizarr/tests/test_readers/conftest.py   |  9 ++++++++-
 virtualizarr/tests/test_readers/test_hdf.py   | 14 +++++++++++++
 .../tests/test_readers/test_hdf_filters.py    | 20 ++++++++++++++++++-
 6 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 64453032..7be7b0a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,11 +30,16 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+hdf_reader = [
+    "fsspec",
+    "hdf5plugin",
+    "imagecodecs",
+    "imagecodecs-numcodecs==2024.6.1",
+]
 test = [
     "codecov",
     "fastparquet",
     "fsspec",
-    "h5netcdf",
     "h5py",
     "hdf5plugin",
     "imagecodecs",
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index aedf89b3..cab1f351 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,14 +1,25 @@
 import dataclasses
+import warnings
 from typing import List, Tuple, TypedDict, Union
 
 import h5py  # type: ignore
-import hdf5plugin  # type: ignore
 import numcodecs.registry as registry
 import numpy as np
 from numcodecs.abc import Codec
 from numcodecs.fixedscaleoffset import FixedScaleOffset
 from xarray.coding.variables import _choose_float_dtype
 
+try:
+    import hdf5plugin  # type: ignore
+except ModuleNotFoundError:
+    hdf5plugin = None  # type: ignore
+    warnings.warn("hdf5plugin is required for HDF reader")
+
+try:
+    import imagecodecs  # noqa
+except ModuleNotFoundError:
+    warnings.warn("imagecodecs is required for HDF reader")
+
 _non_standard_filters = {
     "gzip": "zlib",
     "lzf": "imagecodecs_lzf",
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index 70f613ce..aee82542 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -37,6 +37,8 @@ def _importorskip(
 has_s3fs, requires_s3fs = _importorskip("s3fs")
 has_scipy, requires_scipy = _importorskip("scipy")
 has_tifffile, requires_tifffile = _importorskip("tifffile")
+has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs")
+has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin")
 
 
 def create_manifestarray(
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index b0b7c41f..35df93a9 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -1,5 +1,6 @@
+import warnings
+
 import h5py  # type: ignore
-import hdf5plugin  # type: ignore
 import numpy as np
 import pytest
 import xarray as xr
@@ -7,6 +8,12 @@
 from xarray.tests.test_dataset import create_test_data
 from xarray.util.print_versions import netcdf_and_hdf5_versions
 
+try:
+    import hdf5plugin  # type: ignore
+except ModuleNotFoundError:
+    hdf5plugin = None  # type: ignore
+    warnings.warn("hdf5plugin is required for HDF reader")
+
 
 @pytest.fixture
 def empty_chunks_hdf5_file(tmpdir):
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 0e51fe28..71d2b352 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -2,8 +2,14 @@
 import pytest
 
 from virtualizarr.readers.hdf import HDFVirtualBackend
+from virtualizarr.tests import (
+    requires_hdf5plugin,
+    requires_imagecodecs,
+)
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestDatasetChunkManifest:
     def test_empty_chunks(self, empty_chunks_hdf5_file):
         f = h5py.File(empty_chunks_hdf5_file)
@@ -47,6 +53,8 @@ def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file):
         assert manifest.shape_chunk_grid == (2, 8)
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestDatasetDims:
     def test_single_dimension_scale(self, single_dimension_scale_hdf5_file):
         f = h5py.File(single_dimension_scale_hdf5_file)
@@ -73,6 +81,8 @@ def test_no_dimension_scales(self, no_chunks_hdf5_file):
         assert dims == ["phony_dim_0", "phony_dim_1"]
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestDatasetToVariable:
     def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
         f = h5py.File(chunked_dimensions_netcdf4_file)
@@ -97,6 +107,8 @@ def test_dataset_attributes(self, string_attributes_hdf5_file):
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestExtractAttributes:
     def test_string_attribute(self, string_attributes_hdf5_file):
         f = h5py.File(string_attributes_hdf5_file)
@@ -116,6 +128,8 @@ def test_multiple_attributes(self, string_attributes_hdf5_file):
         assert len(attrs.keys()) == 2
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
         variables = HDFVirtualBackend._virtual_vars_from_hdf(
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 0dd8efa8..20086b88 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,15 +1,29 @@
+import warnings
+
 import h5py  # type: ignore
-import imagecodecs
 import numcodecs
 import numpy as np
 
+try:
+    import imagecodecs  # noqa
+except ModuleNotFoundError:
+    imagecodecs = None  # type: ignore
+    warnings.warn("imagecodecs is required for HDF reader")
+
+
 from virtualizarr.readers.hdf_filters import (
     _filter_to_codec,
     cfcodec_from_dataset,
     codecs_from_dataset,
 )
+from virtualizarr.tests import (
+    requires_hdf5plugin,
+    requires_imagecodecs,
+)
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestFilterToCodec:
     def test_gzip_uses_zlib_numcodec(self):
         codec = _filter_to_codec("gzip", 1)
@@ -45,6 +59,8 @@ def test_shuffle(self):
         assert codec.get_config() == expected_config
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestCodecsFromDataSet:
     def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
         f = h5py.File(filter_encoded_hdf5_file)
@@ -61,6 +77,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
                 assert decoded == np_uncompressed.tobytes()
 
 
+@requires_hdf5plugin
+@requires_imagecodecs
 class TestCFCodecFromDataset:
     def test_no_cf_convention(self, filter_encoded_hdf5_file):
         f = h5py.File(filter_encoded_hdf5_file)