From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 13:25:28 -0600 Subject: [PATCH 01/55] Generate chunk manifest backed variable from HDF5 dataset. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 135 ++++++++++++++++++++ virtualizarr/tests/test_readers/__init__.py | 0 virtualizarr/tests/test_readers/conftest.py | 91 +++++++++++++ virtualizarr/tests/test_readers/test_hdf.py | 71 ++++++++++ 5 files changed, 298 insertions(+) create mode 100644 virtualizarr/readers/hdf.py create mode 100644 virtualizarr/tests/test_readers/__init__.py create mode 100644 virtualizarr/tests/test_readers/conftest.py create mode 100644 virtualizarr/tests/test_readers/test_hdf.py diff --git a/pyproject.toml b/pyproject.toml index c7505bca..7994c929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", + "h5netcdf", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py new file mode 100644 index 00000000..a34ae341 --- /dev/null +++ b/virtualizarr/readers/hdf.py @@ -0,0 +1,135 @@ +from typing import List + +import h5py +import xarray as xr + +from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.zarr import ZArray + + +def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: + """ + Generate ChunkManifest for HDF5 dataset. + + Parameters + ---------- + path: str + The path the HDF5 container file + dset : h5py.Dataset + HDF5 dataset for which to create a ChunkManifest + + Returns + ------- + ChunkManifest + A Virtualizarr ChunkManifest + """ + dsid = dataset.id + + if dataset.chunks is None: + if dsid.get_offset() is None: + raise ValueError("Dataset has no space allocated in the file") + else: + key_list = [0] * (len(dataset.shape) or 1) + key = ".".join(map(str, key_list)) + chunk_entry = ChunkEntry( + path=path, + offset=dsid.get_offset(), + length=dsid.get_storage_size() + ) + chunk_entries = {key: chunk_entry} + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + else: + num_chunks = dsid.get_num_chunks() + if num_chunks == 0: + raise ValueError("The dataset is chunked but contains no chunks") + + chunk_entries = dict() + + def get_key(blob): + key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] + key = ".".join(map(str, key_list)) + return key + + def store_chunk_entry(blob): + chunk_entries[get_key(blob)] = ChunkEntry( + path=path, + offset=blob.byte_offset, + length=blob.size + ) + + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(store_chunk_entry) + else: + for index in range(num_chunks): + store_chunk_entry(dsid.get_chunk_info(index)) + + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + +def _dataset_dims(dataset: h5py.Dataset) -> List[str]: + """ + Get a list of dimension scale names attached to input HDF5 dataset. + + This is required by the xarray package to work with Zarr arrays. Only + one dimension scale per dataset dimension is allowed. If dataset is + dimension scale, it will be considered as the dimension to itself. + + Parameters + ---------- + dataset : h5py.Dataset + HDF5 dataset. + + Returns + ------- + list + List with HDF5 path names of dimension scales attached to input + dataset. 
+ """ + dims = list() + rank = len(dataset.shape) + if rank: + for n in range(rank): + num_scales = len(dataset.dims[n]) + if num_scales == 1: + dims.append(dataset.dims[n][0].name[1:]) + elif h5py.h5ds.is_scale(dataset.id): + dims.append(dataset.name[1:]) + elif num_scales > 1: + raise ValueError( + f"{dataset.name}: {len(dataset.dims[n])} " + f"dimension scales attached to dimension #{n}" + ) + elif num_scales == 0: + # Some HDF5 files do not have dimension scales. + # If this is the case, `num_scales` will be 0. + # In this case, we mimic netCDF4 and assign phony dimension names. + # See https://github.com/fsspec/kerchunk/issues/41 + dims.append(f"phony_dim_{n}") + return dims + + +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: + # This chunk determination logic mirrors zarr-python's create + # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 + chunks = dataset.chunks if dataset.chunks else dataset.shape + zarray = ZArray( + chunks=chunks, + compressor=dataset.compression, + dtype=dataset.dtype, + fill_value=dataset.fillvalue, + filters=None, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + manifest = _dataset_chunk_manifest(path, dataset) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims) + return variable diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py new file mode 100644 index 00000000..b4504839 --- /dev/null +++ b/virtualizarr/tests/test_readers/conftest.py @@ -0,0 +1,91 @@ +import h5py +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture +def empty_chunks_netcdf4_file(tmpdir): + ds = xr.Dataset({"data": []}) + filepath = f"{tmpdir}/empty_chunks.nc" + ds.to_netcdf(filepath, engine="h5netcdf") + return filepath + + +@pytest.fixture +def empty_dataset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/empty_dataset.nc" + f = h5py.File(filepath, "w") + f.create_dataset("data", shape=(0,), dtype="f") + return filepath + + +@pytest.fixture +def no_chunks_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/no_chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + return filepath + + +@pytest.fixture +def chunked_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + return filepath + + +@pytest.fixture +def single_dimension_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/single_dimension_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + x = [0, 1] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=x) + f["x"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + return filepath + + +@pytest.fixture +def is_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/is_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f["data"].make_scale() + return filepath + + +@pytest.fixture +def multiple_dimension_scales_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_dimension_scales.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=[0, 1]) + 
f.create_dataset(name="y", data=[0, 1]) + f["x"].make_scale() + f["y"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[0].attach_scale(f["y"]) + return filepath + + +@pytest.fixture +def chunked_dimensions_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks_dimension.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + x = np.random.random((100, 100)) + y = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + f.create_dataset(name="x", data=x, chunks=(50, 50)) + f.create_dataset(name="y", data=y, chunks=(50, 50)) + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[1].attach_scale(f["y"]) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py new file mode 100644 index 00000000..b6b78c11 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -0,0 +1,71 @@ +import h5py +import pytest + +from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, + _dataset_to_variable) + + +class TestDatasetChunkManifest: + def test_empty_chunks(self, empty_chunks_netcdf4_file): + f = h5py.File(empty_chunks_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="chunked but contains no chunks"): + _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + + def test_empty_dataset(self, empty_dataset_netcdf4_file): + f = h5py.File(empty_dataset_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="no space allocated in the file"): + _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + + def test_no_chunking(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 1 + + def test_chunked(self, chunked_netcdf4_file): + f = h5py.File(chunked_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 4 + + +class TestDatasetDims: + def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "x" + + def test_is_dimension_scale(self, is_scale_netcdf4_file): + f = h5py.File(is_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "data" + + def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): + f = h5py.File(multiple_dimension_scales_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="dimension scales attached"): + _dataset_dims(ds) + + def test_no_dimension_scales(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims == ["phony_dim_0", "phony_dim_1"] + + +class TestDatasetToVariable: + def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) + assert var.chunks == (50, 50) + + def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + assert var.chunks == (2,) From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 14:20:38 
-0600 Subject: [PATCH 02/55] Transfer dataset attrs to variable. --- virtualizarr/readers/hdf.py | 50 ++++++++++++++++++++- virtualizarr/tests/test_readers/conftest.py | 10 +++++ virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index a34ae341..d6518a30 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,7 @@ from typing import List import h5py +import numpy as np import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray @@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: return dims +def _extract_attrs(dataset: h5py.Dataset): + """ + Extract attributes from an HDF5 dataset. + + Parameters + ---------- + dataset : h5py.Dataset + An HDF5 dataset. + """ + _HIDDEN_ATTRS = { + "REFERENCE_LIST", + "CLASS", + "DIMENSION_LIST", + "NAME", + "_Netcdf4Dimid", + "_Netcdf4Coordinates", + "_nc3_strict", + "_NCProperties", + } + attrs = {} + for n, v in dataset.attrs.items(): + if n in _HIDDEN_ATTRS: + continue + # Fix some attribute values to avoid JSON encoding exceptions... + if isinstance(v, bytes): + v = v.decode("utf-8") or " " + elif isinstance(v, (np.ndarray, np.number, np.bool_)): + if v.dtype.kind == "S": + v = v.astype(str) + if n == "_FillValue": + continue + elif v.size == 1: + v = v.flatten()[0] + if isinstance(v, (np.ndarray, np.number, np.bool_)): + v = v.tolist() + else: + v = v.tolist() + elif isinstance(v, h5py._hl.base.Empty): + v = "" + if v == "DIMENSION_SCALE": + continue + + attrs[n] = v + return attrs + + def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 @@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims) + attrs = _extract_attrs(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index b4504839..2c40fe17 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath + + +@pytest.fixture +def string_attribute_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/attributes.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs["attribute_name"] = "attribute_name" + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index b6b78c11..495b7de0 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,7 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable) + _dataset_to_variable, _extract_attrs) class TestDatasetChunkManifest: @@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): ds = f["data"] var = 
_dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) + + def test_dataset_attributes(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + assert var.attrs["attribute_name"] == "attribute_name" + + +class TestExtractAttributes: + def test_string_attribute(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert attrs["attribute_name"] == "attribute_name" From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 15:26:58 -0600 Subject: [PATCH 03/55] Get virtual variables dict from HDF5 file. --- virtualizarr/readers/hdf.py | 14 +++++++++++++- virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++---- virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d6518a30..9c3ebf44 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Mapping, List import h5py import numpy as np @@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable + + +def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: + variables = {} + for key in f.keys(): + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") + + return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 2c40fe17..735e922a 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir): filepath = f"{tmpdir}/chunks_dimension.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) - x = np.random.random((100, 100)) - y = np.random.random((100, 100)) + x = np.random.random((100)) + y = np.random.random((100)) f.create_dataset(name="data", data=data, chunks=(50, 50)) - f.create_dataset(name="x", data=x, chunks=(50, 50)) - f.create_dataset(name="y", data=y, chunks=(50, 50)) + f.create_dataset(name="x", data=x) + f.create_dataset(name="y", data=y) f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath @@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" return filepath + + +@pytest.fixture +def group_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/group.nc" + f = h5py.File(filepath, "w") + f.create_group("group") + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 495b7de0..da331ed9 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,8 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs) + _dataset_to_variable, _extract_attrs, + virtual_vars_from_hdf) class TestDatasetChunkManifest: @@ -83,3 +84,15 
@@ def test_string_attribute(self, string_attribute_netcdf4_file): ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + + +class TestVirtualVarsFromHDF: + def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + assert len(variables) == 3 + + def test_groups_not_implemented(self, group_netcdf4_file): + f = h5py.File(group_netcdf4_file) + with pytest.raises(NotImplementedError): + virtual_vars_from_hdf(group_netcdf4_file, f) From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 12:33:24 -0600 Subject: [PATCH 04/55] Update virtual_vars_from_hdf to use fsspec and drop_variables arg. --- pyproject.toml | 2 +- virtualizarr/readers/hdf.py | 25 +++++++++++++++------ virtualizarr/tests/test_readers/conftest.py | 10 +++++++++ virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++---- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7994c929..d08621e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", - "h5netcdf", ] [project.optional-dependencies] @@ -35,6 +34,7 @@ test = [ "pytest", "scipy", "pooch", + "h5netcdf", ] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 9c3ebf44..c4ab2927 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,5 +1,6 @@ -from typing import Mapping, List +from typing import List, Mapping, Optional +import fsspec import h5py import numpy as np import xarray as xr @@ -73,6 +74,7 @@ def store_chunk_entry(blob): ) return chunk_manifest + def _dataset_dims(dataset: h5py.Dataset) -> List[str]: """ Get a list of dimension scale names attached to input HDF5 dataset. 
@@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: return variable -def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: +def virtual_vars_from_hdf( + path: str, + drop_variables: Optional[List[str]] = None, +) -> Mapping[str, xr.Variable]: + if drop_variables is None: + drop_variables = [] + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) - variables[key] = variable - else: - raise NotImplementedError("Nested groups are not yet supported") + if key not in drop_variables: + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 735e922a..aa2b0fe0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir): f = h5py.File(filepath, "w") f.create_group("group") return filepath + + +@pytest.fixture +def multiple_datasets_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_datasets.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f.create_dataset(name="data2", data=data, chunks=None) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index da331ed9..36f7bc77 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file): class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 def test_groups_not_implemented(self, group_netcdf4_file): - f = h5py.File(group_netcdf4_file) with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file, f) + virtual_vars_from_hdf(group_netcdf4_file) + + def test_drop_variables(self, multiple_datasets_netcdf4_file): + variables = virtual_vars_from_hdf( + multiple_datasets_netcdf4_file, + ["data2"] + ) + assert "data2" not in variables.keys() From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:02:03 -0600 Subject: [PATCH 05/55] mypy fix to use ChunkKey and empty dimensions list. 
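For context on the keys that ChunkKey wraps (an illustrative sketch, not part of this patch): the manifest key for each chunk is its index along every dimension joined with dots, obtained by integer-dividing the chunk's element offset (as reported by h5py) by the chunk shape, which is the same arithmetic as get_key in _dataset_chunk_manifest. The shapes below are invented for the example.

    # A (100, 100) dataset stored in (50, 50) chunks has four chunks.
    chunk_offset = (50, 0)   # element offset at which one chunk starts (from h5py)
    chunks = (50, 50)
    key = ".".join(str(a // b) for a, b in zip(chunk_offset, chunks))
    assert key == "1.0"      # manifest key recorded for that chunk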
--- virtualizarr/readers/hdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c4ab2927..fdb9a77d 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Union import fsspec import h5py @@ -8,6 +8,8 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.zarr import ZArray +from virtualizarr.types import ChunkKey + def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: """ @@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: offset=dsid.get_offset(), length=dsid.get_storage_size() ) - chunk_entries = {key: chunk_entry} + chunk_key = ChunkKey(key) + chunk_entries = {chunk_key: chunk_entry} chunk_manifest = ChunkManifest( entries=chunk_entries ) @@ -75,7 +78,7 @@ def store_chunk_entry(blob): return chunk_manifest -def _dataset_dims(dataset: h5py.Dataset) -> List[str]: +def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: # In this case, we mimic netCDF4 and assign phony dimension names. # See https://github.com/fsspec/kerchunk/issues/41 dims.append(f"phony_dim_{n}") - return dims + return dims def _extract_attrs(dataset: h5py.Dataset): From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:40:52 -0600 Subject: [PATCH 06/55] Extract attributes from hdf5 root group. --- virtualizarr/readers/hdf.py | 18 +++++++++++++----- virtualizarr/tests/test_readers/conftest.py | 8 ++++++++ virtualizarr/tests/test_readers/test_hdf.py | 5 +++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index fdb9a77d..e02d03e7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: return dims -def _extract_attrs(dataset: h5py.Dataset): +def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): """ - Extract attributes from an HDF5 dataset. + Extract attributes from an HDF5 group or dataset. Parameters ---------- - dataset : h5py.Dataset - An HDF5 dataset. + h5obj : h5py.Group or h5py.Dataset + An HDF5 group or dataset. """ _HIDDEN_ATTRS = { "REFERENCE_LIST", @@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset): "_NCProperties", } attrs = {} - for n, v in dataset.attrs.items(): + for n, v in h5obj.attrs.items(): if n in _HIDDEN_ATTRS: continue # Fix some attribute values to avoid JSON encoding exceptions... 
@@ -207,3 +207,11 @@ def virtual_vars_from_hdf( raise NotImplementedError("Nested groups are not yet supported") return variables + + +def attrs_from_root_group(path: str): + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") + attrs = _extract_attrs(f) + return attrs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa2b0fe0..46ac7b2e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir): return filepath +@pytest.fixture +def root_attributes_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/root_attributes.nc" + f = h5py.File(filepath, "w") + f.attrs["attribute_name"] = "attribute_name" + return filepath + + @pytest.fixture def group_netcdf4_file(tmpdir): filepath = f"{tmpdir}/group.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 36f7bc77..a24e36ab 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file): attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + def test_root_attribute(self, root_attributes_netcdf4_file): + f = h5py.File(root_attributes_netcdf4_file) + attrs = _extract_attrs(f) + assert attrs["attribute_name"] == "attribute_name" + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 14:19:17 -0600 Subject: [PATCH 07/55] Use hdf reader for netcdf4 files. --- virtualizarr/xarray.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 5c3c8548..415b0a05 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,7 +8,8 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType +from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -76,18 +77,28 @@ def open_virtual_dataset( if common: raise ValueError(f"Cannot both load and drop variables {common}") + if filetype is None: + filetype = _automatically_determine_filetype(filepath) + filetype = FileType(filetype) + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables + ) + ds_attrs = attrs_from_root_group(path=filepath) # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? 
- vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + else: + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) if indexes is None or len(loadable_variables) > 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:57:39 +0000 Subject: [PATCH 08/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 415b0a05..2213ffa9 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -9,7 +9,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 8 May 2024 18:03:04 -0600 Subject: [PATCH 09/55] Fix ruff complaints. 
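As background for the reader entry points wired into open_virtual_dataset in PATCH 07 above, a minimal usage sketch; the file name and the dropped variable are hypothetical, and a local path is assumed.

    from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf

    # Returns xarray.Variables backed by ManifestArrays (byte ranges into the
    # file) rather than loaded arrays, plus the file's root-group attributes.
    variables = virtual_vars_from_hdf("example.nc", drop_variables=["skip_me"])
    attrs = attrs_from_root_group("example.nc")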
--- virtualizarr/readers/hdf.py | 3 +-- virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++--- virtualizarr/xarray.py | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index e02d03e7..af25c029 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,9 +6,8 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray - from virtualizarr.types import ChunkKey +from virtualizarr.zarr import ZArray def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a24e36ab..0d5a16db 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -1,9 +1,13 @@ import h5py import pytest -from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs, - virtual_vars_from_hdf) +from virtualizarr.readers.hdf import ( + _dataset_chunk_manifest, + _dataset_dims, + _dataset_to_variable, + _extract_attrs, + virtual_vars_from_hdf, +) class TestDatasetChunkManifest: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fbf6136f..9629a344 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -18,9 +18,13 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.kerchunk import ( + FileType, + KerchunkStoreRefs, + _automatically_determine_filetype, +) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 10 May 2024 15:10:30 -0600 Subject: [PATCH 10/55] First steps for handling HDF5 filters. 
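The essential translation this patch starts on, sketched roughly (not the patch's actual code): an HDF5 dataset's filter pipeline is mapped to numcodecs codec configurations looked up in the numcodecs registry, with HDF5's "gzip" filter handled as numcodecs' "zlib" codec.

    import numcodecs.registry as registry

    # HDF5 reports "gzip" plus a level; the equivalent numcodecs codec is "zlib".
    codec = registry.get_codec({"id": "zlib", "level": 1})
    compressed = codec.encode(b"raw chunk bytes")
    assert codec.decode(compressed) == b"raw chunk bytes"

Decoding a chunk read straight from the file with such a codec is exactly what the new test_hdf_filters test below checks.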
--- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 7 +- virtualizarr/readers/hdf_filters.py | 34 +++++++++ virtualizarr/tests/test_readers/conftest.py | 26 +++++++ .../tests/test_readers/test_hdf_filters.py | 31 ++++++++ .../test_readers/test_hdf_integration.py | 21 ++++++ virtualizarr/xarray.py | 71 +++++++++---------- 7 files changed, 153 insertions(+), 38 deletions(-) create mode 100644 virtualizarr/readers/hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py diff --git a/pyproject.toml b/pyproject.toml index 79a50789..4818b5f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "numpy", "ujson", "packaging", + "hdf5plugin", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index af25c029..7d95d996 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,6 +6,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.zarr import ZArray @@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, - compressor=dataset.compression, + compressor=None, dtype=dataset.dtype, fill_value=dataset.fillvalue, - filters=None, + filters=filters, order="C", shape=dataset.shape, zarr_format=2, diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py new file mode 100644 index 00000000..6070fc17 --- /dev/null +++ b/virtualizarr/readers/hdf_filters.py @@ -0,0 +1,34 @@ +from typing import List, Tuple, Union + +import h5py +import numcodecs.registry as registry +from numcodecs.abc import Codec + +_non_standard_filters = { + "gzip": "zlib" +} + + +def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: + try: + id = int(filter_id) + except ValueError: + id = filter_id + + if isinstance(id, str): + if id in _non_standard_filters.keys(): + id = _non_standard_filters[id] + conf = {"id": id} + if id == "zlib": + conf["level"] = filter_properties + + codec = registry.get_codec(conf) + return codec + + +def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: + codecs = [] + for filter_id, filter_properties in dataset._filters.items(): + codec = _filter_to_codec(filter_id, filter_properties) + codecs.append(codec) + return codecs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 46ac7b2e..4f0d4fce 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f.create_dataset(name="data2", data=data, chunks=None) return filepath + + +@pytest.fixture +def np_uncompressed(): + return np.arange(100) + + +@pytest.fixture +def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): + filepath = f"{tmpdir}/gzip.nc" + f = h5py.File(filepath, "w") + f.create_dataset(name="data", 
data=np_uncompressed, compression="gzip", compression_opts=1) + return filepath + + +@pytest.fixture +def gzip_filter_xarray_netcdf4_file(tmpdir): + ds = xr.tutorial.open_dataset("air_temperature") + encoding = {} + for var_name in ds.variables: + # encoding[var_name] = {"zlib": True, "compression_opts": 1} + encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + + filepath = f"{tmpdir}/gzip_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py new file mode 100644 index 00000000..50a5d08c --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -0,0 +1,31 @@ +import h5py +import numcodecs +import pytest + +from virtualizarr.readers.hdf_filters import ( + _filter_to_codec, + codecs_from_dataset, +) + + +class TestFilterToCodec: + def test_gzip_uses_zlib_nomcodec(self): + codec = _filter_to_codec("gzip", 1) + assert isinstance(codec, numcodecs.zlib.Zlib) + + def test_lzf_not_available(self): + with pytest.raises(ValueError, match="codec not available"): + _filter_to_codec("lzf") + + +class TestCodecsFromDataSet: + def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): + f = h5py.File(gzip_filter_netcdf4_file) + ds = f["data"] + chunk_info = ds.id.get_chunk_info(0) + codecs = codecs_from_dataset(ds) + with open(gzip_filter_netcdf4_file, 'rb') as file: + file.seek(chunk_info.byte_offset) + bytes_read = file.read(chunk_info.size) + decoded = codecs[0].decode(bytes_read) + assert decoded == np_uncompressed.tobytes() diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py new file mode 100644 index 00000000..45bfadcd --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -0,0 +1,21 @@ +import fsspec +import numpy +import xarray as xr + +import virtualizarr +from virtualizarr.kerchunk import FileType + + +class TestIntegration: + def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + virtual_ds = virtualizarr.open_virtual_dataset( + gzip_filter_xarray_netcdf4_file, + filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + assert isinstance(ds.air.values[0][0][0], numpy.float64) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 9629a344..24ba973a 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -128,48 +128,47 @@ def open_virtual_dataset( ) ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... 
- # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - ds = xr.open_dataset(filepath, drop_variables=drop_variables) - - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + ds = xr.open_dataset(filepath, drop_variables=drop_variables) + + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() else: - loadable_vars = {} - indexes = {} + indexes = dict(**indexes) # for type hinting: to allow mutation - vars = {**virtual_vars, **loadable_vars} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - data_vars, coords = separate_coords(vars, indexes) + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - return vds + return vds def open_virtual_dataset_from_v3_store( From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 13 May 2024 12:36:29 -0600 Subject: [PATCH 11/55] Initial step for hdf5plugin supported codecs. 
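A note on the Blosc handling added here (values taken from the new test case rather than any general specification): for dynamically loaded filters such as Blosc (HDF5 filter id 32001), the filter's client data arrives as a tuple, and its last four entries carry the blocksize, compression level, shuffle mode, and a numeric compressor code that hdf5plugin maps back to a name like "lz4".

    # Client data as seen for the blosc filter in the test below.
    filter_properties = (2, 2, 8, 800, 9, 2, 1)
    blocksize, clevel, shuffle, cname_code = filter_properties[-4:]
    print(blocksize, clevel, shuffle, cname_code)  # 800 9 2 1  (code 1 -> "lz4")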
--- virtualizarr/readers/hdf_filters.py | 25 +++++++++++++++ virtualizarr/tests/test_readers/conftest.py | 31 +++++++++++++------ .../tests/test_readers/test_hdf_filters.py | 20 +++++++++--- .../test_readers/test_hdf_integration.py | 7 +++-- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 6070fc17..75f06bdc 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,14 +1,30 @@ from typing import List, Tuple, Union import h5py +import hdf5plugin import numcodecs.registry as registry from numcodecs.abc import Codec +from pydantic import BaseModel, validator _non_standard_filters = { "gzip": "zlib" } +class BloscProperties(BaseModel): + blocksize: int + clevel: int + shuffle: int + cname: str + + @validator("cname", pre=True) + def get_cname_from_code(cls, v): + blosc_compressor_codes = { + value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + } + return blosc_compressor_codes[v] + + def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: try: id = int(filter_id) @@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None conf = {"id": id} if id == "zlib": conf["level"] = filter_properties + elif isinstance(id, int): + filter = hdf5plugin.get_filters(id)[0] + id = filter.filter_name + if id == "blosc": + blosc_props = BloscProperties(**{k: v for k, v in + zip(BloscProperties.__fields__.keys(), + filter_properties[-4:])}) + conf = blosc_props.model_dump() + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 4f0d4fce..cc9331e1 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,4 +1,5 @@ import h5py +import hdf5plugin import numpy as np import pytest import xarray as xr @@ -132,22 +133,32 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture -def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): - filepath = f"{tmpdir}/gzip.nc" +@pytest.fixture(params=["gzip", "blosc"]) +def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): + filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "gzip": + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "blosc": + f.create_dataset(name="data", data=np_uncompressed, + **hdf5plugin.Blosc( + cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE + )) return filepath -@pytest.fixture -def gzip_filter_xarray_netcdf4_file(tmpdir): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} + if request.param == "gzip": + encoding_config = { + "zlib": True, + "complevel": 1 + } for var_name in ds.variables: - # encoding[var_name] = {"zlib": True, "compression_opts": 1} - encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + encoding[var_name] = encoding_config - filepath = f"{tmpdir}/gzip_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git 
a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 50a5d08c..8094d4cf 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -9,7 +9,7 @@ class TestFilterToCodec: - def test_gzip_uses_zlib_nomcodec(self): + def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) @@ -17,14 +17,26 @@ def test_lzf_not_available(self): with pytest.raises(ValueError, match="codec not available"): _filter_to_codec("lzf") + def test_blosc(self): + codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) + assert isinstance(codec, numcodecs.blosc.Blosc) + expected_config = { + "id": "blosc", + "blocksize": 800, + "clevel": 9, + "shuffle": 2, + "cname": "lz4", + } + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: - def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): - f = h5py.File(gzip_filter_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(gzip_filter_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, 'rb') as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 45bfadcd..94fc0c1c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,12 +7,13 @@ class TestIntegration: - def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + def test_filters_end_to_end(self, tmpdir, + filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - gzip_filter_xarray_netcdf4_file, + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) - kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 16 May 2024 16:24:57 -0400 Subject: [PATCH 12/55] Small commit to check compression support in CI environment. 
--- pyproject.toml | 1 + virtualizarr/tests/test_readers/conftest.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4818b5f1..bba695eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ test = [ "scipy", "pooch", "ruff", + "netcdf4", ] diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cc9331e1..8dc82c33 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) +@pytest.fixture(params=["gzip", "blosc_lz"]) def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} @@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): "zlib": True, "complevel": 1 } + if request.param == "blosc_lz": + encoding_config = { + "compression": "blosc_lz", + } + for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 21:34:26 +0000 Subject: [PATCH 13/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/hdf.py | 16 ++++---------- virtualizarr/readers/hdf_filters.py | 22 ++++++++++++------- virtualizarr/tests/test_readers/conftest.py | 18 +++++++-------- virtualizarr/tests/test_readers/test_hdf.py | 5 +---- .../tests/test_readers/test_hdf_filters.py | 2 +- .../test_readers/test_hdf_integration.py | 6 ++--- virtualizarr/xarray.py | 5 ++--- 7 files changed, 33 insertions(+), 41 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 7d95d996..78e718e4 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) chunk_entry = ChunkEntry( - path=path, - offset=dsid.get_offset(), - length=dsid.get_storage_size() + path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) chunk_entries = {chunk_key: chunk_entry} - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() @@ -60,9 +56,7 @@ def get_key(blob): def store_chunk_entry(blob): chunk_entries[get_key(blob)] = ChunkEntry( - path=path, - offset=blob.byte_offset, - length=blob.size + path=path, offset=blob.byte_offset, length=blob.size ) has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) @@ -72,9 +66,7 @@ def store_chunk_entry(blob): for index in range(num_chunks): store_chunk_entry(dsid.get_chunk_info(index)) - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..77e7037e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ 
b/virtualizarr/readers/hdf_filters.py @@ -6,9 +6,7 @@ from numcodecs.abc import Codec from pydantic import BaseModel, validator -_non_standard_filters = { - "gzip": "zlib" -} +_non_standard_filters = {"gzip": "zlib"} class BloscProperties(BaseModel): @@ -20,12 +18,15 @@ class BloscProperties(BaseModel): @validator("cname", pre=True) def get_cname_from_code(cls, v): blosc_compressor_codes = { - value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + value: key + for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() } return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec( + filter_id: str, filter_properties: Union[int, Tuple] = None +) -> Codec: try: id = int(filter_id) except ValueError: @@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None filter = hdf5plugin.get_filters(id)[0] id = filter.filter_name if id == "blosc": - blosc_props = BloscProperties(**{k: v for k, v in - zip(BloscProperties.__fields__.keys(), - filter_properties[-4:])}) + blosc_props = BloscProperties( + **{ + k: v + for k, v in zip( + BloscProperties.__fields__.keys(), filter_properties[-4:] + ) + } + ) conf = blosc_props.model_dump() conf["id"] = id diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa66f933..53c9630e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + f.create_dataset( + name="data", data=np_uncompressed, compression="gzip", compression_opts=1 + ) if request.param == "blosc": - f.create_dataset(name="data", data=np_uncompressed, - **hdf5plugin.Blosc( - cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE - )) + f.create_dataset( + name="data", + data=np_uncompressed, + **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), + ) return filepath @@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": - encoding_config = { - "zlib": True, - "complevel": 1 - } + encoding_config = {"zlib": True, "complevel": 1} for var_name in ds.variables: encoding[var_name] = encoding_config diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 0d5a16db..a83bfc39 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file): virtual_vars_from_hdf(group_netcdf4_file) def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf( - multiple_datasets_netcdf4_file, - ["data2"] - ) + variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 8094d4cf..28b5d69f 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -36,7 +36,7 @@ def 
test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 94fc0c1c..b31289c0 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,11 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, - filter_encoded_xarray_netcdf4_files): + def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, - filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 72645654..d8b6a080 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -20,8 +20,8 @@ _automatically_determine_filetype, ) from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, @@ -109,8 +109,7 @@ def open_virtual_dataset( if filetype.name.lower() == "netcdf4": print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables + path=filepath, drop_variables=drop_variables ) ds_attrs = attrs_from_root_group(path=filepath) if filetype == "zarr_v3": From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:12:50 -0600 Subject: [PATCH 14/55] Fix mypy complaints for hdf_filters. 
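Background for the int/str split made in this patch (illustrative; assumes hdf5plugin is installed): filters surface either as names such as "gzip" or as numeric ids such as 32001 for registered plugins like Blosc, so _filter_to_codec has to treat the two forms separately. A numeric id can be resolved to a filter name through hdf5plugin, which is how the Blosc branch works.

    import hdf5plugin

    # 32001 is the registered HDF5 filter id for Blosc.
    print(hdf5plugin.get_filters(32001)[0].filter_name)  # "blosc"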
--- virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bdc..7a8bcc81 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, TypedDict, Union import h5py import hdf5plugin @@ -25,26 +25,30 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec: + id_int = None + id_str = None try: - id = int(filter_id) + id_int = int(filter_id) except ValueError: - id = filter_id + id_str = filter_id - if isinstance(id, str): - if id in _non_standard_filters.keys(): - id = _non_standard_filters[id] + if id_str: + if id_str in _non_standard_filters.keys(): + id = _non_standard_filters[id_str] + else: + id = id_str conf = {"id": id} if id == "zlib": - conf["level"] = filter_properties - elif isinstance(id, int): - filter = hdf5plugin.get_filters(id)[0] + conf["level"] = filter_properties # type: ignore[assignment] + if id_int: + filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name - if id == "blosc": + if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties(**{k: v for k, v in zip(BloscProperties.__fields__.keys(), filter_properties[-4:])}) - conf = blosc_props.model_dump() + conf = blosc_props.model_dump() # type: ignore[assignment] conf["id"] = id codec = registry.get_codec(conf) From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:22:39 -0600 Subject: [PATCH 15/55] Local pre-commit fix for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a3868ebd..dfe1c1f3 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, TypedDict, Union +from typing import List, Tuple, Union import h5py import hdf5plugin From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:13:44 -0600 Subject: [PATCH 16/55] Use fsspec reader_options introduced in #37. 
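The reader_options dict introduced in this patch is forwarded to fsspec, so the same code path can read local files and object storage. A rough sketch of what that amounts to; the S3 URL is a placeholder and remote access additionally requires s3fs:

    import fsspec
    import h5py

    reader_options = {"storage_options": {"anon": True}}

    # fsspec turns the URL plus storage options into a file-like object that
    # h5py can read directly, fetching byte ranges on demand.
    openfile = fsspec.open(
        "s3://example-bucket/example.nc", mode="rb", **reader_options["storage_options"]
    )
    with openfile as f:
        h5f = h5py.File(f, mode="r")
        print(list(h5f.keys()))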
--- virtualizarr/readers/hdf.py | 22 ++++++++++++++++------ virtualizarr/xarray.py | 7 ++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 78e718e4..19d99b3f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,5 @@ from typing import List, Mapping, Optional, Union -import fsspec import h5py import numpy as np import xarray as xr @@ -8,6 +7,7 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: def virtual_vars_from_hdf( path: str, drop_variables: Optional[List[str]] = None, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, ) -> Mapping[str, xr.Variable]: if drop_variables is None: drop_variables = [] - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): @@ -203,9 +207,15 @@ def virtual_vars_from_hdf( return variables -def attrs_from_root_group(path: str): - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") +def attrs_from_root_group( + path: str, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, +): + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") attrs = _extract_attrs(f) return attrs diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8b6a080..8f810ee1 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -107,11 +107,12 @@ def open_virtual_dataset( filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": - print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, drop_variables=drop_variables + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, ) - ds_attrs = attrs_from_root_group(path=filepath) + ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:40:38 -0600 Subject: [PATCH 17/55] Fix incorrect zarr_v3 if block position from merge commit ef0d7a8. 
--- virtualizarr/xarray.py | 128 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 8f810ee1..d76e2a67 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -101,82 +101,86 @@ def open_virtual_dataset( if virtual_array_class is not ManifestArray: raise NotImplementedError() - - if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) - filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": - virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables, - reader_options=reader_options, - ) - ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) else: - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) + if filetype is None: + filetype = _automatically_determine_filetype(filepath=filepath) + filetype = FileType(filetype) + + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, + ) + ds_attrs = attrs_from_root_group( + path=filepath, reader_options=reader_options + ) + else: + # this is the only place we actually always need to use kerchunk directly + # TODO avoid even reading byte ranges for variables that will be dropped later anyway? + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get( + ".zattrs", {} + ) - ds = xr.open_dataset(fpath, drop_variables=drop_variables) + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... 
+ # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation + ds = xr.open_dataset(fpath, drop_variables=drop_variables) - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - vars = {**virtual_vars, **loadable_vars} + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - data_vars, coords = separate_coords(vars, indexes) - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - return vds + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds def open_virtual_dataset_from_v3_store( From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 08:48:05 -0600 Subject: [PATCH 18/55] Fix early return from hdf _extract_attrs. --- virtualizarr/readers/hdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 19d99b3f..be93237f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): continue attrs[n] = v - return attrs + return attrs def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 09:23:50 -0600 Subject: [PATCH 19/55] Test that _extract_attrs correctly handles multiple attributes. 
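The one-line change in the patch above is an indentation fix: the return statement sat inside the attribute loop, so only the first attribute was ever collected, which is exactly what the new multi-attribute test below exercises. A stripped-down illustration of the difference:

    def broken_extract(source: dict) -> dict:
        attrs = {}
        for name, value in source.items():
            attrs[name] = value
            return attrs  # early return: only the first attribute survives


    def fixed_extract(source: dict) -> dict:
        attrs = {}
        for name, value in source.items():
            attrs[name] = value
        return attrs  # returns once the loop has seen every attribute


    assert len(broken_extract({"a": 1, "b": 2})) == 1
    assert len(fixed_extract({"a": 1, "b": 2})) == 2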
--- virtualizarr/tests/test_readers/conftest.py | 3 ++- virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 53c9630e..fe2ec889 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attribute_netcdf4_file(tmpdir): +def string_attributes_netcdf4_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" + f["data"].attrs["attribute_name2"] = "attribute_name2" return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a83bfc39..a67352e6 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_dataset_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] - var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_netcdf4_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_string_attribute(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" @@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file): attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" + def test_multiple_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert len(attrs.keys()) == 2 + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 22 May 2024 14:16:12 -0600 Subject: [PATCH 20/55] Initial attempt at scale and offset via numcodecs. 
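The changes that follow express CF-style scale_factor/add_offset packing as a numcodecs FixedScaleOffset filter; note that the codec's scale is the reciprocal of the CF scale_factor attribute. A small standalone example with illustrative values:

    import numpy as np
    from numcodecs.fixedscaleoffset import FixedScaleOffset

    # Pack float64 values into int16 the way a CF writer with
    # scale_factor=0.01 and add_offset=5.0 would (codec scale = 1 / scale_factor).
    codec = FixedScaleOffset(offset=5.0, scale=1 / 0.01, dtype="<f8", astype="<i2")

    original = np.array([5.0, 5.25, 6.5])
    packed = codec.encode(original)   # int16 values: [0, 25, 150]
    unpacked = codec.decode(packed)   # float64 again
    np.testing.assert_allclose(unpacked, original)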
--- virtualizarr/readers/hdf.py | 14 ++++++++--- virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index be93237f..c251866b 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -5,7 +5,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.readers.hdf_filters import codecs_from_dataset +from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, - dtype=dataset.dtype, + dtype=dtype, fill_value=dataset.fillvalue, filters=filters, order="C", @@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index dfe1c1f3..169eab97 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,10 +1,13 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, TypedDict, Union import h5py import hdf5plugin import numcodecs.registry as registry +import numpy as np from numcodecs.abc import Codec +from numcodecs.fixedscaleoffset import FixedScaleOffset from pydantic import BaseModel, validator +from xarray.coding.variables import _choose_float_dtype _non_standard_filters = {"gzip": "zlib"} @@ -24,6 +27,11 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class CFCodec(TypedDict): + target_dtype: np.dtype + codec: Codec + + def _filter_to_codec( filter_id: str, filter_properties: Union[int, None, Tuple] = None ) -> Codec: @@ -61,6 +69,32 @@ def _filter_to_codec( return codec +def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: + attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} + mapping = {} + if "scale_factor" in attributes: + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + else: + mapping["scale_factor"] = 1 + if "add_offset" in attributes: + mapping["add_offset"] = attributes["add_offset"] + else: + mapping["add_offset"] = 0 + if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: + float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) + target_dtype = np.dtype(float_dtype) + codec = FixedScaleOffset( + offset=mapping["add_offset"], + scale=mapping["scale_factor"], + dtype=target_dtype, + astype=dataset.dtype, + ) + cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) + 
return cfcodec + else: + return None + + def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: codecs = [] for filter_id, filter_properties in dataset._filters.items(): From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 23 May 2024 10:54:48 -0600 Subject: [PATCH 21/55] Tests for cfcodec_from_dataset. --- virtualizarr/tests/test_readers/conftest.py | 10 +++++++ .../tests/test_readers/test_hdf_filters.py | 29 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index fe2ec889..202cdd9c 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/offset.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs.create(name="add_offset", data=5) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 28b5d69f..dca9f40d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,9 +1,11 @@ import h5py import numcodecs +import numpy as np import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, + cfcodec_from_dataset, codecs_from_dataset, ) @@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) assert decoded == np_uncompressed.tobytes() + + +class TestCFCodecFromDataset: + def test_no_cf_convention(self, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) + ds = f["data"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec is None + + def test_cf_scale_factor(self, netcdf4_file): + f = h5py.File(netcdf4_file) + ds = f["air"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec["target_dtype"] == np.dtype(np.float64) + assert cf_codec["codec"].scale == 100.0 + assert cf_codec["codec"].offset == 0 + assert cf_codec["codec"].dtype == " Date: Fri, 24 May 2024 12:47:12 -0600 Subject: [PATCH 22/55] Temporarily relax integration tests to assert_allclose. 
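The relaxation that follows swaps assert_equal for assert_allclose, presumably because the scale/offset decoding reconstructs floats from packed integers and bit-for-bit equality is no longer guaranteed. For reference, the distinction in xarray.testing:

    import numpy as np
    import xarray as xr
    import xarray.testing as xrt

    a = xr.Dataset({"air": ("t", np.array([280.0, 281.25]))})
    b = xr.Dataset({"air": ("t", np.array([280.0, 281.25 + 1e-9]))})

    xrt.assert_allclose(a, b)  # passes: equal within a relative tolerance
    # xrt.assert_equal(a, b)   # would raise: values are not identical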
--- virtualizarr/tests/test_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 064968b3..1b9aad83 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset @@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_open_scalar_variable(tmpdir): From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 13:50:49 -0600 Subject: [PATCH 23/55] Add blosc_lz4 fixture parameterization to confirm libnetcdf environment. --- virtualizarr/tests/test_readers/conftest.py | 13 +++++++++---- .../tests/test_readers/test_hdf_integration.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 202cdd9c..20d5433e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -134,7 +134,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc"]) +@pytest.fixture(params=["gzip", "blosc_lz4"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): f.create_dataset( name="data", data=np_uncompressed, compression="gzip", compression_opts=1 ) - if request.param == "blosc": + if request.param == "blosc_lz4": f.create_dataset( name="data", data=np_uncompressed, @@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_netcdf4_files(tmpdir, request): +@pytest.fixture(params=["gzip", "blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + if "blosc" in request.param: + encoding_config = { + "compression": request.param, + } for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index b31289c0..ade8e7ce 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): + def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, 
filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 14:05:21 -0600 Subject: [PATCH 24/55] Check for compatability with netcdf4 engine. --- virtualizarr/tests/test_readers/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 20d5433e..cb1212f0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 12:58:48 -0600 Subject: [PATCH 25/55] Use separate fixtures for h5netcdf and netcdf4 compression styles. --- virtualizarr/tests/test_readers/conftest.py | 27 ++++++++++++++----- .../test_readers/test_hdf_integration.py | 20 ++++++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cb1212f0..a4fafed3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from xarray.tests.test_dataset import create_test_data @pytest.fixture @@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip", "blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_h5netcdf_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + + for var_name in ds.variables: + encoding[var_name] = encoding_config + + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath + + +@pytest.fixture(params=["blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): + ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { "compression": request.param, + "chunksizes": (20, 40), + "original_shape": ds.var2.shape, + "blosc_shuffle": 1, + "fletcher32": False, } - for var_name in ds.variables: - encoding[var_name] = encoding_config - + ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index ade8e7ce..d6ecf2f1 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,11 @@ class TestIntegration: - def test_filters_roundtrip(self, tmpdir, 
filter_encoded_xarray_netcdf4_file): + def test_filters_h5netcdf_roundtrip( + self, tmpdir, filter_encoded_xarray_h5netcdf_file + ): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + + def test_filters_netcdf4_roundtrip( + self, tmpdir, filter_encoded_xarray_netcdf4_file + ): + virtual_ds = virtualizarr.open_virtual_dataset( + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + print(ds["var2"].encoding) From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 15:49:22 -0600 Subject: [PATCH 26/55] Print libhdf5 and libnetcdf4 versions to confirm compiled environment. --- virtualizarr/tests/test_readers/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index a4fafed3..8904dd38 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -4,6 +4,7 @@ import pytest import xarray as xr from xarray.tests.test_dataset import create_test_data +from xarray.util.print_versions import netcdf_and_hdf5_versions @pytest.fixture @@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" + print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") return filepath From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 17:07:51 -0600 Subject: [PATCH 27/55] Skip netcdf4 style compression tests when libhdf5 < 1.14. 
--- virtualizarr/tests/test_readers/conftest.py | 15 ++++++++++++--- .../test_readers/test_hdf_integration.py | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 8904dd38..0ddb2a01 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from packaging.version import Version from xarray.tests.test_dataset import create_test_data from xarray.util.print_versions import netcdf_and_hdf5_versions @@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request): return filepath +@pytest.fixture() +def skip_test_for_libhdf5_version(): + versions = netcdf_and_hdf5_versions() + libhdf5_version = Version(versions[0][1]) + return libhdf5_version < Version("1.14") + + @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): + if skip_test_for_libhdf5_version: + pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { @@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") - return filepath + return {"filepath": filepath, "compressor": request.param} @pytest.fixture diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index d6ecf2f1..f51ebd45 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip( def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): + filepath = filter_encoded_xarray_netcdf4_file["filepath"] + compressor = filter_encoded_xarray_netcdf4_file["compressor"] virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filepath, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - print(ds["var2"].encoding) + + expected_encoding = ds["var2"].encoding.copy() + compression = expected_encoding.pop("compression") + blosc_shuffle = expected_encoding.pop("blosc_shuffle") + if compression is not None: + if "blosc" in compression and blosc_shuffle: + expected_encoding["blosc"] = { + "compressor": compressor, + "shuffle": blosc_shuffle, + } + expected_encoding["shuffle"] = False + actual_encoding = ds["var2"].encoding + assert expected_encoding.items() <= actual_encoding.items() From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 13:57:51 -0600 Subject: [PATCH 28/55] Include imagecodecs.numcodecs to support HDF5 lzf filters. 
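HDF5's built-in lzf filter has no counterpart in numcodecs itself, which is why the mapping below routes it to the imagecodecs-numcodecs package. A sketch of using that codec directly, assuming imagecodecs-numcodecs is installed and that its default header setting embeds the uncompressed size:

    import numpy as np
    from imagecodecs.numcodecs import Lzf

    codec = Lzf()
    data = np.arange(100, dtype="int32").tobytes()

    # encode() LZF-compresses the buffer; decode() reverses it.
    roundtripped = codec.decode(codec.encode(data))
    assert bytes(roundtripped) == data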
--- pyproject.toml | 1 + virtualizarr/readers/hdf_filters.py | 2 +- virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f0563f09..773cccc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "fsspec", "s3fs", "fastparquet", + "imagecodecs-numcodecs", ] diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 169eab97..08a3bba4 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib"} +_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} class BloscProperties(BaseModel): diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index dca9f40d..b5b04047 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,7 +1,7 @@ import h5py +import imagecodecs import numcodecs import numpy as np -import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, @@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) - def test_lzf_not_available(self): - with pytest.raises(ValueError, match="codec not available"): - _filter_to_codec("lzf") + def test_lzf(self): + codec = _filter_to_codec("lzf") + assert isinstance(codec, imagecodecs.numcodecs.Lzf) def test_blosc(self): codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 16:24:30 -0600 Subject: [PATCH 29/55] Remove test that verifies call to read_kerchunk_references_from_file. --- virtualizarr/tests/test_xarray.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 695759bd..d145550e 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,5 +1,4 @@ from collections.abc import Mapping -from unittest.mock import patch import numpy as np import pytest @@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file): for name in full_ds.variables: if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 15:05:34 -0600 Subject: [PATCH 30/55] Add additional codec support structures for imagecodecs and numcodecs. 
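For dynamically loaded plugin filters the reader looks the numeric HDF5 filter id up through hdf5plugin and then branches on the returned filter name, as the next diff shows. A quick illustration; id 32015 is the registered Zstandard filter:

    import hdf5plugin

    # Map a registered HDF5 filter id back to its plugin and name.
    zstd_filter = hdf5plugin.get_filters(32015)[0]
    print(zstd_filter.filter_name)  # "zstd"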
--- virtualizarr/readers/hdf_filters.py | 23 +++++++++++++++++---- virtualizarr/tests/test_readers/conftest.py | 9 +++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 08a3bba4..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,12 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} +_non_standard_filters = { + "gzip": "zlib", + "lzf": "imagecodecs_lzf", +} + +_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} class BloscProperties(BaseModel): @@ -27,6 +32,10 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class ZstdProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -41,18 +50,20 @@ def _filter_to_codec( id_int = int(filter_id) except ValueError: id_str = filter_id - + conf = {} if id_str: if id_str in _non_standard_filters.keys(): id = _non_standard_filters[id_str] else: id = id_str - conf = {"id": id} + conf["id"] = id # type: ignore[assignment] if id == "zlib": conf["level"] = filter_properties # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name + if id in _hdf5plugin_imagecodecs.keys(): + id = _hdf5plugin_imagecodecs[id] if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties( **{ @@ -63,7 +74,11 @@ def _filter_to_codec( } ) conf = blosc_props.model_dump() # type: ignore[assignment] - conf["id"] = id + if id == "zstd" and isinstance(filter_properties, tuple): + zstd_props = ZstdProperties(level=filter_properties[0]) + conf = zstd_props.model_dump() # type: ignore[assignment] + + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 0ddb2a01..3e6f9c3f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): data=np_uncompressed, **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), ) + if request.param == "lz4": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) + if request.param == "bzip2": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) + if request.param == "zstd": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + return filepath From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 16:17:04 -0600 Subject: [PATCH 31/55] Add codec config test for Zstd. 
--- virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index b5b04047..4d23a756 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -31,6 +31,12 @@ def test_blosc(self): } assert codec.get_config() == expected_config + def test_zstd(self): + codec = _filter_to_codec("32015", (5,)) + assert isinstance(codec, numcodecs.zstd.Zstd) + expected_config = {"id": "zstd", "level": 5} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 18:03:26 -0600 Subject: [PATCH 32/55] Include initial cf decoding tests. --- virtualizarr/readers/hdf_filters.py | 3 +- virtualizarr/tests/test_readers/conftest.py | 34 ++++++++++++++++--- .../tests/test_readers/test_hdf_filters.py | 28 +++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..f4e2dcfa 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"] + # mapping["scale_factor"] =attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 3e6f9c3f..e1a53c5e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve @pytest.fixture -def add_offset_netcdf4_file(tmpdir): +def np_uncompressed_int16(): + return np.arange(100, dtype=np.int16) + + +@pytest.fixture +def offset(): + return np.float32(5.0) + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f["data"].attrs.create(name="add_offset", data=5) + data = np_uncompressed_int16 - offset + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + return filepath + + +@pytest.fixture +def scale_factor(): + return 0.01 + + +@pytest.fixture +def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): + filepath = f"{tmpdir}/scale_offset.nc" + f = h5py.File(filepath, "w") + data = (np_uncompressed_int16 - offset) / scale_factor + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 4d23a756..960bcf2c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ 
b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file): assert cf_codec["codec"].scale == 1 assert cf_codec["codec"].offset == 5 assert cf_codec["codec"].dtype == " Date: Thu, 20 Jun 2024 19:49:54 -0600 Subject: [PATCH 33/55] Revert typo for scale_factor retrieval. --- virtualizarr/readers/hdf_filters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index f4e2dcfa..667ff09a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"] - # mapping["scale_factor"] =attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 20:12:44 -0600 Subject: [PATCH 34/55] Update reader to use new numpy manifest representation. --- virtualizarr/readers/hdf.py | 29 ++++++++++----------- virtualizarr/tests/test_readers/test_hdf.py | 4 +-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c251866b..b96bdff7 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry} + chunk_entries = {chunk_key: chunk_entry.dict()} chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") + paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty((num_chunks), dtype=np.int32) + lengths = np.empty((num_chunks), dtype=np.int32) - chunk_entries = dict() - - def get_key(blob): - key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] - key = ".".join(map(str, key_list)) - return key - - def store_chunk_entry(blob): - chunk_entries[get_key(blob)] = ChunkEntry( - path=path, offset=blob.byte_offset, length=blob.size - ) + def add_chunk_info(blob, chunk_index): + offsets[chunk_index] = blob.byte_offset + lengths[chunk_index] = blob.size + chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - dsid.chunk_iter(store_chunk_entry) + chunk_index = 0 + dsid.chunk_iter(add_chunk_info, chunk_index) else: for index in range(num_chunks): - store_chunk_entry(dsid.get_chunk_info(index)) + add_chunk_info(dsid.get_chunk_info(index), index) - chunk_manifest = ChunkManifest(entries=chunk_entries) + chunk_manifest = ChunkManifest.from_arrays( + paths=paths, offsets=offsets, lengths=lengths + ) return chunk_manifest diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a67352e6..8c5a40a7 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = 
h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 1 + assert len(manifest) == 1 def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 4 + assert len(manifest) == 4 class TestDatasetDims: From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:05:01 -0600 Subject: [PATCH 35/55] Temporarily skip test until blosc netcdf4 issue is solved. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f51ebd45..dca34dbd 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,5 +1,6 @@ import fsspec import numpy +import pytest import xarray as xr import virtualizarr @@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip( ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + @pytest.mark.skip( + reason="Issue with xr 'dim1' serialization and blosc availability" + ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:24:07 -0600 Subject: [PATCH 36/55] Fix Pydantic 2 migration warnings. --- virtualizarr/readers/hdf_filters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09a..cc8e810e 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,7 +6,7 @@ import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from xarray.coding.variables import _choose_float_dtype _non_standard_filters = { @@ -23,7 +23,7 @@ class BloscProperties(BaseModel): shuffle: int cname: str - @validator("cname", pre=True) + @field_validator("cname", mode="before") def get_cname_from_code(cls, v): blosc_compressor_codes = { value: key @@ -69,7 +69,7 @@ def _filter_to_codec( **{ k: v for k, v in zip( - BloscProperties.__fields__.keys(), filter_properties[-4:] + BloscProperties.model_fields.keys(), filter_properties[-4:] ) } ) From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:30:55 -0600 Subject: [PATCH 37/55] Include hdf5plugin and imagecodecs-numcodecs in mamba test environment. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 0385ea5a..e909beec 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,6 +14,7 @@ dependencies: - ujson - packaging - universal_pathlib + - hdf5plugin # Testing - codecov - pre-commit @@ -26,3 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet + - imagecodecs-numcodecs From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:35:24 -0600 Subject: [PATCH 38/55] Mamba attempt with imagecodecs rather than imagecodecs-numcodecs. 
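The pydantic change in patch 36 above replaces the deprecated v1 validator with v2's field_validator. A self-contained sketch of the v2 form; the model here is a made-up stand-in rather than the real BloscProperties:

    from pydantic import BaseModel, field_validator


    class CompressorName(BaseModel):
        cname: str

        # v1's @validator("cname", pre=True) becomes
        # @field_validator("cname", mode="before") in pydantic v2.
        @field_validator("cname", mode="before")
        @classmethod
        def cname_from_code(cls, v):
            return {1: "lz4"}.get(v, v)


    assert CompressorName(cname=1).cname == "lz4"
    assert CompressorName(cname="zstd").cname == "zstd"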
--- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index e909beec..20784a6e 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs-numcodecs + - imagecodecs From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:41:14 -0600 Subject: [PATCH 39/55] Mamba attempt with latest imagecodecs release. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 20784a6e..fb967bcd 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs + - imagecodecs>=2024.6.1 From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:34:35 -0600 Subject: [PATCH 40/55] Use correct iter_chunks callback function signtature. --- virtualizarr/readers/hdf.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index b96bdff7..d082b717 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: def add_chunk_info(blob, chunk_index): offsets[chunk_index] = blob.byte_offset lengths[chunk_index] = blob.size - chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - chunk_index = 0 - dsid.chunk_iter(add_chunk_info, chunk_index) + + def create_callback(initial=0): + value = initial + + def callback(blob): + nonlocal value + add_chunk_info(blob, chunk_index=value) + value += 1 + + return callback + + callback = create_callback() + dsid.chunk_iter(callback) else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index), index) From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:35:40 -0600 Subject: [PATCH 41/55] Include pip based imagecodecs-numcodecs until conda-forge availability. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index fb967bcd..e2f5a865 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -28,3 +28,5 @@ dependencies: - s3fs - fastparquet - imagecodecs>=2024.6.1 + - pip: + - imagecodecs-numcodecs From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 15:43:10 -0600 Subject: [PATCH 42/55] Handle non-coordinate dims which are serialized to hdf as empty dataset. --- virtualizarr/readers/hdf.py | 65 ++++++++++++--------- virtualizarr/tests/test_integration.py | 18 +++++- virtualizarr/tests/test_readers/test_hdf.py | 1 + virtualizarr/xarray.py | 2 +- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d082b717..cbbe824f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -11,7 +11,9 @@ from virtualizarr.zarr import ZArray -def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: +def _dataset_chunk_manifest( + path: str, dataset: h5py.Dataset +) -> Optional[ChunkManifest]: """ Generate ChunkManifest for HDF5 dataset. 
@@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: if dataset.chunks is None: if dsid.get_offset() is None: - raise ValueError("Dataset has no space allocated in the file") + return None else: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) @@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): return attrs -def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.append(cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - else: - dtype = dataset.dtype - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=dataset.fillvalue, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) + manifest = _dataset_chunk_manifest(path, dataset) - marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + if manifest: + chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype + filters = [codec.get_config() for codec in codecs] + zarray = ZArray( + chunks=chunks, + compressor=None, + dtype=dtype, + fill_value=dataset.fillvalue, + filters=filters, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + else: + variable = None return variable @@ -217,7 +223,8 @@ def virtual_vars_from_hdf( if key not in drop_variables: if isinstance(f[key], h5py.Dataset): variable = _dataset_to_variable(path, f[key]) - variables[key] = variable + if variable is not None: + variables[key] = variable else: raise NotImplementedError("Nested groups are not yet supported") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 451862c6..6a1f91ef 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset + # assert all_close to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", 
decode_times=False ) - # assert identical to original dataset - xrt.assert_identical(roundtrip, ds) + # assert all_close to original dataset + xrt.assert_allclose(roundtrip, ds) + + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 @@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format): # assert equal to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100 diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 8c5a40a7..c744cd68 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file): with pytest.raises(ValueError, match="chunked but contains no chunks"): _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") def test_empty_dataset(self, empty_dataset_netcdf4_file): f = h5py.File(empty_dataset_netcdf4_file) ds = f["data"] diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 39bd0671..a8a23693 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -121,7 +121,7 @@ def open_virtual_dataset( ds_attrs = attrs_from_root_group( path=filepath, reader_options=reader_options ) - coord_names = None + coord_names = ds_attrs.pop("coordinates", []) else: # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 16:23:43 -0600 Subject: [PATCH 43/55] Use reader_options for filetype check and update failing kerchunk call. 
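Note: with this change, reader_options is threaded through both the filetype check and the HDF reader instead of only the kerchunk path. A minimal usage sketch (the option keys below are placeholders, not options defined by VirtualiZarr, and the file path is hypothetical):

    from virtualizarr import open_virtual_dataset

    # reader_options is passed through unchanged to
    # _automatically_determine_filetype and to the HDF reader.
    reader_options = {"option1": "value1", "option2": "value2"}
    vds = open_virtual_dataset(
        "air.nc",  # hypothetical local netCDF4 file
        indexes={},
        reader_options=reader_options,
    )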
--- virtualizarr/tests/test_xarray.py | 18 +++++++++++++----- virtualizarr/xarray.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583bf..282d4ad1 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -8,6 +8,7 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset +from virtualizarr.kerchunk import FileType from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import network, requires_s3fs from virtualizarr.zarr import ZArray @@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") + @patch("virtualizarr.xarray._automatically_determine_filetype") + @patch("virtualizarr.xarray.virtual_vars_from_hdf") def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file + self, mock_reader, mock_determine_filetype, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} + mock_determine_filetype.return_value = FileType.netcdf4 open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { + reader_args = { + "path": netcdf4_file, + "drop_variables": [], + "reader_options": reader_options, + } + mock_reader.assert_called_once_with(**reader_args) + filetype_args = { "filepath": netcdf4_file, - "filetype": None, "reader_options": reader_options, } - mock_read_kerchunk.assert_called_once_with(**args) + mock_determine_filetype.assert_called_once_with(**filetype_args) class TestRenamePaths: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index a8a23693..86a59c8d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -109,7 +109,9 @@ def open_virtual_dataset( ) else: if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:20:06 -0600 Subject: [PATCH 44/55] Fix chunkmanifest shaping for chunked datasets. 
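Note: the manifest path/offset/length arrays are now shaped like the chunk grid rather than kept flat, using ceiling division so a partial edge chunk still gets its own grid cell. A small sketch of that computation with the shapes used in the tests below:

    import math

    # A (100, 100) dataset stored with (50, 50) chunks has a (2, 2) chunk grid.
    shape, chunks = (100, 100), (50, 50)
    assert tuple(math.ceil(a / b) for a, b in zip(shape, chunks)) == (2, 2)

    # The chunked roundtrip fixture: (20, 80) data with (10, 10) chunks -> (2, 8).
    shape, chunks = (20, 80), (10, 10)
    assert tuple(math.ceil(a / b) for a, b in zip(shape, chunks)) == (2, 8)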
--- virtualizarr/readers/hdf.py | 36 +++++++++------------ virtualizarr/tests/test_readers/test_hdf.py | 10 ++++-- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index cbbe824f..d683f693 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,3 +1,4 @@ +import math from typing import List, Mapping, Optional, Union import h5py @@ -48,32 +49,27 @@ def _dataset_chunk_manifest( num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") - paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty((num_chunks), dtype=np.int32) - lengths = np.empty((num_chunks), dtype=np.int32) - def add_chunk_info(blob, chunk_index): - offsets[chunk_index] = blob.byte_offset - lengths[chunk_index] = blob.size + shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) + paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty(shape, dtype=np.int32) + lengths = np.empty(shape, dtype=np.int32) - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - - def create_callback(initial=0): - value = initial + def get_key(blob): + return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]) - def callback(blob): - nonlocal value - add_chunk_info(blob, chunk_index=value) - value += 1 + def add_chunk_info(blob): + key = get_key(blob) + paths[key] = path + offsets[key] = blob.byte_offset + lengths[key] = blob.size - return callback - - callback = create_callback() - dsid.chunk_iter(callback) + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(add_chunk_info) else: for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index), index) + add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( paths=paths, offsets=offsets, lengths=lengths diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index c744cd68..25caab93 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest) == 1 + assert manifest.shape_chunk_grid == (1, 1) def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest) == 4 + assert manifest.shape_chunk_grid == (2, 2) + + def test_chunked_roundtrip(self, chunked_roundtrip): + f = h5py.File(chunked_roundtrip) + ds = f["var2"] + manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:22:05 -0600 Subject: [PATCH 45/55] Handle scale_factor attribute serialization for compressed files. 
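Note: depending on how the file was written, the scale_factor attribute can come back as a length-one array or as a scalar, so the lookup now falls back on IndexError. A minimal sketch of the same fallback outside the reader (the helper name is illustrative):

    import numpy as np

    def first_or_value(attr_value):
        # Mirrors the try/except used in cfcodec_from_dataset: take the first
        # element when the attribute is array-valued, otherwise use it as is.
        try:
            return attr_value[0]
        except IndexError:
            return attr_value

    assert first_or_value(np.array([0.1])) == 0.1  # array-valued attribute
    assert first_or_value(np.array(0.1)) == 0.1    # scalar (0-d) attribute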
--- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index cc8e810e..1a3c2220 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + try: + scale_factor = attributes["scale_factor"][0] + except IndexError: + scale_factor = attributes["scale_factor"] + mapping["scale_factor"] = 1 / scale_factor else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:13 -0600 Subject: [PATCH 46/55] Include chunked roundtrip fixture. --- virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index e1a53c5e..5fbec00e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve "blosc_shuffle": 1, "fletcher32": False, } - + # Check on how handle scalar dim. + ds = ds.drop_dims("dim3") ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="netcdf4") @@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f f["data"].attrs.create(name="add_offset", data=offset) f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath + + +@pytest.fixture() +def chunked_roundtrip(tmpdir): + ds = create_test_data(dim_sizes=(20, 80, 10)) + ds = ds.drop_dims("dim3") + filepath = f"{tmpdir}/chunked_xarray.nc" + ds.to_netcdf( + filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} + ) + return filepath From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:48 -0600 Subject: [PATCH 47/55] Standardize xarray integration tests for hdf filters. 
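Note: the integration tests below now all share one roundtrip pattern. A condensed sketch of it (the input path is a placeholder for the filter-encoded fixture files used in the tests):

    import xarray as xr
    import xarray.testing as xrt
    import virtualizarr

    filepath = "filtered.nc"  # placeholder fixture path
    ds = xr.open_dataset(filepath)
    vds = virtualizarr.open_virtual_dataset(filepath)
    # Write kerchunk JSON references, then re-open them and compare.
    vds.virtualize.to_kerchunk("refs.json", format="json")
    roundtrip = xr.open_dataset("refs.json", engine="kerchunk")
    xrt.assert_allclose(ds, roundtrip)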
--- .../test_readers/test_hdf_integration.py | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dca34dbd..abc23df6 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,53 +1,34 @@ -import fsspec -import numpy import pytest import xarray as xr +import xarray.testing as xrt import virtualizarr from virtualizarr.kerchunk import FileType class TestIntegration: + @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - virtual_ds = virtualizarr.open_virtual_dataset( + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + vds = virtualizarr.open_virtual_dataset( filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - - ds = xr.open_dataset(m, engine="kerchunk") - assert isinstance(ds.air.values[0][0][0], numpy.float64) + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset( + kerchunk_file, engine="kerchunk", decode_times=False + ) + xrt.assert_allclose(ds, roundtrip) - @pytest.mark.skip( - reason="Issue with xr 'dim1' serialization and blosc availability" - ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): filepath = filter_encoded_xarray_netcdf4_file["filepath"] - compressor = filter_encoded_xarray_netcdf4_file["compressor"] - virtual_ds = virtualizarr.open_virtual_dataset( - filepath, filetype=FileType("netcdf4") - ) + ds = xr.open_dataset(filepath) + vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - - expected_encoding = ds["var2"].encoding.copy() - compression = expected_encoding.pop("compression") - blosc_shuffle = expected_encoding.pop("blosc_shuffle") - if compression is not None: - if "blosc" in compression and blosc_shuffle: - expected_encoding["blosc"] = { - "compressor": compressor, - "shuffle": blosc_shuffle, - } - expected_encoding["shuffle"] = False - actual_encoding = ds["var2"].encoding - assert expected_encoding.items() <= actual_encoding.items() + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 20:00:56 -0600 Subject: [PATCH 48/55] Update reader selection logic for new filetype determination. 
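Note: with the new detection, both netCDF4 and plain HDF5 files should be routed to the HDF reader. A minimal sketch of the dispatch condition added below; it assumes FileType defines an hdf5 member alongside netcdf4 (added by the filetype-detection work, not shown in this patch):

    from virtualizarr.kerchunk import FileType

    def uses_hdf_reader(filetype: FileType) -> bool:
        # Equivalent to the branch in open_virtual_dataset after this change.
        return filetype.name.lower() in ("netcdf4", "hdf5")

    assert uses_hdf_reader(FileType("netcdf4"))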
--- virtualizarr/xarray.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1a795e56..9671264d 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -136,8 +136,7 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": + if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5": virtual_vars = virtual_vars_from_hdf( path=filepath, drop_variables=drop_variables, From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:07:17 -0600 Subject: [PATCH 49/55] Use decode_times for integration test. --- .../tests/test_readers/test_hdf_integration.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index abc23df6..882dea31 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -7,19 +6,18 @@ class TestIntegration: - @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, + loadable_variables=["time"], + cftime_variables=["time"], ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset( - kerchunk_file, engine="kerchunk", decode_times=False - ) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:38:46 -0600 Subject: [PATCH 50/55] Standardize fixture names for hdf5 vs netcdf4 file types. 
--- virtualizarr/tests/test_readers/conftest.py | 36 +++++---- virtualizarr/tests/test_readers/test_hdf.py | 78 +++++++++---------- .../tests/test_readers/test_hdf_filters.py | 26 +++---- .../test_readers/test_hdf_integration.py | 10 +-- 4 files changed, 76 insertions(+), 74 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 5fbec00e..539b2fbb 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -9,7 +9,7 @@ @pytest.fixture -def empty_chunks_netcdf4_file(tmpdir): +def empty_chunks_hdf5_file(tmpdir): ds = xr.Dataset({"data": []}) filepath = f"{tmpdir}/empty_chunks.nc" ds.to_netcdf(filepath, engine="h5netcdf") @@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir): @pytest.fixture -def empty_dataset_netcdf4_file(tmpdir): +def empty_dataset_hdf5_file(tmpdir): filepath = f"{tmpdir}/empty_dataset.nc" f = h5py.File(filepath, "w") f.create_dataset("data", shape=(0,), dtype="f") @@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir): @pytest.fixture -def no_chunks_netcdf4_file(tmpdir): +def no_chunks_hdf5_file(tmpdir): filepath = f"{tmpdir}/no_chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir): @pytest.fixture -def chunked_netcdf4_file(tmpdir): +def chunked_hdf5_file(tmpdir): filepath = f"{tmpdir}/chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) @@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir): @pytest.fixture -def single_dimension_scale_netcdf4_file(tmpdir): +def single_dimension_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/single_dimension_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir): @pytest.fixture -def is_scale_netcdf4_file(tmpdir): +def is_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/is_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir): @pytest.fixture -def multiple_dimension_scales_netcdf4_file(tmpdir): +def multiple_dimension_scales_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_dimension_scales.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attributes_netcdf4_file(tmpdir): +def string_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir): @pytest.fixture -def root_attributes_netcdf4_file(tmpdir): +def root_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/root_attributes.nc" f = h5py.File(filepath, "w") f.attrs["attribute_name"] = "attribute_name" @@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir): @pytest.fixture -def group_netcdf4_file(tmpdir): +def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") f.create_group("group") @@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir): @pytest.fixture -def multiple_datasets_netcdf4_file(tmpdir): +def multiple_datasets_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_datasets.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -138,7 +138,7 @@ def np_uncompressed(): @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) -def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): +def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = 
f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": @@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): @pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_h5netcdf_file(tmpdir, request): +def filter_encoded_roundtrip_hdf5_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": @@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version(): @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): +def filter_encoded_roundtrip_netcdf4_file( + tmpdir, request, skip_test_for_libhdf5_version +): if skip_test_for_libhdf5_version: pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) @@ -215,7 +217,7 @@ def offset(): @pytest.fixture -def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): +def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") data = np_uncompressed_int16 - offset @@ -230,7 +232,7 @@ def scale_factor(): @pytest.fixture -def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): +def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): filepath = f"{tmpdir}/scale_offset.nc" f = h5py.File(filepath, "w") data = (np_uncompressed_int16 - offset) / scale_factor @@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f @pytest.fixture() -def chunked_roundtrip(tmpdir): +def chunked_roundtrip_hdf5_file(tmpdir): ds = create_test_data(dim_sizes=(20, 80, 10)) ds = ds.drop_dims("dim3") filepath = f"{tmpdir}/chunked_xarray.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 25caab93..1fb0f6ee 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -11,59 +11,59 @@ class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_netcdf4_file): - f = h5py.File(empty_chunks_netcdf4_file) + def test_empty_chunks(self, empty_chunks_hdf5_file): + f = h5py.File(empty_chunks_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="chunked but contains no chunks"): - _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds) @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") - def test_empty_dataset(self, empty_dataset_netcdf4_file): - f = h5py.File(empty_dataset_netcdf4_file) + def test_empty_dataset(self, empty_dataset_hdf5_file): + f = h5py.File(empty_dataset_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="no space allocated in the file"): - _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds) - def test_no_chunking(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_chunking(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (1, 1) - def test_chunked(self, chunked_netcdf4_file): - f = h5py.File(chunked_netcdf4_file) + def test_chunked(self, chunked_hdf5_file): + f = h5py.File(chunked_hdf5_file) ds = f["data"] - 
manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 2) - def test_chunked_roundtrip(self, chunked_roundtrip): - f = h5py.File(chunked_roundtrip) + def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): + f = h5py.File(chunked_roundtrip_hdf5_file) ds = f["var2"] - manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "x" - def test_is_dimension_scale(self, is_scale_netcdf4_file): - f = h5py.File(is_scale_netcdf4_file) + def test_is_dimension_scale(self, is_scale_hdf5_file): + f = h5py.File(is_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "data" - def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): - f = h5py.File(multiple_dimension_scales_netcdf4_file) + def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): + f = h5py.File(multiple_dimension_scales_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="dimension scales attached"): _dataset_dims(ds) - def test_no_dimension_scales(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_dimension_scales(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims == ["phony_dim_0", "phony_dim_1"] @@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) assert var.chunks == (50, 50) - def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] - var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_dataset_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] - var = _dataset_to_variable(string_attributes_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_hdf5_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_string_attribute(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" - def test_root_attribute(self, root_attributes_netcdf4_file): - f = h5py.File(root_attributes_netcdf4_file) + def test_root_attribute(self, root_attributes_hdf5_file): + f = h5py.File(root_attributes_hdf5_file) attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" - def 
test_multiple_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_multiple_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert len(attrs.keys()) == 2 @@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_netcdf4_file): + def test_groups_not_implemented(self, group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file) + virtual_vars_from_hdf(group_hdf5_file) - def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) + def test_drop_variables(self, multiple_datasets_hdf5_file): + variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 960bcf2c..99b3af48 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -39,12 +39,12 @@ def test_zstd(self): class TestCodecsFromDataSet: - def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, "rb") as file: + with open(filter_encoded_hdf5_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) @@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): class TestCFCodecFromDataset: - def test_no_cf_convention(self, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_no_cf_convention(self, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] cf_codec = cfcodec_from_dataset(ds) assert cf_codec is None @@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file): assert cf_codec["codec"].dtype == " Date: Sun, 30 Jun 2024 22:14:26 -0600 Subject: [PATCH 51/55] Handle array add_offset property for compressed data. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 1a3c2220..5b35d8ff 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: else: mapping["scale_factor"] = 1 if "add_offset" in attributes: - mapping["add_offset"] = attributes["add_offset"] + try: + offset = attributes["add_offset"][0] + except IndexError: + offset = attributes["add_offset"] + mapping["add_offset"] = offset else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 16:57:11 -0600 Subject: [PATCH 52/55] Include h5py shuffle filter. 
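Note: the shuffle filter's properties arrive as a tuple whose first element is treated as the element size in bytes, and that maps onto numcodecs' Shuffle codec. A small sketch mirroring the new unit test:

    import numcodecs

    # ("shuffle", (7,)) from the HDF5 filter pipeline becomes a numcodecs
    # Shuffle codec with elementsize=7.
    codec = numcodecs.Shuffle(elementsize=7)
    assert codec.get_config() == {"id": "shuffle", "elementsize": 7}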
--- virtualizarr/readers/hdf_filters.py | 18 ++++++++++++++---- .../tests/test_readers/test_hdf_filters.py | 11 ++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 5b35d8ff..a60dd56a 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -36,6 +36,14 @@ class ZstdProperties(BaseModel): level: int +class ShuffleProperties(BaseModel): + elementsize: int + + +class ZlibProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -56,9 +64,13 @@ def _filter_to_codec( id = _non_standard_filters[id_str] else: id = id_str - conf["id"] = id # type: ignore[assignment] if id == "zlib": - conf["level"] = filter_properties # type: ignore[assignment] + zlib_props = ZlibProperties(level=filter_properties) + conf = zlib_props.model_dump() # type: ignore[assignment] + if id == "shuffle" and isinstance(filter_properties, tuple): + shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) + conf = shuffle_props.model_dump() # type: ignore[assignment] + conf["id"] = id # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name @@ -77,9 +89,7 @@ def _filter_to_codec( if id == "zstd" and isinstance(filter_properties, tuple): zstd_props = ZstdProperties(level=filter_properties[0]) conf = zstd_props.model_dump() # type: ignore[assignment] - conf["id"] = id - codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 99b3af48..efaad781 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -37,6 +37,12 @@ def test_zstd(self): expected_config = {"id": "zstd", "level": 5} assert codec.get_config() == expected_config + def test_shuffle(self): + codec = _filter_to_codec("shuffle", (7,)) + assert isinstance(codec, numcodecs.shuffle.Shuffle) + expected_config = {"id": "shuffle", "elementsize": 7} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): @@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) - assert decoded == np_uncompressed.tobytes() + if isinstance(decoded, np.ndarray): + assert decoded.tobytes() == np_uncompressed.tobytes() + else: + assert decoded == np_uncompressed.tobytes() class TestCFCodecFromDataset: From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 17:03:46 -0600 Subject: [PATCH 53/55] Make ScaleAndOffset codec last in filters list. 
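Note: in the zarr v2 model, filters are applied in list order when encoding and in reverse order when decoding, so inserting the CF codec at index 0 means the compression filter is undone first on read and the scale/offset decode runs last (it is the last codec applied when decoding, even though it sits first in the list). A hedged sketch of that ordering with illustrative codec parameters, not the exact configuration the reader builds:

    import numcodecs
    import numpy as np

    scale_offset = numcodecs.FixedScaleOffset(
        offset=273.15, scale=10, dtype="<f8", astype="<i2"
    )
    zlib = numcodecs.Zlib(level=9)
    filters = [scale_offset, zlib]  # CF codec first, as insert(0, ...) produces

    data = np.array([273.15, 274.25])
    # Encode left-to-right, decode right-to-left (zlib first, scale/offset last).
    encoded = filters[1].encode(filters[0].encode(data))
    decoded = filters[0].decode(filters[1].decode(encoded))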
--- virtualizarr/readers/hdf.py | 2 +- virtualizarr/tests/test_readers/conftest.py | 36 ++++++++++++++++++- .../test_readers/test_hdf_integration.py | 10 ++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d683f693..f3337c04 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab cfcodec = cfcodec_from_dataset(dataset) attrs = _extract_attrs(dataset) if cfcodec: - codecs.append(cfcodec["codec"]) + codecs.insert(0, cfcodec["codec"]) dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 539b2fbb..afc0beea 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) if request.param == "zstd": f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + if request.param == "shuffle": + f.create_dataset(name="data", data=np_uncompressed, shuffle=True) return filepath @@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir): filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} ) return filepath + + +@pytest.fixture(params=["gzip", "zlib"]) +def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): + x = np.arange(100) + y = np.arange(100) + temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + ds = xr.Dataset( + {"temperature": (["x", "y"], temperature)}, + coords={"x": np.arange(100), "y": np.arange(100)}, + ) + encoding = { + "temperature": { + "dtype": "int16", + "scale_factor": 0.1, + "add_offset": 273.15, + } + } + if request.param == "gzip": + encoding["temperature"]["compression"] = "gzip" + encoding["temperature"]["compression_opts"] = 7 + + if request.param == "zlib": + encoding["temperature"]["zlib"] = True + encoding["temperature"]["complevel"] = 9 + + from random import randint + + filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 4fc7bd3e..dd8d6c3b 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,3 +1,4 @@ +import pytest import xarray as xr import xarray.testing as xrt @@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip( vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) + + @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") + def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = 
virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_allclose(ds, roundtrip) From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 13:23:23 -0600 Subject: [PATCH 54/55] Apply ScaleAndOffset codec to _FillValue since its value is now downstream. --- virtualizarr/readers/hdf.py | 4 +++- virtualizarr/tests/test_readers/conftest.py | 7 ++++++- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index f3337c04..6197067f 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) + fill_value = cfcodec["codec"].decode(dataset.fillvalue) else: dtype = dataset.dtype + fill_value = dataset.fillvalue filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, dtype=dtype, - fill_value=dataset.fillvalue, + fill_value=fill_value, filters=filters, order="C", shape=dataset.shape, diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index afc0beea..ec4132ba 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir): def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): x = np.arange(100) y = np.arange(100) + fill_value = np.int16(-9999) temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + temperature[0][0] = fill_value ds = xr.Dataset( {"temperature": (["x", "y"], temperature)}, coords={"x": np.arange(100), "y": np.arange(100)}, ) @@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): "dtype": "int16", "scale_factor": 0.1, "add_offset": 273.15, - } + "_FillValue": fill_value, + }, + "x": {"_FillValue": fill_value}, + "y": {"_FillValue": fill_value}, } if request.param == "gzip": encoding["temperature"]["compression"] = "gzip" encoding["temperature"]["compression_opts"] = 7 diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dd8d6c3b..5cf3f79d 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip( roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_allclose(ds, roundtrip) From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 15:12:04 -0600 Subject: [PATCH 55/55] Coerce scale and add_offset values to native float for JSON serialization.
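Note: numpy scalar attribute values (for example a float32 scale_factor) are not natively JSON serializable, which breaks writing the kerchunk references, so the codec mapping values are coerced to built-in floats. A tiny sketch of the failure mode with the stdlib encoder, assuming a float32 attribute:

    import json

    import numpy as np

    scale_factor = np.float32(0.1)
    try:
        json.dumps({"scale_factor": scale_factor})  # TypeError: float32 is not JSON serializable
    except TypeError:
        pass
    json.dumps({"scale_factor": float(1 / scale_factor)})  # plain float serializes fine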
--- virtualizarr/readers/hdf_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a60dd56a..ae232fec 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: scale_factor = attributes["scale_factor"][0] except IndexError: scale_factor = attributes["scale_factor"] - mapping["scale_factor"] = 1 / scale_factor + mapping["scale_factor"] = float(1 / scale_factor) else: mapping["scale_factor"] = 1 if "add_offset" in attributes: @@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: offset = attributes["add_offset"][0] except IndexError: offset = attributes["add_offset"] - mapping["add_offset"] = offset + mapping["add_offset"] = float(offset) else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: