Merge remote-tracking branch 'upstream/hdf5_reader' into codecs
Tria McNeely committed Jul 18, 2024
2 parents c051f04 + 9ef1362 commit 7a65fbd
Showing 12 changed files with 1,023 additions and 28 deletions.
4 changes: 4 additions & 0 deletions ci/environment.yml
@@ -14,6 +14,7 @@ dependencies:
- ujson
- packaging
- universal_pathlib
- hdf5plugin
# Testing
- codecov
- pre-commit
@@ -26,7 +27,10 @@ dependencies:
- fsspec
- s3fs
- fastparquet
- imagecodecs>=2024.6.1
# for opening tiff files
- tifffile
# for opening FITS files
- astropy
- pip:
- imagecodecs-numcodecs
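
The new hdf5plugin dependency registers third-party HDF5 compression filters (Blosc, Zstd, LZ4, and others) with h5py at import time, so compressed chunks can be decoded when reading. A minimal sketch, assuming a hypothetical compressed file:

```python
import hdf5plugin  # noqa: F401 -- importing registers the extra filters
import h5py

# Reading a dataset compressed with, e.g., Blosc now works transparently.
with h5py.File("compressed.h5", "r") as f:  # hypothetical file
    data = f["var"][:]
```
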
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
"ujson",
"packaging",
"universal-pathlib",
"hdf5plugin",
]

[project.optional-dependencies]
@@ -45,6 +46,7 @@ test = [
"fsspec",
"s3fs",
"fastparquet",
"imagecodecs-numcodecs",
]


243 changes: 243 additions & 0 deletions virtualizarr/readers/hdf.py
@@ -0,0 +1,243 @@
import math
from typing import List, Mapping, Optional, Union

import h5py
import numpy as np
import xarray as xr

from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
from virtualizarr.types import ChunkKey
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.zarr import ZArray


def _dataset_chunk_manifest(
path: str, dataset: h5py.Dataset
) -> Optional[ChunkManifest]:
"""
Generate ChunkManifest for HDF5 dataset.
Parameters
----------
path: str
The path the HDF5 container file
dset : h5py.Dataset
HDF5 dataset for which to create a ChunkManifest
Returns
-------
ChunkManifest
A Virtualizarr ChunkManifest
"""
dsid = dataset.id

if dataset.chunks is None:
if dsid.get_offset() is None:
return None
else:
key_list = [0] * (len(dataset.shape) or 1)
key = ".".join(map(str, key_list))
chunk_entry = ChunkEntry(
path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
)
chunk_key = ChunkKey(key)
chunk_entries = {chunk_key: chunk_entry.dict()}
chunk_manifest = ChunkManifest(entries=chunk_entries)
return chunk_manifest
else:
num_chunks = dsid.get_num_chunks()
if num_chunks == 0:
raise ValueError("The dataset is chunked but contains no chunks")

shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore
offsets = np.empty(shape, dtype=np.int32)
lengths = np.empty(shape, dtype=np.int32)

def get_key(blob):
return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)])

def add_chunk_info(blob):
key = get_key(blob)
paths[key] = path
offsets[key] = blob.byte_offset
lengths[key] = blob.size

has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
if has_chunk_iter:
dsid.chunk_iter(add_chunk_info)
else:
for index in range(num_chunks):
add_chunk_info(dsid.get_chunk_info(index))

chunk_manifest = ChunkManifest.from_arrays(
paths=paths, offsets=offsets, lengths=lengths
)
return chunk_manifest
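
For reference, a minimal sketch of the low-level h5py chunk API this function walks; the file name and dataset are hypothetical, written first so the loop has chunks to report:

```python
import h5py
import numpy as np

with h5py.File("example.h5", "w") as f:  # hypothetical file
    f.create_dataset("x", data=np.arange(100.0), chunks=(25,))

with h5py.File("example.h5", "r") as f:
    dsid = f["x"].id
    for i in range(dsid.get_num_chunks()):
        info = dsid.get_chunk_info(i)  # has .chunk_offset, .byte_offset, .size
        print(info.chunk_offset, info.byte_offset, info.size)
```
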


def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
"""
Get a list of dimension scale names attached to input HDF5 dataset.
This is required by the xarray package to work with Zarr arrays. Only
one dimension scale per dataset dimension is allowed. If dataset is
dimension scale, it will be considered as the dimension to itself.
Parameters
----------
dataset : h5py.Dataset
HDF5 dataset.
Returns
-------
list
List with HDF5 path names of dimension scales attached to input
dataset.
"""
dims = list()
rank = len(dataset.shape)
if rank:
for n in range(rank):
num_scales = len(dataset.dims[n])
if num_scales == 1:
dims.append(dataset.dims[n][0].name[1:])
elif h5py.h5ds.is_scale(dataset.id):
dims.append(dataset.name[1:])
elif num_scales > 1:
raise ValueError(
f"{dataset.name}: {len(dataset.dims[n])} "
f"dimension scales attached to dimension #{n}"
)
elif num_scales == 0:
# Some HDF5 files do not have dimension scales.
# If this is the case, `num_scales` will be 0.
# In this case, we mimic netCDF4 and assign phony dimension names.
# See https://github.com/fsspec/kerchunk/issues/41
dims.append(f"phony_dim_{n}")
return dims
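
A sketch of the dimension-scale structure this function reads, using the function above on a hypothetical file; with a scale attached it returns the scale's path name, otherwise it falls back to phony names:

```python
import h5py
import numpy as np

with h5py.File("dims.h5", "w") as f:  # hypothetical file
    f["time"] = np.arange(10.0)
    f["time"].make_scale("time")
    temp = f.create_dataset("temperature", data=np.zeros(10))
    temp.dims[0].attach_scale(f["time"])
    print(_dataset_dims(temp))  # ["time"]
    print(_dataset_dims(f.create_dataset("u", data=np.zeros(10))))  # ["phony_dim_0"]
```
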


def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
"""
Extract attributes from an HDF5 group or dataset.
Parameters
----------
h5obj : h5py.Group or h5py.Dataset
An HDF5 group or dataset.
"""
_HIDDEN_ATTRS = {
"REFERENCE_LIST",
"CLASS",
"DIMENSION_LIST",
"NAME",
"_Netcdf4Dimid",
"_Netcdf4Coordinates",
"_nc3_strict",
"_NCProperties",
}
attrs = {}
for n, v in h5obj.attrs.items():
if n in _HIDDEN_ATTRS:
continue
# Fix some attribute values to avoid JSON encoding exceptions...
if isinstance(v, bytes):
v = v.decode("utf-8") or " "
elif isinstance(v, (np.ndarray, np.number, np.bool_)):
if v.dtype.kind == "S":
v = v.astype(str)
if n == "_FillValue":
continue
elif v.size == 1:
v = v.flatten()[0]
if isinstance(v, (np.ndarray, np.number, np.bool_)):
v = v.tolist()
else:
v = v.tolist()
elif isinstance(v, h5py._hl.base.Empty):
v = ""
if v == "DIMENSION_SCALE":
continue

attrs[n] = v
return attrs
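
A sketch of the coercions above on a hypothetical file: bytes decode to str, NumPy arrays and scalars become plain Python values, and netCDF bookkeeping attributes are dropped:

```python
import h5py
import numpy as np

with h5py.File("attrs.h5", "w") as f:  # hypothetical file
    dset = f.create_dataset("x", data=np.arange(3))
    dset.attrs["units"] = b"degC"                   # bytes -> "degC"
    dset.attrs["valid_range"] = np.array([0, 100])  # ndarray -> [0, 100]
    dset.attrs["scale"] = np.float32(0.5)           # NumPy scalar -> 0.5
    print(_extract_attrs(dset))  # {'units': 'degC', 'valid_range': [0, 100], 'scale': 0.5}
```
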


def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]:
# This chunk determination logic mirrors zarr-python's create
# https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66

manifest = _dataset_chunk_manifest(path, dataset)
if manifest:
chunks = dataset.chunks if dataset.chunks else dataset.shape
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = _extract_attrs(dataset)
if cfcodec:
codecs.insert(0, cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
fill_value = cfcodec["codec"].decode(dataset.fillvalue)
else:
dtype = dataset.dtype
fill_value = dataset.fillvalue
filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks,
compressor=None,
dtype=dtype,
fill_value=fill_value,
filters=filters,
order="C",
shape=dataset.shape,
zarr_format=2,
)
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
dims = _dataset_dims(dataset)
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
else:
variable = None
return variable
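
The cfcodec branch turns CF scale_factor/add_offset metadata into a decode-time codec and records the decoded dtype instead of the on-disk one. A sketch of that idea using numcodecs' FixedScaleOffset; the attribute values here are hypothetical, and cfcodec_from_dataset derives the real ones from the dataset:

```python
import numpy as np
from numcodecs import FixedScaleOffset

# Pack float64 values into int16 on disk: y = round((x - offset) * scale)
codec = FixedScaleOffset(offset=273.15, scale=100, dtype="<f8", astype="<i2")
packed = codec.encode(np.array([273.16, 274.0]))
print(codec.decode(packed))  # back to float64, to within 1/scale precision
```
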


def virtual_vars_from_hdf(
path: str,
drop_variables: Optional[List[str]] = None,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
},
) -> Mapping[str, xr.Variable]:
if drop_variables is None:
drop_variables = []
open_file = _fsspec_openfile_from_filepath(
filepath=path, reader_options=reader_options
)
f = h5py.File(open_file, mode="r")
variables = {}
for key in f.keys():
if key not in drop_variables:
if isinstance(f[key], h5py.Dataset):
variable = _dataset_to_variable(path, f[key])
if variable is not None:
variables[key] = variable
else:
raise NotImplementedError("Nested groups are not yet supported")

return variables
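
A usage sketch for this entry point; the path is hypothetical, and the default reader_options configure anonymous S3 access:

```python
variables = virtual_vars_from_hdf(
    "data/air.nc",  # hypothetical local netCDF4/HDF5 file
    drop_variables=["time_bnds"],
)
ds = xr.Dataset(variables)  # wraps ManifestArrays; no chunk data is loaded
```
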


def attrs_from_root_group(
path: str,
reader_options: Optional[dict] = {
"storage_options": {"key": "", "secret": "", "anon": True}
},
):
open_file = _fsspec_openfile_from_filepath(
filepath=path, reader_options=reader_options
)
f = h5py.File(open_file, mode="r")
attrs = _extract_attrs(f)
return attrs
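
And the two entry points combined, attaching the root-group attributes to the virtual dataset (same hypothetical path):

```python
attrs = attrs_from_root_group("data/air.nc")
ds = xr.Dataset(virtual_vars_from_hdf("data/air.nc"), attrs=attrs)
```
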