[Draft] Non-kerchunk backend for HDF5/netcdf4 files. #87
@@ -0,0 +1,243 @@
import math
from typing import List, Mapping, Optional, Union

import h5py
import numpy as np
import xarray as xr

from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
from virtualizarr.types import ChunkKey
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.zarr import ZArray


def _dataset_chunk_manifest(
    path: str, dataset: h5py.Dataset
) -> Optional[ChunkManifest]:
    """
    Generate a ChunkManifest for an HDF5 dataset.

    Parameters
    ----------
    path : str
        The path of the HDF5 container file.
    dataset : h5py.Dataset
        The HDF5 dataset for which to create a ChunkManifest.

    Returns
    -------
    ChunkManifest
        A VirtualiZarr ChunkManifest.
    """
    dsid = dataset.id

    if dataset.chunks is None:
        # A contiguous (unchunked) dataset is represented as a single chunk
        # spanning the whole array; a dataset with no storage allocated yet
        # has no offset and produces no manifest.
        if dsid.get_offset() is None:
            return None
        else:
            key_list = [0] * (len(dataset.shape) or 1)
            key = ".".join(map(str, key_list))
            chunk_entry = ChunkEntry(
                path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
            )
            chunk_key = ChunkKey(key)
            chunk_entries = {chunk_key: chunk_entry.dict()}
            chunk_manifest = ChunkManifest(entries=chunk_entries)
            return chunk_manifest
    else:
        num_chunks = dsid.get_num_chunks()
        if num_chunks == 0:
            raise ValueError("The dataset is chunked but contains no chunks")

        shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
        paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
        offsets = np.empty(shape, dtype=np.int32)
        lengths = np.empty(shape, dtype=np.int32)

        def get_key(blob):
            # Convert a chunk's element offset into integer chunk indices.
            return tuple(a // b for a, b in zip(blob.chunk_offset, dataset.chunks))

        def add_chunk_info(blob):
            key = get_key(blob)
            paths[key] = path
            offsets[key] = blob.byte_offset
            lengths[key] = blob.size

        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
        if has_chunk_iter:
            # h5py >= 3.8 exposes chunk_iter, which walks the chunk index once.
            dsid.chunk_iter(add_chunk_info)
        else:
            # Fallback for older h5py: random access per chunk index.
            for index in range(num_chunks):
                add_chunk_info(dsid.get_chunk_info(index))

        chunk_manifest = ChunkManifest.from_arrays(
            paths=paths, offsets=offsets, lengths=lengths
        )
        return chunk_manifest

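To make the chunk-indexing arithmetic above concrete, here is a small self-contained sketch (not part of this PR's diff) that prints the manifest-style key, byte offset, and length for every chunk of a toy dataset. It assumes h5py >= 3.8 for `chunk_iter`, with the same `get_chunk_info` fallback used above; the file name `demo.h5` is arbitrary.

import h5py
import numpy as np

# Illustrative file; any chunked HDF5 dataset would do.
with h5py.File("demo.h5", "w") as f:
    f.create_dataset("x", data=np.arange(100).reshape(10, 10), chunks=(4, 4))

with h5py.File("demo.h5", "r") as f:
    dset = f["x"]
    dsid = dset.id

    def report(blob):
        # Manifest key = chunk indices, i.e. element offset // chunk shape.
        key = ".".join(str(a // b) for a, b in zip(blob.chunk_offset, dset.chunks))
        print(key, blob.byte_offset, blob.size)

    if callable(getattr(dsid, "chunk_iter", None)):  # h5py >= 3.8
        dsid.chunk_iter(report)
    else:  # older h5py: per-chunk random access
        for i in range(dsid.get_num_chunks()):
            report(dsid.get_chunk_info(i))
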
def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
    """
    Get a list of dimension scale names attached to an input HDF5 dataset.

    This is required by the xarray package to work with Zarr arrays. Only
    one dimension scale per dataset dimension is allowed. If the dataset is
    itself a dimension scale, it is considered its own dimension.

    Parameters
    ----------
    dataset : h5py.Dataset
        An HDF5 dataset.

    Returns
    -------
    list
        List with HDF5 path names of dimension scales attached to the input
        dataset.
    """
    dims = list()
    rank = len(dataset.shape)
    if rank:
        for n in range(rank):
            num_scales = len(dataset.dims[n])
            if num_scales == 1:
                dims.append(dataset.dims[n][0].name[1:])
            elif h5py.h5ds.is_scale(dataset.id):
                dims.append(dataset.name[1:])
            elif num_scales > 1:
                raise ValueError(
                    f"{dataset.name}: {len(dataset.dims[n])} "
                    f"dimension scales attached to dimension #{n}"
                )
            elif num_scales == 0:
                # Some HDF5 files do not have dimension scales.
                # If this is the case, `num_scales` will be 0.
                # In this case, we mimic netCDF4 and assign phony dimension names.
                # See https://github.com/fsspec/kerchunk/issues/41
                dims.append(f"phony_dim_{n}")
    return dims

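As an illustration of the dimension-scale handling above (again, not part of the diff), the following sketch builds a file in which dimension 0 has an attached scale and dimension 1 would fall back to a phony name; the file and variable names are made up.

import h5py
import numpy as np

# Illustrative file and variable names.
with h5py.File("dims_demo.h5", "w") as f:
    t = f.create_dataset("time", data=np.arange(5))
    t.make_scale("time")                  # mark /time as a dimension scale
    v = f.create_dataset("v", data=np.ones((5, 3)))
    v.dims[0].attach_scale(t)             # dimension 0 -> /time
    # dimension 1 has no scale attached

with h5py.File("dims_demo.h5", "r") as f:
    v = f["v"]
    for n in range(len(v.shape)):
        if len(v.dims[n]) == 1:
            print(n, v.dims[n][0].name[1:])   # "time": strip the leading "/"
        else:
            print(n, f"phony_dim_{n}")        # netCDF4-style phony name
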
def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
    """
    Extract attributes from an HDF5 group or dataset.

    Parameters
    ----------
    h5obj : h5py.Group or h5py.Dataset
        An HDF5 group or dataset.
    """
    _HIDDEN_ATTRS = {
        # Attributes used internally by netCDF4/HDF5 that should not
        # surface as user-visible metadata.
        "REFERENCE_LIST",
        "CLASS",
        "DIMENSION_LIST",
        "NAME",
        "_Netcdf4Dimid",
        "_Netcdf4Coordinates",
        "_nc3_strict",
        "_NCProperties",
    }
    attrs = {}
    for n, v in h5obj.attrs.items():
        if n in _HIDDEN_ATTRS:
            continue
        # Fix some attribute values to avoid JSON encoding exceptions.
        if isinstance(v, bytes):
            v = v.decode("utf-8") or " "
        elif isinstance(v, (np.ndarray, np.number, np.bool_)):
            if v.dtype.kind == "S":
                v = v.astype(str)
            if n == "_FillValue":
                continue
            elif v.size == 1:
                v = v.flatten()[0]
                if isinstance(v, (np.ndarray, np.number, np.bool_)):
                    v = v.tolist()
            else:
                v = v.tolist()
        elif isinstance(v, h5py._hl.base.Empty):
            v = ""
        if v == "DIMENSION_SCALE":
            continue

        attrs[n] = v
    return attrs

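For a rough sense of those sanitization rules, this standalone sketch (illustrative only, a simplified condensation of the branches above, not the function itself) applies the same bytes and NumPy conversions to a hypothetical attribute dict:

import numpy as np

# Hypothetical attribute values of the kinds handled above.
raw = {
    "units": b"degC",                          # bytes -> str
    "scale_factor": np.float64(0.01),          # NumPy scalar -> Python float
    "valid_range": np.array([0, 100], "i2"),   # small array -> list
}
clean = {}
for k, v in raw.items():
    if isinstance(v, bytes):
        v = v.decode("utf-8") or " "
    elif isinstance(v, (np.ndarray, np.number, np.bool_)):
        v = v.flatten()[0].tolist() if v.size == 1 else v.tolist()
    clean[k] = v
print(clean)  # {'units': 'degC', 'scale_factor': 0.01, 'valid_range': [0, 100]}
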
def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]:
    # This chunk determination logic mirrors zarr-python's create
    # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66

    manifest = _dataset_chunk_manifest(path, dataset)
    if manifest:
        chunks = dataset.chunks if dataset.chunks else dataset.shape
        codecs = codecs_from_dataset(dataset)

Review comment: @ghidalgo3 My rationale for describing the full codec chain in the …

        cfcodec = cfcodec_from_dataset(dataset)
        attrs = _extract_attrs(dataset)
        if cfcodec:
            codecs.insert(0, cfcodec["codec"])
            dtype = cfcodec["target_dtype"]
            attrs.pop("scale_factor", None)
            attrs.pop("add_offset", None)
            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
        else:
            dtype = dataset.dtype
            fill_value = dataset.fillvalue
        filters = [codec.get_config() for codec in codecs]
        zarray = ZArray(
            chunks=chunks,
            compressor=None,
            dtype=dtype,
            fill_value=fill_value,
            filters=filters,
            order="C",
            shape=dataset.shape,
            zarr_format=2,
        )
        marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
        dims = _dataset_dims(dataset)
        variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
    else:
        variable = None
    return variable

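The CF-codec branch above folds scale_factor/add_offset handling into the filter chain rather than applying it eagerly. As a sketch of what such a codec does, assuming numcodecs' FixedScaleOffset is a reasonable stand-in for the codec returned by cfcodec_from_dataset (note that numcodecs' `scale` is the reciprocal of CF's scale_factor):

import numpy as np
from numcodecs import FixedScaleOffset

# Hypothetical CF-style packing: floats stored on disk as int16 with
# scale_factor=0.01 and add_offset=273.15 (e.g. temperature in kelvin).
codec = FixedScaleOffset(offset=273.15, scale=100, dtype="f8", astype="i2")
packed = codec.encode(np.array([273.15, 274.0, 275.5]))
print(packed)                # int16 values as stored on disk
print(codec.decode(packed))  # [273.15 274.   275.5] after decoding

This also shows why `fill_value` is run through `cfcodec["codec"].decode`: the on-disk fill value lives in the packed dtype. And because every HDF5 filter plus any CF codec lands in `filters` with `compressor=None`, the resulting Zarr v2 metadata describes the full decode chain in order.
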
def virtual_vars_from_hdf(
    path: str,
    drop_variables: Optional[List[str]] = None,
    reader_options: Optional[dict] = {
        "storage_options": {"key": "", "secret": "", "anon": True}
    },
) -> Mapping[str, xr.Variable]:
    if drop_variables is None:
        drop_variables = []
    open_file = _fsspec_openfile_from_filepath(
        filepath=path, reader_options=reader_options
    )
    f = h5py.File(open_file, mode="r")
    variables = {}
    for key in f.keys():
        if key not in drop_variables:
            if isinstance(f[key], h5py.Dataset):
                variable = _dataset_to_variable(path, f[key])
                if variable is not None:
                    variables[key] = variable
            else:
                raise NotImplementedError("Nested groups are not yet supported")

    return variables


def attrs_from_root_group(
    path: str,
    reader_options: Optional[dict] = {
        "storage_options": {"key": "", "secret": "", "anon": True}
    },
):
    open_file = _fsspec_openfile_from_filepath(
        filepath=path, reader_options=reader_options
    )
    f = h5py.File(open_file, mode="r")
    attrs = _extract_attrs(f)
    return attrs

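Putting it together, a plausible usage sketch (the bucket and file names are hypothetical; the functions are the ones defined above) that builds an xarray.Dataset of virtual, manifest-backed variables:

import xarray as xr

# Hypothetical S3 object; the default reader_options assume anonymous access.
path = "s3://my-bucket/air_temperature.nc"
variables = virtual_vars_from_hdf(path, drop_variables=["time_bnds"])
attrs = attrs_from_root_group(path)
vds = xr.Dataset(variables, attrs=attrs)
print(vds)  # ManifestArray-backed variables; no chunk data is loaded
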
Review comment: After #177, these arrays will need to be uint64 instead of int32 (referring to the offsets and lengths arrays in _dataset_chunk_manifest).