Skip to content

Commit

Permalink
Initial attempt at scale and offset via numcodecs.
Browse files Browse the repository at this point in the history
  • Loading branch information
sharkinsspatial committed May 22, 2024
1 parent 1e2b343 commit 7f1c189
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 4 deletions.
14 changes: 11 additions & 3 deletions virtualizarr/readers/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import xarray as xr

from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
from virtualizarr.readers.hdf_filters import codecs_from_dataset
from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
from virtualizarr.types import ChunkKey
from virtualizarr.utils import _fsspec_openfile_from_filepath
from virtualizarr.zarr import ZArray
Expand Down Expand Up @@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
# https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
chunks = dataset.chunks if dataset.chunks else dataset.shape
codecs = codecs_from_dataset(dataset)
cfcodec = cfcodec_from_dataset(dataset)
attrs = _extract_attrs(dataset)
if cfcodec:
codecs.append(cfcodec["codec"])
dtype = cfcodec["target_dtype"]
attrs.pop("scale_factor", None)
attrs.pop("add_offset", None)
else:
dtype = dataset.dtype
filters = [codec.get_config() for codec in codecs]
zarray = ZArray(
chunks=chunks,
compressor=None,
dtype=dataset.dtype,
dtype=dtype,
fill_value=dataset.fillvalue,
filters=filters,
order="C",
Expand All @@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
manifest = _dataset_chunk_manifest(path, dataset)
marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
dims = _dataset_dims(dataset)
attrs = _extract_attrs(dataset)
variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
return variable

Expand Down
36 changes: 35 additions & 1 deletion virtualizarr/readers/hdf_filters.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from typing import List, Tuple, Union
from typing import List, Tuple, TypedDict, Union

import h5py
import hdf5plugin
import numcodecs.registry as registry
import numpy as np
from numcodecs.abc import Codec
from numcodecs.fixedscaleoffset import FixedScaleOffset
from pydantic import BaseModel, validator
from xarray.coding.variables import _choose_float_dtype

_non_standard_filters = {"gzip": "zlib"}

Expand All @@ -24,6 +27,11 @@ def get_cname_from_code(cls, v):
return blosc_compressor_codes[v]


# Pairing of the decoded (in-memory) target dtype with the numcodecs codec
# that applies the corresponding CF-style scale/offset transform.
CFCodec = TypedDict("CFCodec", {"target_dtype": np.dtype, "codec": Codec})


def _filter_to_codec(
filter_id: str, filter_properties: Union[int, None, Tuple] = None
) -> Codec:
Expand Down Expand Up @@ -61,6 +69,32 @@ def _filter_to_codec(
return codec


def cfcodec_from_dataset(dataset: h5py.Dataset) -> CFCodec | None:
    """Build a CF scale/offset codec for an HDF5 dataset, if one applies.

    Inspects the dataset's CF-convention ``scale_factor`` and ``add_offset``
    attributes and, when either is non-trivial, returns a :class:`CFCodec`
    pairing the decoded (float) dtype with a numcodecs ``FixedScaleOffset``
    configured to undo the packing.

    Parameters
    ----------
    dataset : h5py.Dataset
        Dataset whose attributes are inspected.

    Returns
    -------
    CFCodec | None
        The codec and decoded target dtype, or ``None`` when no scale/offset
        attributes are present (or they are the identity transform).
    """
    attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}

    def _to_scalar(value):
        # h5py attributes may come back as plain scalars or as 0-d/1-element
        # arrays depending on how the file was written; normalize both forms
        # so indexing never fails and arrays never leak into the codec config.
        return np.asarray(value).flatten()[0]

    mapping = {}
    if "scale_factor" in attributes:
        # FixedScaleOffset *multiplies* by ``scale`` when encoding, while CF's
        # scale_factor multiplies when unpacking — hence the reciprocal.
        mapping["scale_factor"] = 1 / _to_scalar(attributes["scale_factor"])
    else:
        mapping["scale_factor"] = 1
    if "add_offset" in attributes:
        mapping["add_offset"] = _to_scalar(attributes["add_offset"])
    else:
        mapping["add_offset"] = 0
    if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:
        float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping)
        target_dtype = np.dtype(float_dtype)
        codec = FixedScaleOffset(
            offset=mapping["add_offset"],
            scale=mapping["scale_factor"],
            dtype=target_dtype,
            astype=dataset.dtype,
        )
        return CFCodec(target_dtype=target_dtype, codec=codec)
    return None


def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
codecs = []
for filter_id, filter_properties in dataset._filters.items():
Expand Down

0 comments on commit 7f1c189

Please sign in to comment.