Add support in the "zarr" backend for reading NCZarr data (#6420)
* add support for NCZarr

* restore original format

* add test_nczarr

* better comment

* test reading with zarr

* decode zarray

* use public store and test nczarr only

* restore tests

* install netcdf-c fixing bug

* add env

* fix ci

* try build netcdf-c on windows

* fix typo

* install netcdf-c first

* install netcdf-c dep with conda

* fix ci

* try win env again

* fix Nan in tests

* edit zarray

* loop over all variables

* edit Nan in zattrs and zarray

* check path exists

* must use netcdf-c>=4.8.1

* skip 4.8.1 and Windows

* revisions

* better testing

* revisions

* add what's new

* update docs

* [skip ci] Mention netCDF and GDAL in user-guide

* [skip ci] reword
malmans2 authored Apr 14, 2022
1 parent b4c943e commit b112aa2
Showing 5 changed files with 110 additions and 29 deletions.
6 changes: 4 additions & 2 deletions doc/internals/zarr-encoding-spec.rst
@@ -32,9 +32,11 @@ the variable dimension names and then removed from the attributes dictionary
 returned to the user.

 Because of these choices, Xarray cannot read arbitrary array data, but only
-Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array.
+Zarr data with valid ``_ARRAY_DIMENSIONS`` or
+`NCZarr <https://docs.unidata.ucar.edu/nug/current/nczarr_head.html>`_ attributes
+on each array (NCZarr dimension names are defined in the ``.zarray`` file).

-After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable
+After decoding the ``_ARRAY_DIMENSIONS`` or NCZarr attribute and assigning the variable
 dimensions, Xarray proceeds to [optionally] decode each variable using its
 standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`).

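For orientation, the two conventions keep the dimension names in different per-array documents: xarray's encoding stores them in the array's ``.zattrs``, while NCZarr records fully qualified references in ``.zarray``. The snippet below is an illustrative sketch with made-up names and shapes, not output copied from either writer:

    # xarray convention: dimension names live in the array's .zattrs
    xarray_zattrs = {"_ARRAY_DIMENSIONS": ["time", "lat", "lon"]}

    # NCZarr convention: fully qualified dimension references live in .zarray
    nczarr_zarray = {
        "shape": [4, 3, 2],
        "_NCZARR_ARRAY": {"dimrefs": ["/time", "/lat", "/lon"]},
    }
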
12 changes: 10 additions & 2 deletions doc/user-guide/io.rst
@@ -518,8 +518,11 @@ the ability to store and analyze datasets far too large fit onto disk

 Xarray can't open just any zarr dataset, because xarray requires special
 metadata (attributes) describing the dataset dimensions and coordinates.
-At this time, xarray can only open zarr datasets that have been written by
-xarray. For implementation details, see :ref:`zarr_encoding`.
+At this time, xarray can only open zarr datasets with these special attributes,
+such as zarr datasets written by xarray,
+`netCDF <https://docs.unidata.ucar.edu/nug/current/nczarr_head.html>`_,
+or `GDAL <https://gdal.org/drivers/raster/zarr.html>`_.
+For implementation details, see :ref:`zarr_encoding`.

 To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method.

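As a minimal reading sketch (the store path is hypothetical): any Zarr store carrying the dimension metadata described above, whether written by xarray, netCDF-C (NCZarr), or GDAL, is opened the same way:

    import xarray as xr

    # Hypothetical path to a local Zarr store with the required metadata.
    ds = xr.open_zarr("example-store.zarr")
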
@@ -548,6 +551,11 @@ store is already present at that path, an error will be raised, preventing it
 from being overwritten. To override this behavior and overwrite an existing
 store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`.

+.. note::
+
+    xarray does not write NCZarr attributes. Therefore, NCZarr data must be
+    opened in read-only mode.
+
 To store variable length strings, convert them to object arrays first with
 ``dtype=object``.

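Building on the tests added in this commit, a hedged round-trip sketch follows; the paths are hypothetical, the ``file://...#mode=nczarr`` fragment is netcdf-c syntax rather than an xarray feature, and per the tests it requires netcdf-c >= 4.8.1:

    import xarray as xr

    ds = xr.Dataset({"var1": ("x", [1, 2, 3])})

    # Write NCZarr through the netCDF4/netcdf-c library.
    ds.to_netcdf("file:///tmp/example.zarr#mode=nczarr")

    # Read it back with the zarr backend (read-only, per the note above).
    roundtrip = xr.open_zarr("/tmp/example.zarr", consolidated=False)
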
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -22,6 +22,8 @@ v2022.03.1 (unreleased)
 New Features
 ~~~~~~~~~~~~

+- The `zarr` backend is now able to read NCZarr.
+  By `Mattia Almansi <https://github.com/malmans2>`_.
 - Add a weighted ``quantile`` method to :py:class:`~core.weighted.DatasetWeighted` and
   :py:class:`~core.weighted.DataArrayWeighted` (:pull:`6059`). By
   `Christian Jauvin <https://github.com/cjauvin>`_ and `David Huard <https://github.com/huard>`_.
70 changes: 45 additions & 25 deletions xarray/backends/zarr.py
@@ -1,3 +1,4 @@
+import json
 import os
 import warnings

@@ -178,19 +179,37 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
     raise AssertionError("We should never get here. Function logic must be wrong.")


-def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
+def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr):
     # Zarr arrays do not have dimensions. To get around this problem, we add
     # an attribute that specifies the dimension. We have to hide this attribute
     # when we send the attributes to the user.
     # zarr_obj can be either a zarr group or zarr array
     try:
+        # Xarray-Zarr
         dimensions = zarr_obj.attrs[dimension_key]
-    except KeyError:
-        raise KeyError(
-            f"Zarr object is missing the attribute `{dimension_key}`, which is "
-            "required for xarray to determine variable dimensions."
-        )
-    attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key])
+    except KeyError as e:
+        if not try_nczarr:
+            raise KeyError(
+                f"Zarr object is missing the attribute `{dimension_key}`, which is "
+                "required for xarray to determine variable dimensions."
+            ) from e
+
+        # NCZarr defines dimensions through metadata in .zarray
+        zarray_path = os.path.join(zarr_obj.path, ".zarray")
+        zarray = json.loads(zarr_obj.store[zarray_path])
+        try:
+            # NCZarr uses Fully Qualified Names
+            dimensions = [
+                os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
+            ]
+        except KeyError as e:
+            raise KeyError(
+                f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
+                "which are required for xarray to determine variable dimensions."
+            ) from e
+
+    nc_attrs = [attr for attr in zarr_obj.attrs if attr.startswith("_NC")]
+    attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs)
     return dimensions, attributes

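To make the NCZarr fallback above concrete, here is a minimal standalone sketch of the same lookup; the helper name and the JSON payload are hypothetical, and the real backend reads the .zarray document from the array's store rather than from a string:

    import json
    import os

    def nczarr_dims(zarray_document):
        # Parse a .zarray document and strip the fully qualified path prefix
        # from each NCZarr dimension reference.
        zarray = json.loads(zarray_document)
        return [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]]

    example = '{"shape": [8, 9], "_NCZARR_ARRAY": {"dimrefs": ["/dim1", "/dim2"]}}'
    print(nczarr_dims(example))  # ['dim1', 'dim2']
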
@@ -409,7 +428,10 @@ def ds(self):

     def open_store_variable(self, name, zarr_array):
         data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self))
-        dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY)
+        try_nczarr = self._mode == "r"
+        dimensions, attributes = _get_zarr_dims_and_attrs(
+            zarr_array, DIMENSION_KEY, try_nczarr
+        )
         attributes = dict(attributes)
         encoding = {
             "chunks": zarr_array.chunks,
@@ -430,26 +452,24 @@ def get_variables(self):
         )

     def get_attrs(self):
-        return dict(self.zarr_group.attrs.asdict())
+        return {
+            k: v
+            for k, v in self.zarr_group.attrs.asdict().items()
+            if not k.startswith("_NC")
+        }

     def get_dimensions(self):
+        try_nczarr = self._mode == "r"
         dimensions = {}
         for k, v in self.zarr_group.arrays():
-            try:
-                for d, s in zip(v.attrs[DIMENSION_KEY], v.shape):
-                    if d in dimensions and dimensions[d] != s:
-                        raise ValueError(
-                            f"found conflicting lengths for dimension {d} "
-                            f"({s} != {dimensions[d]})"
-                        )
-                    dimensions[d] = s
-
-            except KeyError:
-                raise KeyError(
-                    f"Zarr object is missing the attribute `{DIMENSION_KEY}`, "
-                    "which is required for xarray to determine "
-                    "variable dimensions."
-                )
+            dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr)
+            for d, s in zip(dim_names, v.shape):
+                if d in dimensions and dimensions[d] != s:
+                    raise ValueError(
+                        f"found conflicting lengths for dimension {d} "
+                        f"({s} != {dimensions[d]})"
+                    )
+                dimensions[d] = s
         return dimensions

     def set_dimensions(self, variables, unlimited_dims=None):
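
The attribute filtering in get_attrs is a plain dict comprehension over the group attributes; a toy illustration with made-up attribute names (not necessarily what netcdf-c writes):

    attrs = {"title": "example", "_NCZARR_GROUP": "...", "_NCProperties": "version=2"}
    public = {k: v for k, v in attrs.items() if not k.startswith("_NC")}
    print(public)  # {'title': 'example'}
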
@@ -645,7 +665,7 @@ def open_zarr(
     The `store` object should be a valid store for a Zarr group. `store`
     variables must contain dimension metadata encoded in the
-    `_ARRAY_DIMENSIONS` attribute.
+    `_ARRAY_DIMENSIONS` attribute or must have NCZarr format.

     Parameters
     ----------
49 changes: 49 additions & 0 deletions xarray/tests/test_backends.py
@@ -4,6 +4,7 @@
 import math
 import os.path
 import pickle
+import platform
 import re
 import shutil
 import sys
@@ -5434,3 +5435,51 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None:
     txr = tdf.to_xarray()

     txr.to_netcdf(tmpdir.join("test.nc"))
+
+
+@requires_zarr
+@requires_netCDF4
+class TestNCZarr:
+    @staticmethod
+    def _create_nczarr(filename):
+        netcdfc_version = Version(nc4.getlibversion().split()[0])
+        if netcdfc_version < Version("4.8.1"):
+            pytest.skip("requires netcdf-c>=4.8.1")
+        if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")):
+            # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN)
+            # https://github.com/Unidata/netcdf-c/issues/2265
+            pytest.skip("netcdf-c==4.8.1 has issues on Windows")
+
+        ds = create_test_data()
+        # Drop dim3: netcdf-c does not support dtype='<U1'
+        # https://github.com/Unidata/netcdf-c/issues/2259
+        ds = ds.drop_vars("dim3")
+
+        # netcdf-c>4.8.1 will add _ARRAY_DIMENSIONS by default
+        mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray"
+        ds.to_netcdf(f"file://{filename}#mode={mode}")
+        return ds
+
+    def test_open_nczarr(self):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            expected = self._create_nczarr(tmp)
+            actual = xr.open_zarr(tmp, consolidated=False)
+            assert_identical(expected, actual)
+
+    def test_overwriting_nczarr(self):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            ds = self._create_nczarr(tmp)
+            expected = ds[["var1"]]
+            expected.to_zarr(tmp, mode="w")
+            actual = xr.open_zarr(tmp, consolidated=False)
+            assert_identical(expected, actual)
+
+    @pytest.mark.parametrize("mode", ["a", "r+"])
+    @pytest.mark.filterwarnings("ignore:.*non-consolidated metadata.*")
+    def test_raise_writing_to_nczarr(self, mode):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            ds = self._create_nczarr(tmp)
+            with pytest.raises(
+                KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`,"
+            ):
+                ds.to_zarr(tmp, mode=mode)

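As a usage note grounded in test_raise_writing_to_nczarr above, appending to an existing NCZarr store is expected to fail because the existing arrays carry no _ARRAY_DIMENSIONS attribute; a hedged sketch with hypothetical paths and data:

    import xarray as xr

    nczarr_store = "/tmp/example-nczarr.zarr"  # hypothetical existing NCZarr store
    new_data = xr.Dataset({"var2": ("dim1", [1, 2, 3])})

    try:
        new_data.to_zarr(nczarr_store, mode="a")
    except KeyError as err:
        # Expected with this change: the store's arrays lack `_ARRAY_DIMENSIONS`,
        # so xarray refuses to append.
        print(err)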