From b112aa258d38c2f053e2eab13e74f9d7877b2686 Mon Sep 17 00:00:00 2001 From: Mattia Almansi Date: Thu, 14 Apr 2022 17:36:04 +0200 Subject: [PATCH] Add support in the "zarr" backend for reading NCZarr data (#6420) * add support for NCZarr * restore original format * add test_nczarr * better comment * test reading with zarr * decode zarray * use public store and test nczarr only * restore tests * install netcdf-c fixing bug * add env * fix ci * try build netcdf-c on windows * fix typo * install netcdf-c first * install netcdf-c dep with conda * fix ci * try win env again * fix Nan in tests * edit zarray * loop over all variables * edit Nan in zattrs and zarray * check path exists * must use netcdf-c>=4.8.1 * skip 4.8.1 and Windows * revisions * better testing * revisions * add what's new * update docs * [skip ci] Mention netCDF and GDAL in user-guide * [skip ci] reword --- doc/internals/zarr-encoding-spec.rst | 6 ++- doc/user-guide/io.rst | 12 ++++- doc/whats-new.rst | 2 + xarray/backends/zarr.py | 70 ++++++++++++++++++---------- xarray/tests/test_backends.py | 49 +++++++++++++++++++ 5 files changed, 110 insertions(+), 29 deletions(-) diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst index f8bffa6e82f..7f468b8b0db 100644 --- a/doc/internals/zarr-encoding-spec.rst +++ b/doc/internals/zarr-encoding-spec.rst @@ -32,9 +32,11 @@ the variable dimension names and then removed from the attributes dictionary returned to the user. Because of these choices, Xarray cannot read arbitrary array data, but only -Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array. +Zarr data with valid ``_ARRAY_DIMENSIONS`` or +`NCZarr `_ attributes +on each array (NCZarr dimension names are defined in the ``.zarray`` file). -After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable +After decoding the ``_ARRAY_DIMENSIONS`` or NCZarr attribute and assigning the variable dimensions, Xarray proceeds to [optionally] decode each variable using its standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`). diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index ddde0bf5888..81fa29bdf5f 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -518,8 +518,11 @@ the ability to store and analyze datasets far too large fit onto disk Xarray can't open just any zarr dataset, because xarray requires special metadata (attributes) describing the dataset dimensions and coordinates. -At this time, xarray can only open zarr datasets that have been written by -xarray. For implementation details, see :ref:`zarr_encoding`. +At this time, xarray can only open zarr datasets with these special attributes, +such as zarr datasets written by xarray, +`netCDF `_, +or `GDAL `_. +For implementation details, see :ref:`zarr_encoding`. To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method. @@ -548,6 +551,11 @@ store is already present at that path, an error will be raised, preventing it from being overwritten. To override this behavior and overwrite an existing store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`. +.. note:: + + xarray does not write NCZarr attributes. Therefore, NCZarr data must be + opened in read-only mode. + To store variable length strings, convert them to object arrays first with ``dtype=object``. diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c1f3fe70ebf..85096a45a39 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,8 @@ v2022.03.1 (unreleased) New Features ~~~~~~~~~~~~ +- The `zarr` backend is now able to read NCZarr. + By `Mattia Almansi `_. - Add a weighted ``quantile`` method to :py:class:`~core.weighted.DatasetWeighted` and :py:class:`~core.weighted.DataArrayWeighted` (:pull:`6059`). By `Christian Jauvin `_ and `David Huard `_. diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index aca0b8064f5..104f8aca58f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,3 +1,4 @@ +import json import os import warnings @@ -178,19 +179,37 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): raise AssertionError("We should never get here. Function logic must be wrong.") -def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): +def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): # Zarr arrays do not have dimensions. To get around this problem, we add # an attribute that specifies the dimension. We have to hide this attribute # when we send the attributes to the user. # zarr_obj can be either a zarr group or zarr array try: + # Xarray-Zarr dimensions = zarr_obj.attrs[dimension_key] - except KeyError: - raise KeyError( - f"Zarr object is missing the attribute `{dimension_key}`, which is " - "required for xarray to determine variable dimensions." - ) - attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key]) + except KeyError as e: + if not try_nczarr: + raise KeyError( + f"Zarr object is missing the attribute `{dimension_key}`, which is " + "required for xarray to determine variable dimensions." + ) from e + + # NCZarr defines dimensions through metadata in .zarray + zarray_path = os.path.join(zarr_obj.path, ".zarray") + zarray = json.loads(zarr_obj.store[zarray_path]) + try: + # NCZarr uses Fully Qualified Names + dimensions = [ + os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"] + ] + except KeyError as e: + raise KeyError( + f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, " + "which are required for xarray to determine variable dimensions." + ) from e + + nc_attrs = [attr for attr in zarr_obj.attrs if attr.startswith("_NC")] + attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs) return dimensions, attributes @@ -409,7 +428,10 @@ def ds(self): def open_store_variable(self, name, zarr_array): data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) - dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY) + try_nczarr = self._mode == "r" + dimensions, attributes = _get_zarr_dims_and_attrs( + zarr_array, DIMENSION_KEY, try_nczarr + ) attributes = dict(attributes) encoding = { "chunks": zarr_array.chunks, @@ -430,26 +452,24 @@ def get_variables(self): ) def get_attrs(self): - return dict(self.zarr_group.attrs.asdict()) + return { + k: v + for k, v in self.zarr_group.attrs.asdict().items() + if not k.startswith("_NC") + } def get_dimensions(self): + try_nczarr = self._mode == "r" dimensions = {} for k, v in self.zarr_group.arrays(): - try: - for d, s in zip(v.attrs[DIMENSION_KEY], v.shape): - if d in dimensions and dimensions[d] != s: - raise ValueError( - f"found conflicting lengths for dimension {d} " - f"({s} != {dimensions[d]})" - ) - dimensions[d] = s - - except KeyError: - raise KeyError( - f"Zarr object is missing the attribute `{DIMENSION_KEY}`, " - "which is required for xarray to determine " - "variable dimensions." - ) + dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr) + for d, s in zip(dim_names, v.shape): + if d in dimensions and dimensions[d] != s: + raise ValueError( + f"found conflicting lengths for dimension {d} " + f"({s} != {dimensions[d]})" + ) + dimensions[d] = s return dimensions def set_dimensions(self, variables, unlimited_dims=None): @@ -645,7 +665,7 @@ def open_zarr( The `store` object should be a valid store for a Zarr group. `store` variables must contain dimension metadata encoded in the - `_ARRAY_DIMENSIONS` attribute. + `_ARRAY_DIMENSIONS` attribute or must have NCZarr format. Parameters ---------- diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6f6bb0410e2..81bfeb11a1e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4,6 +4,7 @@ import math import os.path import pickle +import platform import re import shutil import sys @@ -5434,3 +5435,51 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None: txr = tdf.to_xarray() txr.to_netcdf(tmpdir.join("test.nc")) + + +@requires_zarr +@requires_netCDF4 +class TestNCZarr: + @staticmethod + def _create_nczarr(filename): + netcdfc_version = Version(nc4.getlibversion().split()[0]) + if netcdfc_version < Version("4.8.1"): + pytest.skip("requires netcdf-c>=4.8.1") + if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")): + # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN) + # https://github.com/Unidata/netcdf-c/issues/2265 + pytest.skip("netcdf-c==4.8.1 has issues on Windows") + + ds = create_test_data() + # Drop dim3: netcdf-c does not support dtype='4.8.1 will add _ARRAY_DIMENSIONS by default + mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray" + ds.to_netcdf(f"file://{filename}#mode={mode}") + return ds + + def test_open_nczarr(self): + with create_tmp_file(suffix=".zarr") as tmp: + expected = self._create_nczarr(tmp) + actual = xr.open_zarr(tmp, consolidated=False) + assert_identical(expected, actual) + + def test_overwriting_nczarr(self): + with create_tmp_file(suffix=".zarr") as tmp: + ds = self._create_nczarr(tmp) + expected = ds[["var1"]] + expected.to_zarr(tmp, mode="w") + actual = xr.open_zarr(tmp, consolidated=False) + assert_identical(expected, actual) + + @pytest.mark.parametrize("mode", ["a", "r+"]) + @pytest.mark.filterwarnings("ignore:.*non-consolidated metadata.*") + def test_raise_writing_to_nczarr(self, mode): + with create_tmp_file(suffix=".zarr") as tmp: + ds = self._create_nczarr(tmp) + with pytest.raises( + KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`," + ): + ds.to_zarr(tmp, mode=mode)