Add support in the "zarr" backend for reading NCZarr data (#6420)
* add support for NCZarr

* restore original format

* add test_nczarr

* better comment

* test reading with zarr

* decode zarray

* use public store and test nczarr only

* restore tests

* install netcdf-c fixing bug

* add env

* fix ci

* try build netcdf-c on windows

* fix typo

* install netcdf-c first

* install netcdf-c dep with conda

* fix ci

* try win env again

* fix Nan in tests

* edit zarray

* loop over all variables

* edit Nan in zattrs and zarray

* check path exists

* must use netcdf-c>=4.8.1

* skip 4.8.1 and Windows

* revisions

* better testing

* revisions

* add what's new

* update docs

* [skip ci] Mention netCDF and GDAL in user-guide

* [skip ci] reword
malmans2 authored Apr 14, 2022
1 parent b4c943e commit b112aa2
Showing 5 changed files with 110 additions and 29 deletions.
6 changes: 4 additions & 2 deletions doc/internals/zarr-encoding-spec.rst
@@ -32,9 +32,11 @@ the variable dimension names and then removed from the attributes dictionary
 returned to the user.

 Because of these choices, Xarray cannot read arbitrary array data, but only
-Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array.
+Zarr data with valid ``_ARRAY_DIMENSIONS`` or
+`NCZarr <https://docs.unidata.ucar.edu/nug/current/nczarr_head.html>`_ attributes
+on each array (NCZarr dimension names are defined in the ``.zarray`` file).

-After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable
+After decoding the ``_ARRAY_DIMENSIONS`` or NCZarr attribute and assigning the variable
 dimensions, Xarray proceeds to [optionally] decode each variable using its
 standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`).

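For orientation, the two conventions keep the dimension names in different per-array documents: xarray's encoding stores them in the array's ``.zattrs``, while NCZarr records fully qualified references in ``.zarray``. The snippet below is an illustrative sketch with made-up names and shapes, not output copied from either writer:

    # xarray convention: dimension names live in the array's .zattrs
    xarray_zattrs = {"_ARRAY_DIMENSIONS": ["time", "lat", "lon"]}

    # NCZarr convention: fully qualified dimension references live in .zarray
    nczarr_zarray = {
        "shape": [4, 3, 2],
        "_NCZARR_ARRAY": {"dimrefs": ["/time", "/lat", "/lon"]},
    }
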
12 changes: 10 additions & 2 deletions doc/user-guide/io.rst
@@ -518,8 +518,11 @@ the ability to store and analyze datasets far too large fit onto disk

 Xarray can't open just any zarr dataset, because xarray requires special
 metadata (attributes) describing the dataset dimensions and coordinates.
-At this time, xarray can only open zarr datasets that have been written by
-xarray. For implementation details, see :ref:`zarr_encoding`.
+At this time, xarray can only open zarr datasets with these special attributes,
+such as zarr datasets written by xarray,
+`netCDF <https://docs.unidata.ucar.edu/nug/current/nczarr_head.html>`_,
+or `GDAL <https://gdal.org/drivers/raster/zarr.html>`_.
+For implementation details, see :ref:`zarr_encoding`.

 To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method.

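As a minimal reading sketch (the store path is hypothetical): any Zarr store carrying the dimension metadata described above, whether written by xarray, netCDF-C (NCZarr), or GDAL, is opened the same way:

    import xarray as xr

    # Hypothetical path to a local Zarr store with the required metadata.
    ds = xr.open_zarr("example-store.zarr")
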
@@ -548,6 +551,11 @@ store is already present at that path, an error will be raised, preventing it
 from being overwritten. To override this behavior and overwrite an existing
 store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`.

+.. note::
+
+    xarray does not write NCZarr attributes. Therefore, NCZarr data must be
+    opened in read-only mode.
+
 To store variable length strings, convert them to object arrays first with
 ``dtype=object``.

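Building on the tests added in this commit, a hedged round-trip sketch follows; the paths are hypothetical, the ``file://...#mode=nczarr`` fragment is netcdf-c syntax rather than an xarray feature, and per the tests it requires netcdf-c >= 4.8.1:

    import xarray as xr

    ds = xr.Dataset({"var1": ("x", [1, 2, 3])})

    # Write NCZarr through the netCDF4/netcdf-c library.
    ds.to_netcdf("file:///tmp/example.zarr#mode=nczarr")

    # Read it back with the zarr backend (read-only, per the note above).
    roundtrip = xr.open_zarr("/tmp/example.zarr", consolidated=False)
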
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -22,6 +22,8 @@ v2022.03.1 (unreleased)
 New Features
 ~~~~~~~~~~~~

+- The `zarr` backend is now able to read NCZarr.
+  By `Mattia Almansi <https://github.com/malmans2>`_.
 - Add a weighted ``quantile`` method to :py:class:`~core.weighted.DatasetWeighted` and
   :py:class:`~core.weighted.DataArrayWeighted` (:pull:`6059`). By
   `Christian Jauvin <https://github.com/cjauvin>`_ and `David Huard <https://github.com/huard>`_.
70 changes: 45 additions & 25 deletions xarray/backends/zarr.py
@@ -1,3 +1,4 @@
+import json
 import os
 import warnings

@@ -178,19 +179,37 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
     raise AssertionError("We should never get here. Function logic must be wrong.")


-def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
+def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr):
     # Zarr arrays do not have dimensions. To get around this problem, we add
     # an attribute that specifies the dimension. We have to hide this attribute
     # when we send the attributes to the user.
     # zarr_obj can be either a zarr group or zarr array
     try:
+        # Xarray-Zarr
         dimensions = zarr_obj.attrs[dimension_key]
-    except KeyError:
-        raise KeyError(
-            f"Zarr object is missing the attribute `{dimension_key}`, which is "
-            "required for xarray to determine variable dimensions."
-        )
-    attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key])
+    except KeyError as e:
+        if not try_nczarr:
+            raise KeyError(
+                f"Zarr object is missing the attribute `{dimension_key}`, which is "
+                "required for xarray to determine variable dimensions."
+            ) from e
+
+        # NCZarr defines dimensions through metadata in .zarray
+        zarray_path = os.path.join(zarr_obj.path, ".zarray")
+        zarray = json.loads(zarr_obj.store[zarray_path])
+        try:
+            # NCZarr uses Fully Qualified Names
+            dimensions = [
+                os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
+            ]
+        except KeyError as e:
+            raise KeyError(
+                f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
+                "which are required for xarray to determine variable dimensions."
+            ) from e
+
+    nc_attrs = [attr for attr in zarr_obj.attrs if attr.startswith("_NC")]
+    attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs)
     return dimensions, attributes

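To make the NCZarr fallback above concrete, here is a minimal standalone sketch of the same lookup; the helper name and the JSON payload are hypothetical, and the real backend reads the .zarray document from the array's store rather than from a string:

    import json
    import os

    def nczarr_dims(zarray_document):
        # Parse a .zarray document and strip the fully qualified path prefix
        # from each NCZarr dimension reference.
        zarray = json.loads(zarray_document)
        return [os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]]

    example = '{"shape": [8, 9], "_NCZARR_ARRAY": {"dimrefs": ["/dim1", "/dim2"]}}'
    print(nczarr_dims(example))  # ['dim1', 'dim2']
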
@@ -409,7 +428,10 @@ def ds(self):

     def open_store_variable(self, name, zarr_array):
         data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self))
-        dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY)
+        try_nczarr = self._mode == "r"
+        dimensions, attributes = _get_zarr_dims_and_attrs(
+            zarr_array, DIMENSION_KEY, try_nczarr
+        )
         attributes = dict(attributes)
         encoding = {
             "chunks": zarr_array.chunks,
@@ -430,26 +452,24 @@ def get_variables(self):
         )

     def get_attrs(self):
-        return dict(self.zarr_group.attrs.asdict())
+        return {
+            k: v
+            for k, v in self.zarr_group.attrs.asdict().items()
+            if not k.startswith("_NC")
+        }

     def get_dimensions(self):
+        try_nczarr = self._mode == "r"
         dimensions = {}
         for k, v in self.zarr_group.arrays():
-            try:
-                for d, s in zip(v.attrs[DIMENSION_KEY], v.shape):
-                    if d in dimensions and dimensions[d] != s:
-                        raise ValueError(
-                            f"found conflicting lengths for dimension {d} "
-                            f"({s} != {dimensions[d]})"
-                        )
-                    dimensions[d] = s
-
-            except KeyError:
-                raise KeyError(
-                    f"Zarr object is missing the attribute `{DIMENSION_KEY}`, "
-                    "which is required for xarray to determine "
-                    "variable dimensions."
-                )
+            dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr)
+            for d, s in zip(dim_names, v.shape):
+                if d in dimensions and dimensions[d] != s:
+                    raise ValueError(
+                        f"found conflicting lengths for dimension {d} "
+                        f"({s} != {dimensions[d]})"
+                    )
+                dimensions[d] = s
         return dimensions

     def set_dimensions(self, variables, unlimited_dims=None):
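
The attribute filtering in get_attrs is a plain dict comprehension over the group attributes; a toy illustration with made-up attribute names (not necessarily what netcdf-c writes):

    attrs = {"title": "example", "_NCZARR_GROUP": "...", "_NCProperties": "version=2"}
    public = {k: v for k, v in attrs.items() if not k.startswith("_NC")}
    print(public)  # {'title': 'example'}
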
@@ -645,7 +665,7 @@ def open_zarr(
     The `store` object should be a valid store for a Zarr group. `store`
     variables must contain dimension metadata encoded in the
-    `_ARRAY_DIMENSIONS` attribute.
+    `_ARRAY_DIMENSIONS` attribute or must have NCZarr format.

     Parameters
     ----------
49 changes: 49 additions & 0 deletions xarray/tests/test_backends.py
@@ -4,6 +4,7 @@
 import math
 import os.path
 import pickle
+import platform
 import re
 import shutil
 import sys
@@ -5434,3 +5435,51 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None:
     txr = tdf.to_xarray()

     txr.to_netcdf(tmpdir.join("test.nc"))
+
+
+@requires_zarr
+@requires_netCDF4
+class TestNCZarr:
+    @staticmethod
+    def _create_nczarr(filename):
+        netcdfc_version = Version(nc4.getlibversion().split()[0])
+        if netcdfc_version < Version("4.8.1"):
+            pytest.skip("requires netcdf-c>=4.8.1")
+        if (platform.system() == "Windows") and (netcdfc_version == Version("4.8.1")):
+            # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN)
+            # https://github.com/Unidata/netcdf-c/issues/2265
+            pytest.skip("netcdf-c==4.8.1 has issues on Windows")
+
+        ds = create_test_data()
+        # Drop dim3: netcdf-c does not support dtype='<U1'
+        # https://github.com/Unidata/netcdf-c/issues/2259
+        ds = ds.drop_vars("dim3")
+
+        # netcdf-c>4.8.1 will add _ARRAY_DIMENSIONS by default
+        mode = "nczarr" if netcdfc_version == Version("4.8.1") else "nczarr,noxarray"
+        ds.to_netcdf(f"file://{filename}#mode={mode}")
+        return ds
+
+    def test_open_nczarr(self):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            expected = self._create_nczarr(tmp)
+            actual = xr.open_zarr(tmp, consolidated=False)
+            assert_identical(expected, actual)
+
+    def test_overwriting_nczarr(self):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            ds = self._create_nczarr(tmp)
+            expected = ds[["var1"]]
+            expected.to_zarr(tmp, mode="w")
+            actual = xr.open_zarr(tmp, consolidated=False)
+            assert_identical(expected, actual)
+
+    @pytest.mark.parametrize("mode", ["a", "r+"])
+    @pytest.mark.filterwarnings("ignore:.*non-consolidated metadata.*")
+    def test_raise_writing_to_nczarr(self, mode):
+        with create_tmp_file(suffix=".zarr") as tmp:
+            ds = self._create_nczarr(tmp)
+            with pytest.raises(
+                KeyError, match="missing the attribute `_ARRAY_DIMENSIONS`,"
+            ):
+                ds.to_zarr(tmp, mode=mode)

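As a usage note grounded in test_raise_writing_to_nczarr above, appending to an existing NCZarr store is expected to fail because the existing arrays carry no _ARRAY_DIMENSIONS attribute; a hedged sketch with hypothetical paths and data:

    import xarray as xr

    nczarr_store = "/tmp/example-nczarr.zarr"  # hypothetical existing NCZarr store
    new_data = xr.Dataset({"var2": ("dim1", [1, 2, 3])})

    try:
        new_data.to_zarr(nczarr_store, mode="a")
    except KeyError as err:
        # Expected with this change: the store's arrays lack `_ARRAY_DIMENSIONS`,
        # so xarray refuses to append.
        print(err)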