Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support in the "zarr" backend for reading NCZarr data #6420

Merged
merged 34 commits into from
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
dbf76a9
add support for NCZarr
malmans2 Mar 25, 2022
07a334f
restore original format
malmans2 Mar 25, 2022
d1e9120
add test_nczarr
malmans2 Mar 25, 2022
ee69a5c
better comment
malmans2 Mar 28, 2022
fd60da8
test reading with zarr
malmans2 Mar 28, 2022
e35d793
decode zarray
malmans2 Mar 28, 2022
eac9b3b
use public store and test nczarr only
malmans2 Mar 28, 2022
8af176c
restore tests
malmans2 Mar 28, 2022
ac32ac8
install netcdf-c fixing bug
malmans2 Mar 30, 2022
44ef220
add env
malmans2 Mar 30, 2022
b72e1d4
fix ci
malmans2 Mar 30, 2022
fd84283
try build netcdf-c on windows
malmans2 Mar 30, 2022
3355eb7
fix typo
malmans2 Mar 30, 2022
9af4401
install netcdf-c first
malmans2 Mar 30, 2022
12ef991
install netcdf-c dep with conda
malmans2 Mar 30, 2022
71eca46
fix ci
malmans2 Mar 30, 2022
d3e9182
try win env again
malmans2 Mar 30, 2022
316153b
fix Nan in tests
malmans2 Mar 30, 2022
978f753
edit zarray
malmans2 Mar 30, 2022
5be903b
loop over all variables
malmans2 Mar 30, 2022
f520e7f
edit Nan in zattrs and zarray
malmans2 Mar 30, 2022
b5609a1
check path exists
malmans2 Mar 30, 2022
bded882
Merge branch 'main' into nczarr
malmans2 Mar 30, 2022
3a22ac8
must use netcdf-c>=4.8.1
malmans2 Mar 30, 2022
7f19413
skip 4.8.1 and Windows
malmans2 Mar 30, 2022
b5704bd
revisions
malmans2 Apr 7, 2022
8ee5d19
Merge branch 'main' into nczarr
malmans2 Apr 7, 2022
2c12935
better testing
malmans2 Apr 7, 2022
286d72c
revisions
malmans2 Apr 9, 2022
c5bde72
Merge branch 'main' into nczarr
malmans2 Apr 9, 2022
b823675
add what's new
malmans2 Apr 9, 2022
b6cfad3
update docs
malmans2 Apr 10, 2022
eb92cde
[skip ci] Mention netCDF and GDAL in user-guide
malmans2 Apr 11, 2022
470210a
[skip ci] reword
malmans2 Apr 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 44 additions & 24 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
import warnings

Expand Down Expand Up @@ -178,19 +179,37 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
raise AssertionError("We should never get here. Function logic must be wrong.")


def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr):
# Zarr arrays do not have dimensions. To get around this problem, we add
# an attribute that specifies the dimension. We have to hide this attribute
# when we send the attributes to the user.
# zarr_obj can be either a zarr group or zarr array
try:
# Xarray-Zarr
dimensions = zarr_obj.attrs[dimension_key]
except KeyError:
raise KeyError(
f"Zarr object is missing the attribute `{dimension_key}`, which is "
"required for xarray to determine variable dimensions."
)
attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key])
except KeyError as e:
if not try_nczarr:
raise KeyError(
f"Zarr object is missing the attribute `{dimension_key}`, which is "
"required for xarray to determine variable dimensions."
) from e

# NCZarr defines dimensions through metadata in .zarray
zarray_path = os.path.join(zarr_obj.path, ".zarray")
zarray = json.loads(zarr_obj.store[zarray_path])
try:
# NCZarr uses Fully Qualified Names
dimensions = [
os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
]
except KeyError as e:
raise KeyError(
f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
"which are required for xarray to determine variable dimensions."
) from e

nc_attrs = [attr for attr in zarr_obj.attrs if attr.startswith("_NC")]
attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs)
return dimensions, attributes


Expand Down Expand Up @@ -409,7 +428,10 @@ def ds(self):

def open_store_variable(self, name, zarr_array):
data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self))
dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY)
try_nczarr = self._mode == "r"
dimensions, attributes = _get_zarr_dims_and_attrs(
zarr_array, DIMENSION_KEY, try_nczarr
)
attributes = dict(attributes)
encoding = {
"chunks": zarr_array.chunks,
Expand All @@ -430,26 +452,24 @@ def get_variables(self):
)

def get_attrs(self):
    """Return the group-level attributes as a plain ``dict``.

    Keys starting with ``_NC`` are left out so that they are not exposed
    as user attributes; everything else from
    ``self.zarr_group.attrs.asdict()`` is returned unchanged.
    """
    return {
        k: v
        for k, v in self.zarr_group.attrs.asdict().items()
        if not k.startswith("_NC")
    }

malmans2 marked this conversation as resolved.
Show resolved Hide resolved
def get_dimensions(self):
    """Return a mapping from dimension name to length for the whole group.

    The dimension names of every array in ``self.zarr_group`` are resolved
    with ``_get_zarr_dims_and_attrs`` and paired with the array's shape.
    The NCZarr fallback is only attempted when the store was opened with
    mode ``"r"``.

    Raises
    ------
    ValueError
        If two arrays disagree on the length of a shared dimension.
    KeyError
        Propagated from ``_get_zarr_dims_and_attrs`` when an array carries
        no usable dimension metadata.
    """
    try_nczarr = self._mode == "r"
    dimensions = {}
    for _name, v in self.zarr_group.arrays():
        # Single code path for both xarray-style and NCZarr-style metadata
        dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr)
        for d, s in zip(dim_names, v.shape):
            if d in dimensions and dimensions[d] != s:
                raise ValueError(
                    f"found conflicting lengths for dimension {d} "
                    f"({s} != {dimensions[d]})"
                )
            dimensions[d] = s
    return dimensions

def set_dimensions(self, variables, unlimited_dims=None):
Expand Down
49 changes: 49 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import math
import os.path
import pickle
import platform
import re
import shutil
import sys
Expand Down Expand Up @@ -5427,3 +5428,51 @@ def test_write_file_from_np_str(str_type, tmpdir) -> None:
txr = tdf.to_xarray()

txr.to_netcdf(tmpdir.join("test.nc"))


@requires_zarr
@requires_netCDF4
class TestNCZarr:
    """Tests for opening and overwriting NCZarr stores written by netcdf-c."""

    @staticmethod
    def _create_nczarr(filename):
        """Write the test dataset to ``filename`` as NCZarr and return it.

        The calling test is skipped when netcdf-c is older than 4.8.1, or
        when it is exactly 4.8.1 on Windows.
        """
        minimum = Version("4.8.1")
        netcdfc_version = Version(nc4.getlibversion().split()[0])
        if netcdfc_version < minimum:
            pytest.skip("requires netcdf-c>=4.8.1")
        on_windows = platform.system() == "Windows"
        if on_windows and netcdfc_version == minimum:
            # Bug in netcdf-c==4.8.1 (typo: Nan instead of NaN)
            # https://github.com/Unidata/netcdf-c/issues/2265
            pytest.skip("netcdf-c==4.8.1 has issues on Windows")

        # Drop dim3: netcdf-c does not support dtype='<U1'
        # https://github.com/Unidata/netcdf-c/issues/2259
        ds = create_test_data().drop_vars("dim3")

        # netcdf-c>4.8.1 will add _ARRAY_DIMENSIONS by default
        if netcdfc_version == minimum:
            mode = "nczarr"
        else:
            mode = "nczarr,noxarray"
        ds.to_netcdf(f"file://{filename}#mode={mode}")
        return ds

    def test_open_nczarr(self):
        """An NCZarr store opened with ``open_zarr`` matches the written data."""
        with create_tmp_file(suffix=".zarr") as tmp:
            written = self._create_nczarr(tmp)
            reopened = xr.open_zarr(tmp, consolidated=False)
            assert_identical(written, reopened)

    def test_overwriting_nczarr(self):
        """Writing with ``mode="w"`` over an NCZarr store round-trips."""
        with create_tmp_file(suffix=".zarr") as tmp:
            original = self._create_nczarr(tmp)
            subset = original[["var1"]]
            subset.to_zarr(tmp, mode="w")
            reopened = xr.open_zarr(tmp, consolidated=False)
            assert_identical(subset, reopened)

    @pytest.mark.parametrize("mode", ["a", "r+"])
    @pytest.mark.filterwarnings("ignore:.*non-consolidated metadata.*")
    def test_raise_writing_to_nczarr(self, mode):
        """Writing to an NCZarr store with mode "a" or "r+" raises ``KeyError``."""
        expected_message = "missing the attribute `_ARRAY_DIMENSIONS`,"
        with create_tmp_file(suffix=".zarr") as tmp:
            ds = self._create_nczarr(tmp)
            with pytest.raises(KeyError, match=expected_message):
                ds.to_zarr(tmp, mode=mode)