diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 157282803cc..72665dfd5f3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -78,6 +78,13 @@ v2023.10.1 (19 Oct, 2023) This release updates our minimum numpy version in ``pyproject.toml`` to 1.22, consistent with our documentation below. +Bug fixes +~~~~~~~~~ + +- Fix bug where :py:meth:`Dataset.to_zarr` would modify chunks of datetime-like variables (:issue:`8230`, :pull:`8253`). + By `Mattia Almansi <https://github.com/malmans2>`_. + + .. _whats-new.2023.10.0: v2023.10.0 (19 Oct, 2023) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2b41fa5224e..0ea87856404 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -106,6 +106,34 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here +def _squeeze_var_chunks( + var_chunks: tuple[tuple[int, ...], ...], name=None +) -> tuple[int, ...]: + """ + Normalize chunks to tuple of integers. + + zarr chunks needs to be uniform for each array + http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + while dask chunks can be variable sized + http://dask.pydata.org/en/latest/array-design.html#chunks + """ + if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): + raise ValueError( + "Zarr requires uniform chunk sizes except for final chunk. " + f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. " + "Consider rechunking using `chunk()`." + ) + if any((chunks[0] < chunks[-1]) for chunks in var_chunks): + raise ValueError( + "Final chunk of Zarr array must be the same size or smaller " + f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}." + "Consider either rechunking using `chunk()` or instead deleting " + "or modifying `encoding['chunks']`." 
+ ) + # return the first chunk for each dimension + return tuple(chunk[0] for chunk in var_chunks) + + def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): """ Given encoding chunks (possibly None or []) and variable chunks @@ -123,26 +151,8 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): # if there are no chunks in encoding but there are dask chunks, we try to # use the same chunks in zarr - # However, zarr chunks needs to be uniform for each array - # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks - # while dask chunks can be variable sized - # http://dask.pydata.org/en/latest/array-design.html#chunks if var_chunks and not enc_chunks: - if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): - raise ValueError( - "Zarr requires uniform chunk sizes except for final chunk. " - f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. " - "Consider rechunking using `chunk()`." - ) - if any((chunks[0] < chunks[-1]) for chunks in var_chunks): - raise ValueError( - "Final chunk of Zarr array must be the same size or smaller " - f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}." - "Consider either rechunking using `chunk()` or instead deleting " - "or modifying `encoding['chunks']`." - ) - # return the first chunk for each dimension - return tuple(chunk[0] for chunk in var_chunks) + return _squeeze_var_chunks(var_chunks, name=name) # from here on, we are dealing with user-specified chunks in encoding # zarr allows chunks to be an integer, in which case it uses the same chunk @@ -286,7 +296,8 @@ def extract_zarr_variable_encoding( # Function below is copied from conventions.encode_cf_variable. -# The only change is to raise an error for object dtypes. +# The only change is to raise an error for object dtypes, and +# add chunks to the encoding when dask arrays are converted to np. 
def encode_zarr_variable(var, needs_copy=True, name=None): """ Converts an Variable into an Variable which follows some @@ -307,6 +318,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): out : Variable A variable which has been encoded as described above. """ + original_chunks = var.chunks var = conventions.encode_cf_variable(var, name=name) @@ -317,6 +329,8 @@ def encode_zarr_variable(var, needs_copy=True, name=None): var = coder.encode(var, name=name) var = coding.strings.ensure_fixed_length_bytes(var) + if original_chunks and not var.chunks and "chunks" not in var.encoding: + var.encoding["chunks"] = _squeeze_var_chunks(original_chunks, name=name) return var diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 73352c3f7e1..e2546af35ab 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2709,6 +2709,14 @@ def test_attributes(self, obj) -> None: with pytest.raises(TypeError, match=r"Invalid attribute in Dataset.attrs."): ds.to_zarr(store_target, **self.version_kwargs) + @requires_dask + def test_chunked_datetime64(self) -> None: + original = create_test_data().astype("datetime64[ns]").chunk(1) + with self.roundtrip(original, open_kwargs={"chunks": {}}) as actual: + for name, actual_var in actual.variables.items(): + assert original[name].chunks == actual_var.chunks + assert original.chunks == actual.chunks + def test_vectorized_indexing_negative_step(self) -> None: if not has_dask: pytest.xfail(