From aa6d33331abe30ccedb28715c292a4a9ec1e79f5 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 16:17:17 +0100 Subject: [PATCH 1/6] move get_chunks from zarr to dateset and removed maybe_chunks in zarr --- xarray/core/dataset.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 04974c58113..3de7c2b30be 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,6 +359,36 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) +def _get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == "auto": + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + chunk_spec[dim] = chunks[dim] + return chunk_spec + + def _maybe_chunk( name, var, From 94365c5a6a8454d0d617ed7a2b60373396159757 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 16:17:17 +0100 Subject: [PATCH 2/6] move get_chunks from zarr to dateset and removed maybe_chunks in zarr --- xarray/backends/api.py | 4 ++-- xarray/backends/apiv2.py | 13 ++++++----- xarray/backends/zarr.py | 47 ---------------------------------------- xarray/core/dataset.py | 30 +++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 54 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 0b9b5046cb9..1ccdb5def3d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,7 @@ combine_by_coords, ) from ..core.dataarray import DataArray -from ..core.dataset import Dataset, _maybe_chunk +from ..core.dataset import Dataset, _maybe_chunk, _get_chunk from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -536,7 +536,7 @@ def maybe_decode_store(store, chunks): k: _maybe_chunk( k, v, - store.get_chunk(k, v, chunks), + _get_chunk(k, v, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, ) for k, v in ds.variables.items() diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 7e4605c42ce..f3a4812cf32 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,7 +1,8 @@ import os +from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri -from . import plugins, zarr +from . import plugins from .api import ( _autodetect_engine, _get_backend_cls, @@ -54,10 +55,12 @@ def dataset_from_backend_dataset( if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - variables = { - k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in ds.variables.items() - } + variables = {} + for k, v in ds.variables.items(): + var_chunks = _get_chunk(k, v, chunks) + variables[k] = _maybe_chunk( + k, v, var_chunks, overwrite_encoded_chunks=overwrite_encoded_chunks, + ) ds2 = ds._replace(variables) else: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9827c345239..22cf2b1fa7e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -368,53 +368,6 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - @staticmethod - def get_chunk(name, var, chunks): - chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) - - # Coordinate labels aren't chunked - if var.ndim == 1 and var.dims[0] == name: - return chunk_spec - - if chunks == "auto": - return chunk_spec - - for dim in var.dims: - if dim in chunks: - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if isinstance(spec, (tuple, list)) and chunk_spec[dim]: - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate Zarr chunk shape %r for " - "dimension %r. This significantly " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - chunk_spec[dim] = chunks[dim] - return chunk_spec - - @classmethod - def maybe_chunk(cls, name, var, chunks, overwrite_encoded_chunks): - chunk_spec = cls.get_chunk(name, var, chunks) - - if (var.ndim > 0) and (chunk_spec is not None): - from dask.base import tokenize - - # does this cause any data to be read? - token2 = tokenize(name, var._data, chunks) - name2 = f"xarray-{name}-{token2}" - var = var.chunk(chunk_spec, name=name2, lock=None) - if overwrite_encoded_chunks and var.chunks is not None: - var.encoding["chunks"] = tuple(x[0] for x in var.chunks) - return var - else: - return var - def store( self, variables, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 04974c58113..3de7c2b30be 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,6 +359,36 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) +def _get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) + + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + + if chunks == "auto": + return chunk_spec + + for dim in var.dims: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate Zarr chunk shape %r for " + "dimension %r. This significantly " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + chunk_spec[dim] = chunks[dim] + return chunk_spec + + def _maybe_chunk( name, var, From fb30b3e884fe94bba0a12fefcaf0441da802bd1b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 16:21:25 +0100 Subject: [PATCH 3/6] black --- xarray/backends/api.py | 2 +- xarray/backends/apiv2.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 1ccdb5def3d..f1d58813958 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,7 @@ combine_by_coords, ) from ..core.dataarray import DataArray -from ..core.dataset import Dataset, _maybe_chunk, _get_chunk +from ..core.dataset import Dataset, _get_chunk, _maybe_chunk from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index f3a4812cf32..e71437da8ab 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -59,7 +59,10 @@ def dataset_from_backend_dataset( for k, v in ds.variables.items(): var_chunks = _get_chunk(k, v, chunks) variables[k] = _maybe_chunk( - k, v, var_chunks, overwrite_encoded_chunks=overwrite_encoded_chunks, + k, + v, + var_chunks, + overwrite_encoded_chunks=overwrite_encoded_chunks, ) ds2 = ds._replace(variables) From ac714a7a7ee8b747b7c76983a4a4dece89aeb47c Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 30 Nov 2020 18:09:10 +0100 Subject: [PATCH 4/6] removed not used import --- xarray/backends/zarr.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 22cf2b1fa7e..f3c92d52303 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np from .. import coding, conventions From d4e286870386a2c18321d3caf24dc3ec1707cd71 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 1 Dec 2020 20:53:35 +0100 Subject: [PATCH 5/6] update warning message in get_chunks --- xarray/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3de7c2b30be..02c75918b83 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -378,8 +378,8 @@ def _get_chunk(name, var, chunks): if any(s % chunk_spec[dim] for s in spec): warnings.warn( "Specified Dask chunks %r would " - "separate Zarr chunk shape %r for " - "dimension %r. This significantly " + "separate on disks chunk shape %r for " + "dimension %r. This could " "degrades performance. Consider " "rechunking after loading instead." % (chunks[dim], chunk_spec[dim], dim), From ec31a67390f01d4f86cf99ee4c7d86c5fa549d96 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 2 Dec 2020 09:03:14 +0100 Subject: [PATCH 6/6] Reformat warning text to use f-strings --- xarray/core/dataset.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 02c75918b83..24792a96dfc 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -377,12 +377,10 @@ def _get_chunk(name, var, chunks): if isinstance(spec, (tuple, list)) and chunk_spec[dim]: if any(s % chunk_spec[dim] for s in spec): warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), + f"Specified Dask chunks {chunks[dim]} would separate " + f"on disks chunk shape {chunk_spec[dim]} for dimension {dim}. " + "This could degrade performance. " + "Consider rechunking after loading instead.", stacklevel=2, ) chunk_spec[dim] = chunks[dim]