Skip to content

Commit

Permalink
Add .chunksizes property (#5900)
Browse files Browse the repository at this point in the history
* added chunksizes property

* fix typing via Hashable->Any

* add chunksizes to API doc

* whatsnew

* grammar

* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>

* removed the word consistent

* test .chunksizes

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
  • Loading branch information
TomNicholas and dcherian authored Oct 29, 2021
1 parent bcb96ce commit 1d94b1e
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 21 deletions.
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Attributes
Dataset.indexes
Dataset.get_index
Dataset.chunks
Dataset.chunksizes
Dataset.nbytes

Dictionary interface
Expand Down Expand Up @@ -271,6 +272,7 @@ Attributes
DataArray.encoding
DataArray.indexes
DataArray.get_index
DataArray.chunksizes

**ndarray attributes**:
:py:attr:`~DataArray.ndim`
Expand Down
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ New Features
`Nathan Lis <https://github.com/wxman22>`_.
- Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
By `Maxime Liquet <https://github.com/maximlt>`_.
- Added new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
properties, which always return a mapping from dimension names to the chunking pattern along that dimension,
regardless of whether the object is a Dataset, DataArray, or Variable. (:issue:`5846`, :pull:`5900`)
By `Tom Nicholas <https://github.com/TomNicholas>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
17 changes: 17 additions & 0 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None):
return full_like(other, 1, dtype)


def get_chunksizes(
    variables: Iterable[Variable],
) -> Mapping[Any, Tuple[int, ...]]:
    """Collect the per-dimension chunk sizes shared by a set of variables.

    Parameters
    ----------
    variables : iterable of Variable
        Variables to inspect. A variable whose wrapped array has no
        ``chunks`` attribute is skipped.

    Returns
    -------
    Mapping
        Read-only (``Frozen``) mapping from dimension name to the tuple of
        block lengths along that dimension. Empty if no variable is chunked.

    Raises
    ------
    ValueError
        If two variables report different chunks along the same dimension.
    """
    chunks: Dict[Any, Tuple[int, ...]] = {}
    for v in variables:
        # Probe the wrapped array (``_data``) directly, exactly as
        # ``Variable.chunks`` and ``Variable.chunksizes`` do. Only an
        # attribute check is needed here.
        # NOTE(review): going through the public ``.data`` accessor may
        # coerce lazily-loaded arrays into memory -- confirm against
        # ``Variable.data``.
        if hasattr(v._data, "chunks"):
            for dim, c in v.chunksizes.items():
                if dim in chunks and c != chunks[dim]:
                    raise ValueError(
                        f"Object has inconsistent chunks along dimension {dim}. "
                        "This can be fixed by calling unify_chunks()."
                    )
                chunks[dim] = c
    return Frozen(chunks)


def is_np_datetime_like(dtype: DTypeLike) -> bool:
    """Return True if ``dtype`` is a numpy datetime64 or timedelta64 subtype."""
    datetime_kinds = (np.datetime64, np.timedelta64)
    return any(np.issubdtype(dtype, kind) for kind in datetime_kinds)
Expand Down
32 changes: 29 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
reindex_like_indexers,
)
from .arithmetic import DataArrayArithmetic
from .common import AbstractArray, DataWithCoords
from .common import AbstractArray, DataWithCoords, get_chunksizes
from .computation import unify_chunks
from .coordinates import (
DataArrayCoordinates,
Expand Down Expand Up @@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray":

@property
def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
"""Block dimensions for this array's data or None if it's not a dask
array.
"""
Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
the underlying data is not a dask array.
See Also
--------
DataArray.chunk
DataArray.chunksizes
xarray.unify_chunks
"""
return self.variable.chunks

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths for this dataarray's data
    and coordinates, as computed by ``get_chunksizes``.

    Cannot be modified directly, but can be modified by calling .chunk().

    Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes
    instead of a tuple of chunk shapes.

    See Also
    --------
    DataArray.chunk
    DataArray.chunks
    xarray.unify_chunks
    """
    # The data variable comes first, followed by every coordinate variable.
    variables = [self.variable]
    variables.extend(coord.variable for coord in self.coords.values())
    return get_chunksizes(variables)

def chunk(
self,
chunks: Union[
Expand Down
51 changes: 37 additions & 14 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
)
from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align
from .arithmetic import DatasetArithmetic
from .common import DataWithCoords, _contains_datetime_like_objects
from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes
from .computation import unify_chunks
from .coordinates import (
DatasetCoordinates,
Expand Down Expand Up @@ -2095,20 +2095,37 @@ def info(self, buf=None) -> None:

@property
def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
"""Block dimensions for this dataset's data or None if it's not a dask
array.
"""
chunks: Dict[Hashable, Tuple[int, ...]] = {}
for v in self.variables.values():
if v.chunks is not None:
for dim, c in zip(v.dims, v.chunks):
if dim in chunks and c != chunks[dim]:
raise ValueError(
f"Object has inconsistent chunks along dimension {dim}. "
"This can be fixed by calling unify_chunks()."
)
chunks[dim] = c
return Frozen(chunks)
Mapping from dimension names to block lengths for this dataset's data, or None if
the underlying data is not a dask array.
Cannot be modified directly, but can be modified by calling .chunk().
Same as Dataset.chunksizes, but maintained for backwards compatibility.
See Also
--------
Dataset.chunk
Dataset.chunksizes
xarray.unify_chunks
"""
return get_chunksizes(self.variables.values())

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths for this dataset's data,
    as computed by ``get_chunksizes`` over all of the dataset's variables.

    Cannot be modified directly, but can be modified by calling .chunk().

    Same as Dataset.chunks.

    See Also
    --------
    Dataset.chunk
    Dataset.chunks
    xarray.unify_chunks
    """
    all_variables = self.variables.values()
    return get_chunksizes(all_variables)

def chunk(
self,
Expand Down Expand Up @@ -2147,6 +2164,12 @@ def chunk(
Returns
-------
chunked : xarray.Dataset
See Also
--------
Dataset.chunks
Dataset.chunksizes
xarray.unify_chunks
"""
if chunks is None:
warnings.warn(
Expand Down
37 changes: 33 additions & 4 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
sparse_array_type,
)
from .utils import (
Frozen,
NdimSizeLenMixin,
OrderedSet,
_default,
Expand Down Expand Up @@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None):
__hash__ = None # type: ignore[assignment]

@property
def chunks(self):
"""Block dimensions for this array's data or None if it's not a dask
array.
def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
    """
    Tuple of block lengths for this variable's data, in order of dimensions, or None if
    the underlying data has no ``chunks`` attribute (e.g. it is not a dask array).

    See Also
    --------
    Variable.chunk
    Variable.chunksizes
    xarray.unify_chunks
    """
    wrapped = self._data
    try:
        return wrapped.chunks
    except AttributeError:
        # The wrapped array is not chunked.
        return None

@property
def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
    """
    Mapping from dimension names to block lengths for this variable's data.
    The mapping is empty if the underlying data has no ``chunks`` attribute
    (e.g. it is not a dask array).

    Cannot be modified directly, but can be modified by calling .chunk().

    Differs from Variable.chunks because it returns a mapping of dimensions to chunk shapes
    instead of a tuple of chunk shapes.

    See Also
    --------
    Variable.chunk
    Variable.chunks
    xarray.unify_chunks
    """
    if not hasattr(self._data, "chunks"):
        # Return a read-only mapping here too, so both branches honour the
        # "cannot be modified directly" contract.
        return Frozen({})
    # Read the chunks from the same wrapped array the check above inspected.
    return Frozen(dict(zip(self.dims, self._data.chunks)))

_array_counter = itertools.count()

def chunk(self, chunks={}, name=None, lock=False):
"""Coerce this array's data into a dask arrays with the given chunks.
"""Coerce this array's data into a dask array with the given chunks.
If this variable is a non-dask array, it will be converted to dask
array. If it's a dask array, it will be rechunked to the given chunk
Expand Down
37 changes: 37 additions & 0 deletions xarray/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ def test_chunk(self):
assert rechunked.chunks == expected
self.assertLazyAndIdentical(self.eager_var, rechunked)

expected_chunksizes = {
dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected)
}
assert rechunked.chunksizes == expected_chunksizes

def test_indexing(self):
u = self.eager_var
v = self.lazy_var
Expand Down Expand Up @@ -330,6 +335,38 @@ def setUp(self):
self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
)

def test_chunk(self):
    """Check .chunks and .chunksizes after rechunking a DataArray and a Dataset."""
    cases = [
        ({}, ((2, 2), (2, 2, 2))),
        (3, ((3, 1), (3, 3))),
        ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
        ({"x": 3}, ((3, 1), (2, 2, 2))),
        ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
    ]
    for spec, expected in cases:
        # Test DataArray
        rechunked = self.lazy_array.chunk(spec)
        assert rechunked.chunks == expected
        self.assertLazyAndIdentical(self.eager_array, rechunked)

        expected_chunksizes = dict(zip(self.lazy_array.dims, expected))
        assert rechunked.chunksizes == expected_chunksizes

        # Test Dataset
        lazy_dataset = self.lazy_array.to_dataset()
        eager_dataset = self.eager_array.to_dataset()
        expected_chunksizes = dict(zip(lazy_dataset.dims, expected))
        rechunked = lazy_dataset.chunk(spec)

        # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843
        assert rechunked.chunks == expected_chunksizes
        self.assertLazyAndIdentical(eager_dataset, rechunked)

        assert rechunked.chunksizes == expected_chunksizes

def test_rechunk(self):
chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
assert chunked.chunks == ((2,) * 2, (2,) * 3)
Expand Down

0 comments on commit 1d94b1e

Please sign in to comment.