Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move get_chunks from zarr.py to dataset.py #4632

Merged
merged 7 commits into from
Dec 2, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
combine_by_coords,
)
from ..core.dataarray import DataArray
from ..core.dataset import Dataset, _maybe_chunk
from ..core.dataset import Dataset, _get_chunk, _maybe_chunk
from ..core.utils import close_on_error, is_grib_path, is_remote_uri
from .common import AbstractDataStore, ArrayWriter
from .locks import _get_scheduler
Expand Down Expand Up @@ -536,7 +536,7 @@ def maybe_decode_store(store, chunks):
k: _maybe_chunk(
k,
v,
store.get_chunk(k, v, chunks),
_get_chunk(k, v, chunks),
overwrite_encoded_chunks=overwrite_encoded_chunks,
)
for k, v in ds.variables.items()
Expand Down
16 changes: 11 additions & 5 deletions xarray/backends/apiv2.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os

from ..core.dataset import _get_chunk, _maybe_chunk
from ..core.utils import is_remote_uri
from . import plugins, zarr
from . import plugins
from .api import (
_autodetect_engine,
_get_backend_cls,
Expand Down Expand Up @@ -54,10 +55,15 @@ def dataset_from_backend_dataset(
if isinstance(chunks, int):
chunks = dict.fromkeys(ds.dims, chunks)

variables = {
k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks)
for k, v in ds.variables.items()
}
variables = {}
for k, v in ds.variables.items():
var_chunks = _get_chunk(k, v, chunks)
variables[k] = _maybe_chunk(
k,
v,
var_chunks,
overwrite_encoded_chunks=overwrite_encoded_chunks,
)
ds2 = ds._replace(variables)

else:
Expand Down
49 changes: 0 additions & 49 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import warnings

import numpy as np

from .. import coding, conventions
Expand Down Expand Up @@ -368,53 +366,6 @@ def encode_variable(self, variable):
def encode_attribute(self, a):
return encode_zarr_attr_value(a)

@staticmethod
def get_chunk(name, var, chunks):
chunk_spec = dict(zip(var.dims, var.encoding.get("chunks")))

# Coordinate labels aren't chunked
if var.ndim == 1 and var.dims[0] == name:
return chunk_spec

if chunks == "auto":
return chunk_spec

for dim in var.dims:
if dim in chunks:
spec = chunks[dim]
if isinstance(spec, int):
spec = (spec,)
if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
if any(s % chunk_spec[dim] for s in spec):
warnings.warn(
"Specified Dask chunks %r would "
"separate Zarr chunk shape %r for "
"dimension %r. This significantly "
"degrades performance. Consider "
"rechunking after loading instead."
% (chunks[dim], chunk_spec[dim], dim),
stacklevel=2,
)
chunk_spec[dim] = chunks[dim]
return chunk_spec

@classmethod
def maybe_chunk(cls, name, var, chunks, overwrite_encoded_chunks):
chunk_spec = cls.get_chunk(name, var, chunks)

if (var.ndim > 0) and (chunk_spec is not None):
from dask.base import tokenize

# does this cause any data to be read?
token2 = tokenize(name, var._data, chunks)
name2 = f"xarray-{name}-{token2}"
var = var.chunk(chunk_spec, name=name2, lock=None)
if overwrite_encoded_chunks and var.chunks is not None:
var.encoding["chunks"] = tuple(x[0] for x in var.chunks)
return var
else:
return var

def store(
self,
variables,
Expand Down
30 changes: 30 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,36 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None:
raise ValueError(msg % args)


def _get_chunk(name, var, chunks):
chunk_spec = dict(zip(var.dims, var.encoding.get("chunks")))

# Coordinate labels aren't chunked
if var.ndim == 1 and var.dims[0] == name:
return chunk_spec

if chunks == "auto":
return chunk_spec

for dim in var.dims:
if dim in chunks:
spec = chunks[dim]
if isinstance(spec, int):
spec = (spec,)
if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
if any(s % chunk_spec[dim] for s in spec):
warnings.warn(
"Specified Dask chunks %r would "
"separate Zarr chunk shape %r for "
"dimension %r. This significantly "
"degrades performance. Consider "
"rechunking after loading instead."
% (chunks[dim], chunk_spec[dim], dim),
alexamici marked this conversation as resolved.
Show resolved Hide resolved
stacklevel=2,
)
chunk_spec[dim] = chunks[dim]
return chunk_spec


def _maybe_chunk(
name,
var,
Expand Down