-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix zarr datetime64 chunks #8253
Changes from 16 commits
58ed18b
d3b5891
d5cf4b3
1831839
5e7ec10
61aca65
967c9f9
3b3cf7b
8a0ccd9
83ece89
f393375
8358fb5
b3bba6d
c670e95
a99c54e
2e043c4
818ba67
1dc9020
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -106,6 +106,24 @@ def __getitem__(self, key): | |||||
# could possibly have a work-around for 0d data here | ||||||
|
||||||
|
||||||
def _squeeze_var_chunks(var_chunks, name=None): | ||||||
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): | ||||||
raise ValueError( | ||||||
"Zarr requires uniform chunk sizes except for final chunk. " | ||||||
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. " | ||||||
"Consider rechunking using `chunk()`." | ||||||
) | ||||||
if any((chunks[0] < chunks[-1]) for chunks in var_chunks): | ||||||
raise ValueError( | ||||||
"Final chunk of Zarr array must be the same size or smaller " | ||||||
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}." | ||||||
"Consider either rechunking using `chunk()` or instead deleting " | ||||||
"or modifying `encoding['chunks']`." | ||||||
) | ||||||
# return the first chunk for each dimension | ||||||
return tuple(chunk[0] for chunk in var_chunks) | ||||||
|
||||||
|
||||||
def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): | ||||||
""" | ||||||
Given encoding chunks (possibly None or []) and variable chunks | ||||||
|
@@ -128,21 +146,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): | |||||
# while dask chunks can be variable sized | ||||||
# http://dask.pydata.org/en/latest/array-design.html#chunks | ||||||
if var_chunks and not enc_chunks: | ||||||
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks): | ||||||
raise ValueError( | ||||||
"Zarr requires uniform chunk sizes except for final chunk. " | ||||||
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. " | ||||||
"Consider rechunking using `chunk()`." | ||||||
) | ||||||
if any((chunks[0] < chunks[-1]) for chunks in var_chunks): | ||||||
raise ValueError( | ||||||
"Final chunk of Zarr array must be the same size or smaller " | ||||||
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}." | ||||||
"Consider either rechunking using `chunk()` or instead deleting " | ||||||
"or modifying `encoding['chunks']`." | ||||||
) | ||||||
# return the first chunk for each dimension | ||||||
return tuple(chunk[0] for chunk in var_chunks) | ||||||
return _squeeze_var_chunks(var_chunks, name=name) | ||||||
|
||||||
# from here on, we are dealing with user-specified chunks in encoding | ||||||
# zarr allows chunks to be an integer, in which case it uses the same chunk | ||||||
|
@@ -286,7 +290,8 @@ def extract_zarr_variable_encoding( | |||||
|
||||||
|
||||||
# Function below is copied from conventions.encode_cf_variable. | ||||||
# The only change is to raise an error for object dtypes. | ||||||
# The only change is to raise an error for object dtypes, and | ||||||
# add chunks to the encoding when dask arrays are converted to np. | ||||||
def encode_zarr_variable(var, needs_copy=True, name=None): | ||||||
""" | ||||||
Converts a Variable into a Variable which follows some | ||||||
|
@@ -307,6 +312,7 @@ def encode_zarr_variable(var, needs_copy=True, name=None): | |||||
out : Variable | ||||||
A variable which has been encoded as described above. | ||||||
""" | ||||||
original_chunks = var.chunks | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is kind of the crux. I cannot actually understand how / where the
xarray/xarray/core/variable.py Line 315 in 15328b6
The word I'm tempted to loop in @TomNicholas into this conversation, who recently refactored everything about how we handle chunked arrays, to help us sort through this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
xarray/xarray/namedarray/core.py Line 660 in e5d163a
Yes, the problem is that the encoder does not distinguish between dask and numpy arrays and always returns numpy arrays. I originally thought that was a mistake, but I wasn't so sure after I tried to change that. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. DateTimes are always cast to numpy using There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So are you saying that all datetime arrays are eagerly computed by the coding pipelines, even if they are Dask arrays? |
||||||
|
||||||
var = conventions.encode_cf_variable(var, name=name) | ||||||
|
||||||
|
@@ -317,6 +323,8 @@ def encode_zarr_variable(var, needs_copy=True, name=None): | |||||
var = coder.encode(var, name=name) | ||||||
var = coding.strings.ensure_fixed_length_bytes(var) | ||||||
|
||||||
if original_chunks and not var.chunks and "chunks" not in var.encoding: | ||||||
var.encoding["chunks"] = _squeeze_var_chunks(original_chunks, name=name) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I worry that fixing the issue this way reveals that our internal interfaces are leaky. It seems like a bandaid for a deeper problem. Why does |
||||||
return var | ||||||
|
||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we get a quick comment to explain what this function does?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
818ba67