Appending to zarr store #2706
Changes from 5 commits
@@ -7,7 +7,8 @@
from ..core import indexing
from ..core.pycompat import integer_types
from ..core.utils import FrozenOrderedDict, HiddenKeyDict
from .common import AbstractWritableDataStore, BackendArray
from .common import AbstractWritableDataStore, BackendArray, \
    _encode_variable_name

# need some special secret attributes to tell us the dimensions
_DIMENSION_KEY = '_ARRAY_DIMENSIONS'
@@ -312,40 +313,99 @@ def encode_variable(self, variable):
    def encode_attribute(self, a):
        return _encode_zarr_attr_value(a)

    def prepare_variable(self, name, variable, check_encoding=False,
                         unlimited_dims=None):

        attrs = variable.attrs.copy()
        dims = variable.dims
        dtype = variable.dtype
        shape = variable.shape

        fill_value = attrs.pop('_FillValue', None)
        if variable.encoding == {'_FillValue': None} and fill_value is None:
            variable.encoding = {}

        encoding = _extract_zarr_variable_encoding(
            variable, raise_on_invalid=check_encoding)

        encoded_attrs = OrderedDict()
        # the magic for storing the hidden dimension data
        encoded_attrs[_DIMENSION_KEY] = dims
        for k, v in attrs.items():
            encoded_attrs[k] = self.encode_attribute(v)

        zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
                                    fill_value=fill_value, **encoding)
        zarr_array.attrs.put(encoded_attrs)

        return zarr_array, variable.data

    def store(self, variables, attributes, *args, **kwargs):
        AbstractWritableDataStore.store(self, variables, attributes,
                                        *args, **kwargs)
    def store(self, variables, attributes, check_encoding_set=frozenset(),
              writer=None, unlimited_dims=None):
        """
        Top level method for putting data on this store, this method:
          - encodes variables/attributes
          - sets dimensions
          - sets variables

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        attributes : dict-like
            Dictionary of key/value (attribute name / attribute) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        writer : ArrayWriter
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions.
        """

        variables, attributes = self.encode(variables, attributes)
Review discussion on this line:
- This is where the encoding from … If we wanted to make sure that the encoding of the new variables is compatible with the target store, we would have to peek at the target store encodings and explicitly put them in the new variable encoding.
- Will try doing that :) Will probably take a while, but I might be able to do that on Monday or Tuesday 👍
- I would even consider opening up the zarr store (into an xarray.Dataset) before doing any appending. Then it’s easy to decode all the metadata and ensure consistency of the appended data.
- I would try to avoid opening the whole zarr store for performance reasons and instead just try pulling the encodings from the array attributes. I think the only way to really solve this is adding the possibility to all CF encoders to use a specific encoding if one is passed.
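The sketch below is not part of this diff; it only illustrates the idea discussed above, assuming the zarr 2.x API of this era: peek at the arrays already present in the target zarr group and collect their on-disk encoding so it could be injected into the incoming variables before encoding runs. The helper name pull_target_encodings and the exact set of keys are assumptions, not xarray API.

import zarr

def pull_target_encodings(store, variable_names):
    """Collect the on-disk encoding of arrays that already exist in the store."""
    group = zarr.open_group(store, mode='r')
    encodings = {}
    for name in variable_names:
        if name in group:
            arr = group[name]
            encodings[name] = {
                'dtype': arr.dtype,
                'chunks': arr.chunks,
                'compressor': arr.compressor,
                'filters': arr.filters,
            }
    return encodings

These collected encodings could then be copied into each new variable's .encoding before self.encode(variables, attributes) runs, so appended chunks are written with the same dtype and compressor as the existing arrays.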
        self.set_attributes(attributes)
        self.set_dimensions(variables, unlimited_dims=unlimited_dims)
        self.set_variables(variables, check_encoding_set, writer,
                           unlimited_dims=unlimited_dims)

    def sync(self):
        pass

    def set_variables(self, variables, check_encoding_set, writer,
                      unlimited_dims=None, append_dim=None):
        """
        This provides a centralized method to set the variables on the data
        store.

        Parameters
        ----------
        variables : dict-like
            Dictionary of key/value (variable name / xr.Variable) pairs
        check_encoding_set : list-like
            List of variables that should be checked for invalid encoding
            values
        writer : ArrayWriter
        unlimited_dims : list-like
            List of dimension names that should be treated as unlimited
            dimensions.
        append_dim : str
            Dimension on which the zarr arrays will be appended;
            only needed in append mode.
        """
        for vn, v in variables.items():
            name = _encode_variable_name(vn)
            check = vn in check_encoding_set
            attrs = v.attrs.copy()
            dims = v.dims
            dtype = v.dtype
            shape = v.shape

            fill_value = attrs.pop('_FillValue', None)
            if v.encoding == {'_FillValue': None} and fill_value is None:
                v.encoding = {}
            append = False
            try:
                zarr_array = self.ds[name]
                append = True
            except KeyError:
                encoding = _extract_zarr_variable_encoding(
                    v, raise_on_invalid=check)
                encoded_attrs = OrderedDict()
                # the magic for storing the hidden dimension data
                encoded_attrs[_DIMENSION_KEY] = dims
                for k2, v2 in attrs.items():
                    encoded_attrs[k2] = self.encode_attribute(v2)
Review comment on this line:
- What if we pulled this attribute encoding out before the …
                zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
                                            fill_value=fill_value, **encoding)
                zarr_array.attrs.put(encoded_attrs)
                zarr_array[...] = v.data
            if append:
                if self.append_dim is None:
                    raise ValueError('The dimension on which the data is '
                                     'appended has to be named.')
Review comments on this line:
- What if we just want to add a new variable to an existing zarr store? This PR could hypothetically support that case as well, but in that case, there is no append_dim to specify.
- As mentioned in the other comment, it should already work, but I will add another test for it 👍
                if self.append_dim not in dims:
                    continue
                axis = dims.index(self.append_dim)
                zarr_array.append(v.data, axis=axis)

    def close(self):
        if self._consolidate_on_close:
            import zarr
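For readers unfamiliar with the zarr call used in set_variables above: zarr arrays can be grown in place along one axis with Array.append, which is the mechanism the append path relies on. A minimal standalone sketch follows (not from this PR; it assumes the zarr 2.x API of the era, and the array name, shapes, and values are illustrative only).

import numpy as np
import zarr

store = {}  # any MutableMapping can serve as a zarr store
root = zarr.group(store=store)
arr = root.create_dataset('temperature', shape=(2, 3), chunks=(1, 3),
                          dtype='f8', fill_value=np.nan)
arr[...] = np.arange(6, dtype='f8').reshape(2, 3)

# Grow the array by two rows along axis 0, mirroring
# zarr_array.append(v.data, axis=axis) in set_variables.
arr.append(np.full((2, 3), 9.0), axis=0)
print(arr.shape)  # (4, 3)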
@@ -1243,7 +1243,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
                         compute=compute)

    def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
                encoding=None, compute=True, consolidated=False):
                encoding=None, compute=True, consolidated=False,
                append_dim=None):
        """Write dataset contents to a zarr group.

        .. note:: Experimental
@@ -1254,9 +1255,10 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
        ----------
        store : MutableMapping or str, optional
            Store or path to directory in file system.
        mode : {'w', 'w-'}
        mode : {'w', 'w-', 'a'}
            Persistence mode: 'w' means create (overwrite if exists);
            'w-' means create (fail if exists).
            'w-' means create (fail if exists);
            'a' means append (create if does not exist).
        synchronizer : object, optional
            Array synchronizer
        group : str, optional
@@ -1271,21 +1273,23 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
        consolidated: bool, optional
            If True, apply zarr's `consolidate_metadata` function to the store
            after writing.
        append_dim: str
            If mode='a', the dimension along which the data will be appended.

        References
        ----------
        https://zarr.readthedocs.io/
        """
        if encoding is None:
            encoding = {}
        if mode not in ['w', 'w-']:
            # TODO: figure out how to handle 'r+' and 'a'
        if mode not in ['w', 'w-', 'a']:
            # TODO: figure out how to handle 'r+'
            raise ValueError("The only supported options for mode are 'w' "
                             "and 'w-'.")
        from ..backends.api import to_zarr
        return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
                       group=group, encoding=encoding, compute=compute,
                       consolidated=consolidated)
                       consolidated=consolidated, append_dim=append_dim)

    def __repr__(self):
        return formatting.dataset_repr(self)
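A hedged usage sketch of the public API this diff adds: write a dataset once, then grow the store along a record dimension with mode='a'. The store path, variable names, and append_dim='time' are illustrative assumptions, not taken from the PR, and the exact behavior is whatever the merged PR settles on.

import numpy as np
import pandas as pd
import xarray as xr

ds1 = xr.Dataset(
    {'temperature': (('time', 'x'), np.random.rand(3, 4))},
    coords={'time': pd.date_range('2019-01-01', periods=3),
            'x': np.arange(4)})
ds2 = xr.Dataset(
    {'temperature': (('time', 'x'), np.random.rand(2, 4))},
    coords={'time': pd.date_range('2019-01-04', periods=2),
            'x': np.arange(4)})

ds1.to_zarr('append_example.zarr', mode='w')
# What this PR adds: later writes can extend the existing store along `time`.
ds2.to_zarr('append_example.zarr', mode='a', append_dim='time')

combined = xr.open_zarr('append_example.zarr')
assert combined.sizes['time'] == 5

# The review thread on set_variables also raises adding a brand-new variable
# in append mode; under the behavior described there, no append_dim is needed
# because nothing in this dataset exists in the store yet.
ds3 = xr.Dataset({'humidity': (('x',), np.random.rand(4))})
ds3.to_zarr('append_example.zarr', mode='a')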
@@ -35,7 +35,7 @@
    requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
    requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
    requires_zarr)
from .test_dataset import create_test_data
from .test_dataset import create_test_data, create_append_test_data

try:
    import netCDF4 as nc4
@@ -1482,11 +1482,17 @@ def test_write_persistence_modes(self):
        with pytest.raises(ValueError):
            self.save(original, store, mode='w-')

        # check that we can't use other persistence modes
        # TODO: reconsider whether other persistence modes should be supported
        with pytest.raises(ValueError):
            with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual:
                pass
        # check append mode for normal write
        with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual:
            assert_identical(original, actual)

        # check append mode for append write
        obj, obj2 = create_append_test_data()
        with self.create_zarr_target() as store_target:
            obj.to_zarr(store_target, mode='w')
            obj2.to_zarr(store_target, mode='a', append_dim='dim1')
            original = xr.concat([obj, obj2], dim='dim1')
            assert_identical(original, xr.open_zarr(store_target))
Review comment:
- 👍 love this test 😄
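The create_append_test_data helper imported above is not shown in this diff; below is a purely hypothetical sketch of what it might return, chosen to match how the test uses it: two datasets that line up on every dimension except 'dim1', the one being appended. The variable names, shapes, and values are assumptions.

import numpy as np
import xarray as xr

def create_append_test_data():
    ds = xr.Dataset(
        {'var1': (('dim1', 'dim2'), np.random.rand(4, 3))},
        coords={'dim1': np.arange(4), 'dim2': np.arange(3)})
    ds_to_append = xr.Dataset(
        {'var1': (('dim1', 'dim2'), np.random.rand(2, 3))},
        coords={'dim1': np.arange(4, 6), 'dim2': np.arange(3)})
    return ds, ds_to_append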
    def test_compressor_encoding(self):
        original = create_test_data()
Review comment:
- Need to credit all the contributors to this PR.