Appending to zarr store #2706

Merged 45 commits on Jun 29, 2019
Changes from 5 commits
Commits (45)
f231393
Initial version of appending to zarr store
Jan 24, 2019
f14f3b7
Added docs
Jan 24, 2019
928440d
Resolve PEP8 incompliances
Jan 24, 2019
442e938
Added write and append test for mode 'a'
Jan 26, 2019
389ba43
Resolved conflicts with master
Jan 26, 2019
6097da2
Merge branch 'master' of https://github.com/pydata/xarray into append…
Jan 29, 2019
390a792
Merged repaired master
Jan 29, 2019
da9a962
Resolved pep8 issue
Jan 29, 2019
6756b8f
Put target store encoding in appended variable
davidbrochart Apr 3, 2019
95d5782
Merge with master
davidbrochart Apr 4, 2019
a750a92
Rewrite test with appending along time dimension
davidbrochart Apr 4, 2019
295084b
Add chunk_size parameter for rechunking appended coordinate
davidbrochart Apr 22, 2019
cc353e1
Merge remote-tracking branch 'upstream/master' into HEAD
davidbrochart Apr 22, 2019
e56a210
Merge remote-tracking branch 'upstream/master' into HEAD
davidbrochart May 21, 2019
519b398
Add chunk_dim test
davidbrochart May 21, 2019
c85aa98
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 4, 2019
3adfd49
Add type check and tests for it.
Jun 17, 2019
608813b
Add documentation
Jun 17, 2019
2078838
Add test for compute=False and commented it out
Jun 17, 2019
7a90ce8
Merge master
Jun 17, 2019
b8af5bd
Remove python 3.7 string formatting
Jun 17, 2019
5bee0dc
Fix PEP8 incompliance
Jun 17, 2019
7ed77ad
Merge branch 'append_zarr' of https://github.com/jendrikjoe/xarray in…
Jun 17, 2019
b4ff1c7
Add missing whitespaces
Jun 18, 2019
5b3f8ea
allowed for compute=False when appending to a zarr store
Jun 20, 2019
7564329
Fixed empty array data error
Jun 20, 2019
93be790
flake8 fixes
Jun 20, 2019
58c4b78
removed chunk_dim argument to to_zarr function
Jun 21, 2019
5316593
implemented requested changes
Jun 24, 2019
ad08c73
Update xarray/backends/api.py
shikharsg Jun 25, 2019
4d2122b
added contributors and example of using append to zarr
Jun 25, 2019
105ed39
Merge branch 'append_zarr' of https://github.com/jendrikjoe/xarray in…
Jun 25, 2019
af4a5a5
fixed docs fail
Jun 25, 2019
62d4f52
fixed docs
Jun 25, 2019
9558811
Merge branch 'master' into append_zarr
shikharsg Jun 25, 2019
9d70e02
removed unnecessary condition
Jun 25, 2019
a6ff494
attempt at clean string encoding and variable length strings
Jun 26, 2019
34b700f
implemented suggestions
Jun 26, 2019
3e54cb9
* append_dim does not need to be specified if creating a new array wi…
Jun 27, 2019
97ed25b
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 27, 2019
beb12e5
raise ValueError when append_dim is not a valid dimension
Jun 27, 2019
41a6ca3
flake8 fix
Jun 27, 2019
321aec1
removed unused comment
Jun 27, 2019
2b130ff
* raise error when appending with encoding provided for existing vari…
Jun 29, 2019
58de86d
Merge remote-tracking branch 'upstream/master' into append_zarr
Jun 29, 2019
1 change: 1 addition & 0 deletions doc/whats-new.rst
@@ -43,6 +43,7 @@ Enhancements
report showing what exactly differs between the two objects (dimensions /
coordinates / variables / attributes) (:issue:`1507`).
By `Benoit Bovy <https://github.com/benbovy>`_.
- Added append capability to the zarr store.
Contributor: Need to credit all the contributors to this PR.


Bug fixes
~~~~~~~~~
4 changes: 2 additions & 2 deletions xarray/backends/api.py
@@ -888,7 +888,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,


def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
encoding=None, compute=True, consolidated=False):
encoding=None, compute=True, consolidated=False, append_dim=None):
"""This function creates an appropriate datastore for writing a dataset to
a zarr store

@@ -907,7 +907,7 @@ def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
synchronizer=synchronizer,
group=group,
consolidate_on_close=consolidated)

zstore.append_dim = append_dim
writer = ArrayWriter()
# TODO: figure out how to properly handle unlimited_dims
dump_to_store(dataset, zstore, writer, encoding=encoding)
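For orientation, a minimal usage sketch of the mode='a' / append_dim path that this hunk threads through to the store (the store path and data are illustrative, not taken from the PR):

import numpy as np
import pandas as pd
import xarray as xr

# initial write creates the zarr group
ds = xr.Dataset({'temperature': (('time',), np.random.rand(3))},
                coords={'time': pd.date_range('2000-01-01', periods=3)})
ds.to_zarr('example.zarr', mode='w')

# later data is appended along the existing 'time' dimension
ds2 = xr.Dataset({'temperature': (('time',), np.random.rand(3))},
                 coords={'time': pd.date_range('2000-01-04', periods=3)})
ds2.to_zarr('example.zarr', mode='a', append_dim='time')
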
122 changes: 91 additions & 31 deletions xarray/backends/zarr.py
@@ -7,7 +7,8 @@
from ..core import indexing
from ..core.pycompat import integer_types
from ..core.utils import FrozenOrderedDict, HiddenKeyDict
from .common import AbstractWritableDataStore, BackendArray
from .common import AbstractWritableDataStore, BackendArray, \
_encode_variable_name

# need some special secret attributes to tell us the dimensions
_DIMENSION_KEY = '_ARRAY_DIMENSIONS'
@@ -312,40 +313,99 @@ def encode_variable(self, variable):
def encode_attribute(self, a):
return _encode_zarr_attr_value(a)

def prepare_variable(self, name, variable, check_encoding=False,
unlimited_dims=None):

attrs = variable.attrs.copy()
dims = variable.dims
dtype = variable.dtype
shape = variable.shape

fill_value = attrs.pop('_FillValue', None)
if variable.encoding == {'_FillValue': None} and fill_value is None:
variable.encoding = {}

encoding = _extract_zarr_variable_encoding(
variable, raise_on_invalid=check_encoding)

encoded_attrs = OrderedDict()
# the magic for storing the hidden dimension data
encoded_attrs[_DIMENSION_KEY] = dims
for k, v in attrs.items():
encoded_attrs[k] = self.encode_attribute(v)

zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
fill_value=fill_value, **encoding)
zarr_array.attrs.put(encoded_attrs)

return zarr_array, variable.data

def store(self, variables, attributes, *args, **kwargs):
AbstractWritableDataStore.store(self, variables, attributes,
*args, **kwargs)
def store(self, variables, attributes, check_encoding_set=frozenset(),
writer=None, unlimited_dims=None):
"""
Top level method for putting data on this store, this method:
- encodes variables/attributes
- sets dimensions
- sets variables

Parameters
----------
variables : dict-like
Dictionary of key/value (variable name / xr.Variable) pairs
attributes : dict-like
Dictionary of key/value (attribute name / attribute) pairs
check_encoding_set : list-like
List of variables that should be checked for invalid encoding
values
writer : ArrayWriter
unlimited_dims : list-like
List of dimension names that should be treated as unlimited
dimensions.
"""

variables, attributes = self.encode(variables, attributes)
Contributor: This is where the encoding from datetime64 to int64 with "days since ..." units happens.

If we wanted to make sure that the encoding of the new variables is compatible with the target store, we would have to peek at the target store encodings and explicitly put them in the new variable encoding.

Contributor (author): Will try doing that :) It will probably take a while, but I might be able to do it on Monday or Tuesday 👍

Contributor: This is not an easy problem. Advice from @shoyer and @jhamman would be valuable.

Member: I would even consider opening up the zarr store (into an xarray.Dataset) before doing any appending. Then it’s easy to decode all the metadata and ensure consistency of the appended data.

Contributor (author): I would try to avoid opening the whole zarr store for performance reasons and instead just try pulling the encodings from the array attributes. I think the only way to really solve this is to let all CF encoders use a specific encoding if one is passed. This would allow passing the encoding from https://github.com/pydata/xarray/blob/master/xarray/backends/zarr.py#L209 to https://github.com/pydata/xarray/blob/master/xarray/conventions.py#L204 and getting the correctly encoded array for the append. What do you think?
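
A minimal sketch of the approach suggested above, assuming the target is opened lazily with xarray and its per-variable encodings are copied onto the variables being appended (the helper name is made up, not part of this PR):

import xarray as xr

def encodings_from_target(store, ds_to_append):
    # Peek at the existing zarr store and copy the per-variable encodings
    # so the appended data gets encoded consistently with what is on disk.
    existing = xr.open_zarr(store)
    return {name: dict(existing[name].encoding)
            for name in ds_to_append.variables
            if name in existing.variables}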


self.set_attributes(attributes)
self.set_dimensions(variables, unlimited_dims=unlimited_dims)
self.set_variables(variables, check_encoding_set, writer,
unlimited_dims=unlimited_dims)

def sync(self):
pass

def set_variables(self, variables, check_encoding_set, writer,
unlimited_dims=None, append_dim=None):
"""
This provides a centralized method to set the variables on the data
store.

Parameters
----------
variables : dict-like
Dictionary of key/value (variable name / xr.Variable) pairs
check_encoding_set : list-like
List of variables that should be checked for invalid encoding
values
writer : ArrayWriter
unlimited_dims : list-like
List of dimension names that should be treated as unlimited
dimensions.
append_dim: str
dimension on which the zarray will be appended
only needed in append mode
"""
for vn, v in variables.items():
name = _encode_variable_name(vn)
check = vn in check_encoding_set
attrs = v.attrs.copy()
dims = v.dims
dtype = v.dtype
shape = v.shape

fill_value = attrs.pop('_FillValue', None)
if v.encoding == {'_FillValue': None} and fill_value is None:
v.encoding = {}
append = False
try:
zarr_array = self.ds[name]
append = True
except KeyError:
encoding = _extract_zarr_variable_encoding(
v, raise_on_invalid=check)
encoded_attrs = OrderedDict()
# the magic for storing the hidden dimension data
encoded_attrs[_DIMENSION_KEY] = dims
for k2, v2 in attrs.items():
encoded_attrs[k2] = self.encode_attribute(v2)
Contributor: What if we pulled this attribute encoding out before the try block? Then we could check encoded_attrs against zarr_array.attrs before appending.
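
A rough sketch of that suggestion, reusing the names from the diff above (hypothetical rearrangement, not part of this PR):

# inside ZarrStore.set_variables: build encoded_attrs before the try block
encoded_attrs = OrderedDict()
encoded_attrs[_DIMENSION_KEY] = dims
for k2, v2 in attrs.items():
    encoded_attrs[k2] = self.encode_attribute(v2)

try:
    zarr_array = self.ds[name]
    # compare the stored dimension names with the variable being appended
    if tuple(zarr_array.attrs.get(_DIMENSION_KEY, ())) != tuple(dims):
        raise ValueError("dimensions of %r do not match the target store"
                         % name)
    append = True
except KeyError:
    encoding = _extract_zarr_variable_encoding(v, raise_on_invalid=check)
    zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
                                fill_value=fill_value, **encoding)
    zarr_array.attrs.put(encoded_attrs)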


zarr_array = self.ds.create(name, shape=shape, dtype=dtype,
fill_value=fill_value, **encoding)
zarr_array.attrs.put(encoded_attrs)
zarr_array[...] = v.data
if append:
if self.append_dim is None:
raise ValueError('The dimension on which the data is \
appended has to be named.')
Contributor (@rabernat, Jan 29, 2019): What if we just want to add a new variable to an existing zarr store? This PR could hypothetically support that case as well, but in that case, there is no append_dim to specify.

Contributor (author): As mentioned in the other comment, it should already work, but I will add another test for it 👍

if self.append_dim not in dims:
continue
axis = dims.index(self.append_dim)
zarr_array.append(v.data, axis=axis)

def close(self):
if self._consolidate_on_close:
import zarr
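For reference, the zarr-level primitive the new append path relies on; a standalone sketch using the zarr package directly (shapes and chunking are arbitrary):

import numpy as np
import zarr

# create a small chunked array and grow it along axis 0 in place
z = zarr.zeros((3, 4), chunks=(2, 2), dtype='f8')
z[...] = np.random.rand(3, 4)
z.append(np.random.rand(2, 4), axis=0)
assert z.shape == (5, 4)
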
16 changes: 10 additions & 6 deletions xarray/core/dataset.py
@@ -1243,7 +1243,8 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
compute=compute)

def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
encoding=None, compute=True, consolidated=False):
encoding=None, compute=True, consolidated=False,
append_dim=None):
"""Write dataset contents to a zarr group.

.. note:: Experimental
@@ -1254,9 +1255,10 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
----------
store : MutableMapping or str, optional
Store or path to directory in file system.
mode : {'w', 'w-'}
mode : {'w', 'w-', 'a'}
Persistence mode: 'w' means create (overwrite if exists);
'w-' means create (fail if exists).
'w-' means create (fail if exists);
'a' means append (create if does not exist).
synchronizer : object, optional
Array synchronizer
group : str, optional
@@ -1271,21 +1273,23 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None,
consolidated: bool, optional
If True, apply zarr's `consolidate_metadata` function to the store
after writing.
append_dim: str
If mode='a', the dimension along which the data will be appended.

References
----------
https://zarr.readthedocs.io/
"""
if encoding is None:
encoding = {}
if mode not in ['w', 'w-']:
# TODO: figure out how to handle 'r+' and 'a'
if mode not in ['w', 'w-', 'a']:
# TODO: figure out how to handle 'r+'
raise ValueError("The only supported options for mode are 'w' "
"and 'w-'.")
Member: and 'a' now!

from ..backends.api import to_zarr
return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
group=group, encoding=encoding, compute=compute,
consolidated=consolidated)
consolidated=consolidated, append_dim=append_dim)

def __repr__(self):
return formatting.dataset_repr(self)
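Matching the docstring above, a hedged sketch of the 'a' semantics (create if the group does not exist yet, append otherwise; the path is illustrative and ds/ds2 are the datasets from the earlier sketch):

# first call on a fresh path: nothing exists yet, so 'a' behaves like a create
ds.to_zarr('fresh.zarr', mode='a')
# second call: the group exists, so data is appended along the named dimension
ds2.to_zarr('fresh.zarr', mode='a', append_dim='time')
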
18 changes: 12 additions & 6 deletions xarray/tests/test_backends.py
@@ -35,7 +35,7 @@
requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
requires_zarr)
from .test_dataset import create_test_data
from .test_dataset import create_test_data, create_append_test_data

try:
import netCDF4 as nc4
@@ -1482,11 +1482,17 @@ def test_write_persistence_modes(self):
with pytest.raises(ValueError):
self.save(original, store, mode='w-')

# check that we can't use other persistence modes
# TODO: reconsider whether other persistence modes should be supported
with pytest.raises(ValueError):
with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual:
pass
# check append mode for normal write
with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual:
assert_identical(original, actual)

# check append mode for append write
obj, obj2 = create_append_test_data()
with self.create_zarr_target() as store_target:
obj.to_zarr(store_target, mode='w')
obj2.to_zarr(store_target, mode='a', append_dim='dim1')
original = xr.concat([obj, obj2], dim='dim1')
assert_identical(original, xr.open_zarr(store_target))
Contributor: 👍 love this test 😄


def test_compressor_encoding(self):
original = create_test_data()
37 changes: 37 additions & 0 deletions xarray/tests/test_dataset.py
@@ -52,6 +52,43 @@ def create_test_data(seed=None):
return obj


def create_append_test_data(seed=None):
rs = np.random.RandomState(seed)
_vars = {'var1': ['dim1', 'dim2'],
'var2': ['dim1', 'dim2'],
'var3': ['dim3', 'dim1']}
_dims = {'dim1': 8, 'dim2': 9, 'dim3': 10}

obj = Dataset()
obj['time'] = ('time', pd.date_range('2000-01-01', periods=20))
obj['dim2'] = ('dim2', 0.5 * np.arange(_dims['dim2']))
obj['dim3'] = ('dim3', list('abcdefghij'))
for v, dims in sorted(_vars.items()):
data = rs.normal(size=tuple(_dims[d] for d in dims))
obj[v] = (dims, data, {'foo': 'variable'})
obj.coords['numbers'] = ('dim3', np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3],
dtype='int64'))
obj.encoding = {'foo': 'bar'}
assert all(objp.data.flags.writeable for objp in obj.variables.values())
_vars = {'var1': ['dim1', 'dim2'],
'var2': ['dim1', 'dim2'],
'var3': ['dim3', 'dim1']}
_dims = {'dim1': 8, 'dim2': 9, 'dim3': 10}

obj2 = Dataset()
obj2['time'] = ('time', pd.date_range('2000-01-01', periods=20))
obj2['dim2'] = ('dim2', 0.5 * np.arange(_dims['dim2']))
obj2['dim3'] = ('dim3', list('abcdefghij'))
for v, dims in sorted(_vars.items()):
data = rs.normal(size=tuple(_dims[d] for d in dims))
obj2[v] = (dims, data, {'foo': 'variable'})
obj2.coords['numbers'] = ('dim3', np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3],
dtype='int64'))
obj2.encoding = {'foo': 'bar'}
assert all(objp.data.flags.writeable for objp in obj2.variables.values())
return obj, obj2


def create_test_multiindex():
mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
names=('level_1', 'level_2'))
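As an aside, create_append_test_data above builds obj and obj2 with two identical blocks; a deduplicated sketch of the same helper (not part of the PR; behaviour is preserved because both datasets draw from the same RandomState):

def create_append_test_data(seed=None):
    rs = np.random.RandomState(seed)
    _vars = {'var1': ['dim1', 'dim2'],
             'var2': ['dim1', 'dim2'],
             'var3': ['dim3', 'dim1']}
    _dims = {'dim1': 8, 'dim2': 9, 'dim3': 10}

    def _make():
        # build one dataset; repeated calls reuse the same RandomState,
        # so the two datasets get different data
        ds = Dataset()
        ds['time'] = ('time', pd.date_range('2000-01-01', periods=20))
        ds['dim2'] = ('dim2', 0.5 * np.arange(_dims['dim2']))
        ds['dim3'] = ('dim3', list('abcdefghij'))
        for v, dims in sorted(_vars.items()):
            data = rs.normal(size=tuple(_dims[d] for d in dims))
            ds[v] = (dims, data, {'foo': 'variable'})
        ds.coords['numbers'] = ('dim3',
                                np.array([0, 1, 2, 0, 0, 1, 1, 2, 2, 3],
                                         dtype='int64'))
        ds.encoding = {'foo': 'bar'}
        assert all(v_.data.flags.writeable for v_ in ds.variables.values())
        return ds

    return _make(), _make()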