diff --git a/doc/Makefile b/doc/Makefile
index c028a6116fa..4f9678caeba 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -4,6 +4,7 @@
 # You can set these variables from the command line.
 SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
+SPHINXAUTOBUILD = sphinx-autobuild
 PAPER         =
 BUILDDIR      = _build
@@ -24,6 +25,7 @@ I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 help:
 	@echo "Please use \`make <target>' where <target> is one of"
 	@echo "  html       to make standalone HTML files"
+	@echo "  livehtml   to make and auto-rebuild standalone HTML files, requires sphinx-autobuild"
 	@echo "  dirhtml    to make HTML files named index.html in directories"
 	@echo "  singlehtml to make a single large HTML file"
 	@echo "  pickle     to make pickle files"
@@ -55,6 +57,11 @@ html:
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 
+livehtml:
+	$(SPHINXAUTOBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
 dirhtml:
 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 	@echo
diff --git a/doc/api.rst b/doc/api.rst
index c1e2d447b0a..0c041972fe4 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -17,6 +17,7 @@ Top-level functions
    align
    broadcast
    concat
+   empty_like
    set_options
 
 Dataset
@@ -245,6 +246,7 @@ Computation
 
    DataArray.reduce
    DataArray.groupby
+   DataArray.rolling
    DataArray.resample
    DataArray.get_axis_num
    DataArray.diff
diff --git a/doc/computation.rst b/doc/computation.rst
index da0d9363883..cc048301407 100644
--- a/doc/computation.rst
+++ b/doc/computation.rst
@@ -99,6 +99,49 @@ These operations automatically skip missing values, like in pandas:
 If desired, you can disable this behavior by invoking the aggregation
 method with ``skipna=False``.
 
+Rolling window operations
+=========================
+
+``DataArray`` objects include a :py:meth:`~xarray.DataArray.rolling` method. This
+method supports rolling window aggregation:
+
+.. ipython:: python
+
+    arr = xr.DataArray(np.arange(0, 7.5, 0.5).reshape(3, 5),
+                       dims=('x', 'y'))
+    arr
+
+:py:meth:`~xarray.DataArray.rolling` is applied along one dimension using the
+name of the dimension as a key (e.g. ``y``) and the window size as the value
+(e.g. ``3``). We get back a ``Rolling`` object:
+
+.. ipython:: python
+
+    arr.rolling(y=3)
+
+The label position and minimum number of periods in the rolling window are
+controlled by the ``center`` and ``min_periods`` arguments:
+
+.. ipython:: python
+
+    arr.rolling(y=3, min_periods=2, center=True)
+
+Aggregation and summary methods can be applied directly to the ``Rolling`` object:
+
+.. ipython:: python
+
+    r = arr.rolling(y=3)
+    r.mean()
+    r.reduce(np.std)
+
+Finally, we can manually iterate through ``Rolling`` objects:
+
+.. ipython:: python
+
+    @verbatim
+    for label, arr_window in r:
+        # arr_window is a view of arr
+
 Broadcasting by dimension name
 ==============================
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ac44112cd87..a762f1b54a2 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -19,6 +19,37 @@ v0.7.2 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
+- Rolling window operations on DataArray objects are now supported via a new
+  :py:meth:`xarray.DataArray.rolling` method.
+
+  .. ipython::
+     :verbatim:
+
+     In [1]: import xarray as xr; import numpy as np
+
+     In [2]: arr = xr.DataArray(np.arange(0, 7.5, 0.5).reshape(3, 5),
+                                dims=('x', 'y'))
+
+     In [3]: arr
+     Out[3]:
+     <xarray.DataArray (x: 3, y: 5)>
+     array([[ 0. ,  0.5,  1. ,  1.5,  2. ],
+            [ 2.5,  3. ,  3.5,  4. ,  4.5],
+            [ 5. ,  5.5,  6. ,  6.5,  7. ]])
+     Coordinates:
+       * x        (x) int64 0 1 2
+       * y        (y) int64 0 1 2 3 4
+
+     In [4]: arr.rolling(y=3, min_periods=2).mean()
+     Out[4]:
+     <xarray.DataArray (x: 3, y: 5)>
+     array([[ nan,  0.25,  0.5 ,  1.  ,  1.5 ],
+            [ nan,  2.75,  3.  ,  3.5 ,  4.  ],
+            [ nan,  5.25,  5.5 ,  6.  ,  6.5 ]])
+     Coordinates:
+       * x        (x) int64 0 1 2
+       * y        (y) int64 0 1 2 3 4
+
 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index e228932c022..1fb22c7b4db 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -110,7 +110,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
         f = _dataset_concat
     else:
         raise TypeError('can only concatenate xarray Dataset and DataArray '
-                        'objects')
+                        'objects, got %s' % type(first_obj))
     return f(objs, dim, data_vars, coords, compat, positions)
diff --git a/xarray/core/common.py b/xarray/core/common.py
index 2ebc28a3c82..ecf2946a636 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from .pycompat import basestring, iteritems, suppress
+from .pycompat import basestring, iteritems, suppress, dask_array_type
 from . import formatting
 from .utils import SortedKeysDict
@@ -52,6 +52,63 @@ def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):
         applied over all dimensions."""
 
 
+class ImplementsRollingArrayReduce(object):
+    @classmethod
+    def _reduce_method(cls, func):
+        def wrapped_func(self, **kwargs):
+            return self.reduce(func, **kwargs)
+        return wrapped_func
+
+    @classmethod
+    def _bottleneck_reduce(cls, func):
+        def wrapped_func(self, **kwargs):
+            from .dataarray import DataArray
+
+            if isinstance(self.obj.data, dask_array_type):
+                raise NotImplementedError(
+                    'Rolling window operation does not work with dask arrays')
+
+            # bottleneck doesn't allow min_count to be 0, although it should
+            # work the same as if min_count = 1
+            if self.min_periods is not None and self.min_periods == 0:
+                min_count = self.min_periods + 1
+            else:
+                min_count = self.min_periods
+
+            values = func(self.obj.data, window=self.window,
+                          min_count=min_count, axis=self._axis_num)
+
+            result = DataArray(values, self.obj.coords)
+
+            if self.center:
+                result = self._center_result(result)
+
+            return result
+        return wrapped_func
+
+    @classmethod
+    def _bottleneck_reduce_without_min_count(cls, func):
+        def wrapped_func(self, **kwargs):
+            from .dataarray import DataArray
+
+            if self.min_periods is not None:
+                raise ValueError('Rolling.median does not accept min_periods')
+
+            if isinstance(self.obj.data, dask_array_type):
+                raise NotImplementedError(
+                    'Rolling window operation does not work with dask arrays')
+
+            values = func(self.obj.data, window=self.window, axis=self._axis_num)
+
+            result = DataArray(values, self.obj.coords)
+
+            if self.center:
+                result = self._center_result(result)
+
+            return result
+        return wrapped_func
+
+
 class AbstractArray(ImplementsArrayReduce):
     def __bool__(self):
         return bool(self.values)
@@ -286,6 +343,31 @@ def groupby(self, group, squeeze=True):
             group = self[group]
         return self.groupby_cls(self, group, squeeze=squeeze)
 
+    def rolling(self, min_periods=None, center=False, **kwarg):
+        """Returns a Rolling object for performing moving window operations.
+
+        Parameters
+        ----------
+        min_periods : int, default None
+            Minimum number of observations in window required to have a value
+            (otherwise result is NA).
+        center : boolean, default False
+            Set the labels at the center of the window.
+        kwarg : dim=window
+            dim : str
+                Name of the dimension to create the rolling iterator
+                along (e.g., `time`).
+            window : int
+                Size of the moving window.
+
+        Returns
+        -------
+        rolling : type of input argument
+        """
+
+        return self.rolling_cls(self, min_periods=min_periods,
+                                center=center, **kwarg)
+
     def resample(self, freq, dim, how='mean', skipna=None, closed=None,
                  label=None, base=0):
         """Resample this object to a new temporal resolution.
@@ -435,3 +517,64 @@ def _possibly_convert_objects(values):
     datetime64 and timedelta64, according to the pandas convention.
     """
     return np.asarray(pd.Series(values.ravel())).reshape(values.shape)
+
+
+def _get_fill_value(dtype):
+    """Return a fill value that appropriately promotes types when used with
+    np.concatenate
+    """
+    _, fill_value = _maybe_promote(dtype)
+    return fill_value
+
+
+def _full_like_dataarray(arr, keep_attrs=False, fill_value=None):
+    """empty DataArray"""
+    from .dataarray import DataArray
+
+    attrs = arr.attrs if keep_attrs else {}
+
+    if fill_value is None:
+        values = np.empty_like(arr)
+    elif fill_value is True:
+        dtype, fill_value = _maybe_promote(arr.dtype)
+        values = np.full_like(arr, fill_value=fill_value, dtype=dtype)
+    else:
+        dtype, _ = _maybe_promote(np.array(fill_value).dtype)
+        values = np.full_like(arr, fill_value=fill_value, dtype=dtype)
+
+    return DataArray(values, dims=arr.dims, coords=arr.coords, attrs=attrs)
+
+
+def _full_like(xray_obj, keep_attrs=False, fill_value=None):
+    """Return a new object with the same shape and type as a given object.
+
+    Parameters
+    ----------
+    xray_obj : DataArray or Dataset
+        Return a full object with the same shape/dims/coords/attrs.
+        `func` is calculated over all dimensions for each group item.
+    keep_attrs : bool, optional
+        If True, the dataset's attributes (`attrs`) will be copied from
+        the original object to the new one. If False (default), the new
+        object will be returned without attributes.
+    fill_value : scalar, optional
+        Value to fill DataArray(s) with before returning.
+
+    Returns
+    -------
+    out : same as xray_obj
+        New object with the same shape and type as a given object.
+    """
+    from .dataarray import DataArray
+    from .dataset import Dataset
+
+    if isinstance(xray_obj, Dataset):
+        attrs = xray_obj.attrs if keep_attrs else {}
+
+        return Dataset(dict((k, _full_like_dataarray(v, keep_attrs=keep_attrs,
+                                                     fill_value=fill_value))
+                            for k, v in iteritems(xray_obj.data_vars)),
+                       name=xray_obj.name, attrs=attrs)
+    elif isinstance(xray_obj, DataArray):
+        return _full_like_dataarray(xray_obj, keep_attrs=keep_attrs,
+                                    fill_value=fill_value)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 1e3e88a3268..e1a47d7688d 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -9,6 +9,7 @@
 from . import indexing
 from . import groupby
+from . import rolling
 from . import ops
 from . import utils
 from .alignment import align
@@ -152,6 +153,7 @@ class DataArray(AbstractArray, BaseDataObject):
         Dictionary for holding arbitrary metadata.
""" groupby_cls = groupby.DataArrayGroupBy + rolling_cls = rolling.DataArrayRolling def __init__(self, data, coords=None, dims=None, name=None, attrs=None, encoding=None, fastpath=False): diff --git a/xarray/core/ops.py b/xarray/core/ops.py index e28df7b0d6c..1ea3a34e7fe 100644 --- a/xarray/core/ops.py +++ b/xarray/core/ops.py @@ -17,9 +17,11 @@ try: import bottleneck as bn + has_bottleneck = True except ImportError: # use numpy methods instead bn = np + has_bottleneck = False try: import dask.array as da @@ -46,6 +48,9 @@ REDUCE_METHODS = ['all', 'any'] NAN_REDUCE_METHODS = ['argmax', 'argmin', 'max', 'min', 'mean', 'prod', 'sum', 'std', 'var', 'median'] +BOTTLENECK_ROLLING_METHODS = {'move_sum': 'sum', 'move_mean': 'mean', + 'move_std': 'std', 'move_min': 'min', + 'move_max': 'max'} # TODO: wrap cumprod/cumsum, take, dot, sort @@ -465,3 +470,29 @@ def inject_all_ops_and_reduce_methods(cls, priority=50, array_only=True): setattr(cls, name, _values_method_wrapper(name)) inject_reduce_methods(cls) + + +def inject_bottleneck_rolling_methods(cls): + # standard numpy reduce methods + for name in NAN_REDUCE_METHODS: + f = getattr(np, name) + func = cls._reduce_method(f) + func.__name__ = name + func.__doc__ = 'todo' + setattr(cls, name, func) + + # bottleneck rolling methods + if has_bottleneck: + for bn_name, method_name in BOTTLENECK_ROLLING_METHODS.items(): + f = getattr(bn, bn_name) + func = cls._bottleneck_reduce(f) + func.__name__ = method_name + func.__doc__ = 'todo' + setattr(cls, method_name, func) + + # bottleneck rolling methods without min_count + f = getattr(bn, 'move_median') + func = cls._bottleneck_reduce_without_min_count(f) + func.__name__ = 'median' + func.__doc__ = 'todo' + setattr(cls, 'median', func) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py new file mode 100644 index 00000000000..793ec23e370 --- /dev/null +++ b/xarray/core/rolling.py @@ -0,0 +1,179 @@ +import numpy as np + +from .pycompat import OrderedDict, zip +from .common import ImplementsRollingArrayReduce, _full_like +from .combine import concat +from .ops import inject_bottleneck_rolling_methods + + +class Rolling(object): + """A object that implements the moving window pattern. + + See Also + -------- + Dataset.groupby + DataArray.groupby + Dataset.rolling + DataArray.rolling + """ + + _attributes = ['window', 'min_periods', 'center', 'dim'] + + def __init__(self, obj, min_periods=None, center=False, **windows): + """ + Moving window object. + + Parameters + ---------- + obj : Dataset or DataArray + Object to window. + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). The default, None, is equivalent to + setting min_periods equal to the size of the window. + center : boolean, default False + Set the labels at the center of the window. + **windows : dim=window + dim : str + Name of the dimension to create the rolling iterator + along (e.g., `time`). + window : int + Size of the moving window. 
+
+        Returns
+        -------
+        rolling : type of input argument
+        """
+
+        if len(windows) != 1:
+            raise ValueError('exactly one dim/window should be provided')
+
+        dim, window = next(iter(windows.items()))
+
+        if window <= 0:
+            raise ValueError('window must be > 0')
+
+        self.obj = obj
+
+        # attributes
+        self.window = window
+        self.min_periods = min_periods
+        self.center = center
+        self.dim = dim
+
+        self._axis_num = self.obj.get_axis_num(self.dim)
+
+        self._windows = None
+        self._valid_windows = None
+        self.window_indices = None
+        self.window_labels = None
+
+        self._setup_windows(min_periods=min_periods, center=center)
+
+    def __repr__(self):
+        """provide a nice str repr of our rolling object"""
+
+        attrs = ["{k}->{v}".format(k=k, v=getattr(self, k))
+                 for k in self._attributes if getattr(self, k, None) is not None]
+        return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
+                                          attrs=','.join(attrs))
+
+    @property
+    def windows(self):
+        if self._windows is None:
+            self._windows = OrderedDict(zip(self.window_labels,
+                                            self.window_indices))
+        return self._windows
+
+    def __len__(self):
+        return len(self.obj[self.dim])
+
+    def __iter__(self):
+        for (label, indices, valid) in zip(self.window_labels,
+                                           self.window_indices,
+                                           self._valid_windows):
+
+            window = self.obj.isel(**{self.dim: indices})
+
+            if not valid:
+                window = _full_like(window, fill_value=True)
+
+            yield (label, window)
+
+    def _setup_windows(self, min_periods=None, center=False):
+        """
+        Find the indices and labels for each window
+        """
+        from .dataarray import DataArray
+
+        self.window_labels = self.obj[self.dim]
+
+        window = int(self.window)
+        if min_periods is None:
+            min_periods = window
+
+        dim_size = self.obj[self.dim].size
+
+        stops = np.arange(dim_size) + 1
+        starts = np.maximum(stops - window, 0)
+
+        if min_periods > 1:
+            valid_windows = (stops - starts) >= min_periods
+        else:
+            # No invalid windows
+            valid_windows = np.ones(dim_size, dtype=bool)
+        self._valid_windows = DataArray(valid_windows, dims=(self.dim, ),
+                                        coords=self.obj[self.dim].coords)
+
+        self.window_indices = [slice(start, stop)
+                               for start, stop in zip(starts, stops)]
+
+    def _center_result(self, result):
+        """center result"""
+        shift = (-self.window // 2) + 1
+        return result.shift(**{self.dim: shift})
+
+    def reduce(self, func, **kwargs):
+        """Reduce the items in this group by applying `func` along some
+        dimension(s).
+
+        Parameters
+        ----------
+        func : function
+            Function which can be called in the form
+            `func(x, **kwargs)` to return the result of collapsing an
+            np.ndarray over the rolling dimension.
+        **kwargs : dict
+            Additional keyword arguments passed on to `func`.
+
+        Returns
+        -------
+        reduced : DataArray
+            Array with summarized data.
+        """
+        from .dataarray import DataArray
+
+        windows = [func(window, axis=self._axis_num, **kwargs)
+                   for _, window in self]
+
+        if not isinstance(windows[0], type(self.obj)):
+            # some functions don't return DataArrays (e.g. np.median)
+            dims = list(self.obj.dims)
+            dims.remove(dims[self._axis_num])
+            windows = [DataArray(window, dims=dims) for window in windows]
+
+        result = concat(windows, dim=self.window_labels)
+
+        result = result.where(self._valid_windows)
+
+        if self.center:
+            result = self._center_result(result)
+
+        return result
+
+
+class DataArrayRolling(Rolling, ImplementsRollingArrayReduce):
+    """Rolling object for DataArrays"""
+
+
+inject_bottleneck_rolling_methods(DataArrayRolling)
diff --git a/xarray/test/__init__.py b/xarray/test/__init__.py
index 14d28733010..c43c7208504 100644
--- a/xarray/test/__init__.py
+++ b/xarray/test/__init__.py
@@ -63,6 +63,13 @@
     has_matplotlib = False
 
 
+try:
+    import bottleneck
+    has_bottleneck = True
+except ImportError:
+    has_bottleneck = False
+
+
 def requires_scipy(test):
     return test if has_scipy else unittest.skip('requires scipy')(test)
 
@@ -96,6 +103,10 @@ def requires_matplotlib(test):
     return test if has_matplotlib else unittest.skip('requires matplotlib')(test)
 
 
+def requires_bottleneck(test):
+    return test if has_bottleneck else unittest.skip('requires bottleneck')(test)
+
+
 def incompatible_2_6(test):
     """
     Test won't work in Python 2.6
diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py
index 7236f71839c..6cb83873213 100644
--- a/xarray/test/test_dataarray.py
+++ b/xarray/test/test_dataarray.py
@@ -7,7 +7,9 @@
 from xarray import (align, broadcast, Dataset, DataArray,
                     Coordinate, Variable)
 from xarray.core.pycompat import iteritems, OrderedDict
-from . import TestCase, ReturnItem, source_ndarray, unittest, requires_dask
+from xarray.core.common import _full_like
+from . import (TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
+               requires_bottleneck)
 
 
 class TestDataArray(TestCase):
@@ -1242,6 +1244,118 @@ def test_groupby_first_and_last(self):
         expected = array  # should be a no-op
         self.assertDataArrayIdentical(expected, actual)
 
+    def make_rolling_example_array(self):
+        times = pd.date_range('2000-01-01', freq='1D', periods=21)
+        values = np.random.random((21, 4))
+        da = DataArray(values, dims=('time', 'x'))
+        da['time'] = times
+
+        return da
+
+    def test_rolling_iter(self):
+        da = self.make_rolling_example_array()
+
+        rolling_obj = da.rolling(time=7)
+
+        self.assertEqual(len(rolling_obj.window_labels), len(da['time']))
+        self.assertDataArrayIdentical(rolling_obj.window_labels, da['time'])
+
+        for i, (label, window_da) in enumerate(rolling_obj):
+            self.assertEqual(label, da['time'].isel(time=i))
+
+    def test_rolling_properties(self):
+        da = self.make_rolling_example_array()
+        rolling_obj = da.rolling(time=4)
+
+        self.assertEqual(rolling_obj._axis_num, 0)
+
+        # catching invalid args
+        with self.assertRaisesRegexp(ValueError, 'exactly one dim/window should'):
+            da.rolling(time=7, x=2)
+        with self.assertRaisesRegexp(ValueError, 'window must be > 0'):
+            da.rolling(time=-2)
+
+    @requires_bottleneck
+    def test_rolling_wrapped_bottleneck(self):
+        import bottleneck as bn
+
+        da = self.make_rolling_example_array()
+
+        # Test all bottleneck functions
+        rolling_obj = da.rolling(time=7)
+        for name in ('sum', 'mean', 'std', 'min', 'max', 'median'):
+            func_name = 'move_{0}'.format(name)
+            actual = getattr(rolling_obj, name)()
+            expected = getattr(bn, func_name)(da.values, window=7, axis=0)
+            self.assertArrayEqual(actual.values, expected)
+
+        # Using min_periods
+        rolling_obj = da.rolling(time=7, min_periods=1)
+        for name in ('sum', 'mean', 'std', 'min', 'max'):
+            func_name = 'move_{0}'.format(name)
+            actual = getattr(rolling_obj, name)()
+            expected = getattr(bn, func_name)(da.values, window=7, axis=0,
+                                              min_count=1)
+            self.assertArrayEqual(actual.values, expected)
+
+        # Using center=False
+        rolling_obj = da.rolling(time=7, center=False)
+        for name in ('sum', 'mean', 'std', 'min', 'max', 'median'):
+            actual = getattr(rolling_obj, name)()['time']
+            self.assertDataArrayEqual(actual, da['time'])
+
+        # Using center=True
+        rolling_obj = da.rolling(time=7, center=True)
+        for name in ('sum', 'mean', 'std', 'min', 'max', 'median'):
+            actual = getattr(rolling_obj, name)()['time']
+            self.assertDataArrayEqual(actual, da['time'])
+
+        # catching invalid args
+        with self.assertRaisesRegexp(ValueError, 'Rolling.median does not'):
+            da.rolling(time=7, min_periods=1).median()
+
+    def test_rolling_pandas_compat(self):
+        s = pd.Series(range(10))
+        da = DataArray.from_series(s)
+
+        for center in (False, True):
+            for window in [1, 2, 3, 4]:
+                for min_periods in [None, 0, 1, 2, 3]:
+                    if min_periods is not None and window < min_periods:
+                        min_periods = window
+                    s_rolling = pd.rolling_mean(s, window, center=center,
+                                                min_periods=min_periods)
+                    da_rolling = da.rolling(index=window, center=center,
+                                            min_periods=min_periods).mean()
+                    # pandas does some fancy stuff in the last position,
+                    # we're not going to do that yet!
+                    np.testing.assert_allclose(s_rolling.values[:-1],
+                                               da_rolling.values[:-1])
+                    np.testing.assert_allclose(s_rolling.index,
+                                               da_rolling['index'])
+
+    def test_rolling_reduce(self):
+        da = self.make_rolling_example_array()
+
+        for center in (False, True):
+            for window in [1, 2, 3, 4]:
+                for min_periods in [None, 0, 1, 2, 3]:
+                    if min_periods is not None and window < min_periods:
+                        min_periods = window
+                    # we can use this rolling object for all methods below
+                    rolling_obj = da.rolling(time=window, center=center,
+                                             min_periods=min_periods)
+
+                    # loop over a bunch of methods, skip std since bottleneck uses
+                    # a slightly different formulation and it results in roundoff
+                    # differences when compared to numpy
+                    for name in ('sum', 'mean', 'min', 'max', 'median'):
+                        # skip median when min_periods is set
+                        if not (name == 'median' and min_periods is not None):
+                            actual = rolling_obj.reduce(getattr(np, name))
+                            expected = getattr(rolling_obj, name)()
+                            self.assertDataArrayAllClose(actual, expected)
+
     def test_resample(self):
         times = pd.date_range('2000-01-01', freq='6H', periods=10)
         array = DataArray(np.arange(10), [('time', times)])
@@ -1643,3 +1757,37 @@ def test_setattr_raises(self):
             array.foo = 2
         with self.assertRaisesRegexp(AttributeError, 'cannot set attr'):
             array.other = 2
+
+    def test_full_like(self):
+        da = DataArray(np.random.random(size=(4, 4)), dims=('x', 'y'),
+                       attrs={'attr1': 'value1'})
+        actual = _full_like(da)
+
+        self.assertEqual(actual.dtype, da.dtype)
+        self.assertEqual(actual.shape, da.shape)
+        self.assertEqual(actual.dims, da.dims)
+        self.assertEqual(actual.attrs, {})
+
+        for name in da.coords:
+            self.assertArrayEqual(da[name], actual[name])
+            self.assertEqual(da[name].dtype, actual[name].dtype)
+
+        # keep attrs
+        actual = _full_like(da, keep_attrs=True)
+        self.assertEqual(actual.attrs, da.attrs)
+
+        # Fill value
+        actual = _full_like(da, fill_value=True)
+        self.assertEqual(actual.dtype, da.dtype)
+        self.assertEqual(actual.shape, da.shape)
+        self.assertEqual(actual.dims, da.dims)
+        np.testing.assert_equal(actual.values, np.nan)
+
+        actual = _full_like(da, fill_value=10)
+        self.assertEqual(actual.dtype, da.dtype)
+        np.testing.assert_equal(actual.values, 10)
+
+        # make sure filling with nans promotes integer type
+        actual = _full_like(DataArray([1, 2, 3]), fill_value=np.nan)
+        self.assertEqual(actual.dtype, np.float)
+        np.testing.assert_equal(actual.values, np.nan)
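For quick reference, a minimal usage sketch of the API this patch adds (illustrative only, not part of the diff itself; it assumes a build of xarray that includes this patch, plus numpy and pandas, and the variable names are made up):

    import numpy as np
    import pandas as pd
    import xarray as xr

    # a small DataArray with a time dimension, mirroring the test helper above
    times = pd.date_range('2000-01-01', freq='1D', periods=6)
    arr = xr.DataArray(np.arange(12.0).reshape(6, 2), dims=('time', 'x'))
    arr['time'] = times

    # rolling window of length 3 along 'time'; with min_periods=2 only the
    # first label has too few observations and comes back as NaN
    r = arr.rolling(time=3, min_periods=2)
    r.mean()            # bottleneck-backed moving mean
    r.reduce(np.std)    # generic reduction with any numpy aggregation

    # center=True places the result labels at the middle of each window
    arr.rolling(time=3, center=True).max()

The same pattern applies to the other injected methods (``sum``, ``min``, ``max``, ``median``), and iterating over ``r`` yields ``(label, window)`` pairs, as exercised in ``test_rolling_iter`` above.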