Add quantile method to GroupBy (pydata#2828)

* implement groupby.quantile + tests
* added quantile method in whats-new
* mark additional test as xfail.
* lint fix
* simpler version of groupby.quantile
* added quantile methods to api.rst
* included DEFAULT_DIMS handling in quantile method
* clarified groupby tests
* added test with more typical use case
* pep8
* removed failing test
dcherian · Jun 24, 2019 · b054c31 · b054c31
1 parent cfd8210
commit b054c31
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 3 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -190,6 +190,7 @@ Computation
 :py:attr:`~core.groupby.DatasetGroupBy.last`
 :py:attr:`~core.groupby.DatasetGroupBy.fillna`
 :py:attr:`~core.groupby.DatasetGroupBy.where`
+:py:attr:`~core.groupby.DatasetGroupBy.quantile`
 
 Reshaping and reorganizing
 --------------------------
@@ -362,7 +363,7 @@ Computation
 :py:attr:`~core.groupby.DataArrayGroupBy.last`
 :py:attr:`~core.groupby.DataArrayGroupBy.fillna`
 :py:attr:`~core.groupby.DataArrayGroupBy.where`
-
+:py:attr:`~core.groupby.DataArrayGroupBy.quantile`
 
 Reshaping and reorganizing
 --------------------------

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -21,7 +21,8 @@ v0.12.2 (unreleased)
 Enhancements
 ~~~~~~~~~~~~
 
-
+- New :py:meth:`~xarray.GroupBy.quantile` method. (:issue:`3018`)
+  By `David Huard <https://github.com/huard>`_.
 - Add ``keepdims`` argument for reduce operations (:issue:`2170`)
   By `Scott Wales <https://github.com/ScottWales>`_.
 - netCDF chunksizes are now only dropped when original_shape is different,
@@ -90,7 +91,7 @@ Bug fixes
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 - Fixed performance issues with cftime installed (:issue:`3000`)
   By `0x0L <https://github.com/0x0L>`_.
-- Replace incorrect usages of `message` in pytest assertions 
+- Replace incorrect usages of `message` in pytest assertions
   with `match` (:issue:`3011`)
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 - Add explicit pytest markers, now required by pytest

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -595,6 +595,64 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False):
         combined = self._maybe_unstack(combined)
         return combined
 
+    def quantile(self, q, dim=None, interpolation='linear', keep_attrs=None):
+        """Compute the qth quantile over each array in the groups and
+        concatenate them together into a new array.
+
+        Parameters
+        ----------
+        q : float in range of [0,1] (or sequence of floats)
+            Quantile to compute, which must be between 0 and 1 inclusive.
+        dim : str or sequence of str, optional
+            Dimension(s) over which to apply quantile. If not given,
+            each group is currently reduced over all of its dimensions
+            (see the TODO below about the grouped dimension).
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            Interpolation method to use when the desired quantile lies
+            between two data points ``i < j``:
+                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
+                  the fractional part of the index surrounded by ``i`` and
+                  ``j``.
+                * lower: ``i``.
+                * higher: ``j``.
+                * nearest: ``i`` or ``j``, whichever is nearest.
+                * midpoint: ``(i + j) / 2``.
+        keep_attrs : optional
+            Forwarded unchanged to the per-group ``quantile`` call.
+
+        Returns
+        -------
+        quantiles : DataArray or Dataset
+            Per-group quantiles combined into a single object. For a
+            scalar `q` the ``quantile`` coordinate is dropped; for a
+            sequence the result gains a ``quantile`` dimension. Other
+            dimensions are those that remain after the reduction.
+
+        See Also
+        --------
+        numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
+        DataArray.quantile
+        """
+        if dim == DEFAULT_DIMS:
+            dim = ALL_DIMS
+            # TODO change this to dim = self._group_dim after
+            # the deprecation process
+            if self._obj.ndim > 1:
+                warnings.warn(
+                    "Default reduction dimension will be changed to the "
+                    "grouped dimension in a future version of xarray. To "
+                    "silence this warning, pass dim=xarray.ALL_DIMS "
+                    "explicitly.",
+                    FutureWarning, stacklevel=2)
+
+        out = self.apply(self._obj.__class__.quantile, shortcut=False,
+                         q=q, dim=dim, interpolation=interpolation,
+                         keep_attrs=keep_attrs)
+
+        if np.asarray(q, dtype=np.float64).ndim == 0:  # scalar q
+            out = out.drop('quantile')
+        return out
+
     def reduce(self, func, dim=None, axis=None, keep_attrs=None,
                shortcut=True, **kwargs):
         """Reduce the items in this group by applying `func` along some

diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -105,4 +105,64 @@ def func(arg1, arg2, arg3=0):
     assert_identical(expected, actual)
 
 
+def test_da_groupby_quantile():
+    """GroupBy.quantile on a DataArray matches hand-computed values."""
+    array = xr.DataArray([1, 2, 3, 4, 5, 6],
+                         [('x', [1, 1, 1, 2, 2, 2])])
+
+    # Scalar quantile: one value per group, no 'quantile' coordinate
+    expected = xr.DataArray([2, 5], [('x', [1, 2])])
+    actual = array.groupby('x').quantile(.5)
+    assert_identical(expected, actual)
+
+    # Vector quantile: the result gains a 'quantile' dimension
+    expected = xr.DataArray([[1, 3], [4, 6]],
+                            [('x', [1, 2]), ('quantile', [0, 1])])
+    actual = array.groupby('x').quantile([0, 1])
+    assert_identical(expected, actual)
+
+    # Multiple dimensions
+    array = xr.DataArray([[1, 11, 26], [2, 12, 22], [3, 13, 23],
+                          [4, 16, 24], [5, 15, 25]],
+                         [('x', [1, 1, 1, 2, 2],),
+                          ('y', [0, 0, 1])])
+    # Without dim, each group is reduced over all of its dimensions
+    actual_x = array.groupby('x').quantile(0)
+    expected_x = xr.DataArray([1, 4],
+                              [('x', [1, 2]), ])
+    assert_identical(expected_x, actual_x)
+
+    actual_y = array.groupby('y').quantile(0)
+    expected_y = xr.DataArray([1, 22],
+                              [('y', [0, 1]), ])
+    assert_identical(expected_y, actual_y)
+    # With dim set to the grouped dimension, the other dimension is kept
+    actual_xx = array.groupby('x').quantile(0, dim='x')
+    expected_xx = xr.DataArray([[1, 11, 22], [4, 15, 24]],
+                               [('x', [1, 2]), ('y', [0, 0, 1])])
+    assert_identical(expected_xx, actual_xx)
+
+    actual_yy = array.groupby('y').quantile(0, dim='y')
+    expected_yy = xr.DataArray([[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
+                               [('x', [1, 1, 1, 2, 2]), ('y', [0, 1])])
+    assert_identical(expected_yy, actual_yy)
+    # More typical use: group a daily time series by calendar month
+    times = pd.date_range('2000-01-01', periods=365)
+    x = [0, 1]
+    foo = xr.DataArray(np.reshape(np.arange(365 * 2), (365, 2)),
+                       coords=dict(time=times, x=x), dims=('time', 'x'))
+    g = foo.groupby(foo.time.dt.month)
+
+    actual = g.quantile(0)
+    expected = xr.DataArray([0., 62., 120., 182., 242., 304.,
+                             364., 426., 488., 548., 610., 670.],
+                            [('month', np.arange(1, 13))])
+    assert_identical(expected, actual)
+
+    actual = g.quantile(0, dim='time')[:2]
+    expected = xr.DataArray([[0., 1], [62., 63]],
+                            [('month', [1, 2]), ('x', [0, 1])])
+    assert_identical(expected, actual)
+
+
 # TODO: move other groupby tests from test_dataset and test_dataarray over here