From 56a01204d13bc785f8dc8f62c8b58fda447d303e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Feb 2021 20:47:09 -0800 Subject: [PATCH 1/6] BUG: quantile for ExtensionArray --- pandas/core/array_algos/quantile.py | 76 +++++++++++++++++ pandas/core/arrays/datetimelike.py | 3 +- pandas/core/internals/blocks.py | 93 ++++++++++----------- pandas/tests/frame/methods/test_quantile.py | 79 +++++++++++++++++ 4 files changed, 199 insertions(+), 52 deletions(-) create mode 100644 pandas/core/array_algos/quantile.py diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py new file mode 100644 index 0000000000000..6106a275440b1 --- /dev/null +++ b/pandas/core/array_algos/quantile.py @@ -0,0 +1,76 @@ +from typing import Sequence, Union + +import numpy as np + +from pandas._libs import lib + +from pandas.core.dtypes.common import is_list_like + +from pandas.core.nanops import nanpercentile + + +def quantile_with_mask( + values: np.ndarray, + mask: np.ndarray, + fill_value, + qs: Union[float, Sequence[float]], + interpolation: str, + axis: int, +) -> np.ndarray: + """ + Compute the quantiles of the given values for each quantile in `qs`. + + + Parameters + ---------- + values : np.ndarray + For ExtensionArray, this is _values_for_factorize()[0] + mask : np.ndarray[bool] + mask = isna(values) + For ExtensionArray, this is computed before calling _value_for_factorize + fill_value : Scalar + The value to interpret fill NA entries with + For ExtensionArray, this is _values_for_factorize()[1] + qs : a scalar or list of the quantiles to be computed + interpolation : str + Type of interpolation + axis : int + Axis along which to compute quantiles. + + Notes + ----- + Assumes values is already 2D. For ExtensionArray this means np.atleast_2d + has been called on _values_for_factorize()[0] + """ + is_empty = values.shape[axis] == 0 + orig_scalar = not is_list_like(qs) + if orig_scalar: + # make list-like, unpack later + qs = [qs] + + if is_empty: + # create the array of na_values + # 2d len(values) * len(qs) + flat = np.array([fill_value] * len(qs)) + result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) + else: + # asarray needed for Sparse, see GH#24600 + result = nanpercentile( + values, + np.array(qs) * 100, + axis=axis, + na_value=fill_value, + mask=mask, + ndim=values.ndim, + interpolation=interpolation, + ) + + result = np.array(result, copy=False) + result = result.T + + if orig_scalar: + assert result.shape[-1] == 1, result.shape + result = result[..., 0] + result = lib.item_from_zerodim(result) + + return result diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1032559766ada..78ad661b34493 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -425,7 +425,8 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: return new_obj def _values_for_factorize(self): - return self._ndarray, iNaT + # int64 instead of int ensures we have a "view" method + return self._ndarray, np.int64(iNaT) @classmethod def _from_factorized( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9314666acdaad..5597c4fa561c3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -56,6 +56,7 @@ putmask_smart, putmask_without_repeat, ) +from pandas.core.array_algos.quantile import quantile_with_mask from pandas.core.array_algos.replace import ( compare_or_regex_search, replace_regex, @@ -79,7 +80,6 @@ is_scalar_indexer, ) import pandas.core.missing as missing -from pandas.core.nanops import nanpercentile if TYPE_CHECKING: from pandas import Index @@ -1390,8 +1390,10 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: Parameters ---------- qs: a scalar or list of the quantiles to be computed - interpolation: type of interpolation, default 'linear' - axis: axis to compute, default 0 + interpolation : str, default "linear" + Type of interpolation + axis : int, default 0 + Axis along which to compute quantiles. Returns ------- @@ -1400,44 +1402,16 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: # We should always have ndim == 2 because Series dispatches to DataFrame assert self.ndim == 2 - values = self.get_values() - - is_empty = values.shape[axis] == 0 - orig_scalar = not is_list_like(qs) - if orig_scalar: - # make list-like, unpack later - qs = [qs] - - if is_empty: - # create the array of na_values - # 2d len(values) * len(qs) - result = np.repeat( - np.array([self.fill_value] * len(qs)), len(values) - ).reshape(len(values), len(qs)) - else: - # asarray needed for Sparse, see GH#24600 - mask = np.asarray(isna(values)) - result = nanpercentile( - values, - np.array(qs) * 100, - axis=axis, - na_value=self.fill_value, - mask=mask, - ndim=values.ndim, - interpolation=interpolation, - ) + fill_value = self.fill_value + values = self.values + mask = np.asarray(isna(values)) - result = np.array(result, copy=False) - result = result.T + result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) + ndim = np.ndim(result) - if orig_scalar and not lib.is_scalar(result): - # result could be scalar in case with is_empty and self.ndim == 1 - assert result.shape[-1] == 1, result.shape - result = result[..., 0] - result = lib.item_from_zerodim(result) + placement = np.arange(len(result)) - ndim = np.ndim(result) - return make_block(result, placement=np.arange(len(result)), ndim=ndim) + return make_block(result, placement=placement, ndim=ndim) def _replace_coerce( self, @@ -1866,6 +1840,36 @@ def _unstack(self, unstacker, fill_value, new_placement): ] return blocks, mask + def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: + # asarray needed for Sparse, see GH#24600 + mask = np.asarray(isna(self.values)) + mask = np.atleast_2d(mask) + + values, fill_value = self.values._values_for_factorize() + + values = np.atleast_2d(values) + + result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) + ndim = np.ndim(result) + + if not is_sparse(self.dtype): + # shape[0] should be 1 as long as EAs are 1D + + if result.ndim == 1: + # i.e. qs was originally a scalar + assert result.shape == (1,), result.shape + result = type(self.values)._from_factorized(result, self.values) + placement = np.arange(len(result)) + + else: + assert result.shape == (1, len(qs)), result.shape + result = type(self.values)._from_factorized(result[0], self.values) + placement = [0] + else: + placement = np.arange(len(result)) + + return make_block(result, placement=placement, ndim=ndim) + class HybridMixin: """ @@ -2184,19 +2188,6 @@ def fillna( value, limit=limit, inplace=inplace, downcast=downcast ) - def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: - naive = self.values.view("M8[ns]") - - # TODO(EA2D): kludge for 2D block with 1D values - naive = naive.reshape(self.shape) - - blk = self.make_block(naive) - res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) - - # TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like - aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) - return self.make_block_same_class(aware, ndim=res_blk.ndim) - def _check_ndim(self, values, ndim): """ ndim inference and validation. diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 3f7f2e51add96..cee34648d5e2c 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -78,6 +78,85 @@ def test_quantile(self, datetime_frame): expected = Series([3.0, 4.0], index=[0, 1], name=0.5) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("as_dt64tz", [True, False]) + def test_quantile_period(self, frame_or_series, as_dt64tz): + pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") + if as_dt64tz: + pi = pi.to_timestamp("S").tz_localize("US/Central") + + obj = frame_or_series(pi) + + qs = [0.5, 0, 1] + if frame_or_series is Series: + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + + expected = Series([pi[4], pi[0], pi[-1]], index=qs, name="A") + expected = frame_or_series(expected) + + tm.assert_equal(result, expected) + + # TODO: tests for axis=1? + # TODO: empty case? might as well do dt64 and td64 here too + @pytest.mark.parametrize("as_dt64tz", [True, False]) + def test_quantile_period_with_nat(self, frame_or_series, as_dt64tz): + pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") + if as_dt64tz: + pi = pi.to_timestamp("S").tz_localize("US/Central") + + obj = frame_or_series(pi) + + obj.iloc[0] = pd.NaT + obj.iloc[-1] = pd.NaT + + qs = [0.5, 0, 1] + if frame_or_series is Series: + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + + expected = Series([pi[4], pi[1], pi[-2]], index=qs, name="A") + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("as_dt64tz", [True, False]) + def test_quantile_period_all_nat(self, frame_or_series, as_dt64tz): + pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") + if as_dt64tz: + pi = pi.to_timestamp("S").tz_localize("US/Central") + + obj = frame_or_series(pi) + obj.iloc[:] = pd.NaT + + qs = [0.5, 0, 1] + if frame_or_series is Series: + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + + expected = Series([pd.NaT, pd.NaT, pd.NaT], dtype=pi.dtype, index=qs, name="A") + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + def test_quantile_period_scalar(self, frame_or_series): + # scalar qs + pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") + obj = frame_or_series(pi) + + qs = 0.5 + if frame_or_series is Series: + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + + expected = Series({"A": pi[4]}, name=0.5) + if frame_or_series is Series: + expected = expected["A"] + assert result == expected + else: + tm.assert_series_equal(result, expected) + def test_quantile_date_range(self): # GH 2460 From a8dce2ed30da2448b41737827ea8a7df6a368c6b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Feb 2021 20:49:34 -0800 Subject: [PATCH 2/6] docstring fixup --- pandas/core/array_algos/quantile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 6106a275440b1..6580350e6b037 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -20,7 +20,6 @@ def quantile_with_mask( """ Compute the quantiles of the given values for each quantile in `qs`. - Parameters ---------- values : np.ndarray @@ -37,6 +36,10 @@ def quantile_with_mask( axis : int Axis along which to compute quantiles. + Returns + ------- + np.ndarray + Notes ----- Assumes values is already 2D. For ExtensionArray this means np.atleast_2d From 4d2a16d2c0678d73d69c2ab1c1cd6aea87498d81 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Feb 2021 09:45:27 -0800 Subject: [PATCH 3/6] mypy fixup --- pandas/core/array_algos/quantile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 6580350e6b037..8d4dd7be28839 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -1,5 +1,3 @@ -from typing import Sequence, Union - import numpy as np from pandas._libs import lib @@ -13,7 +11,7 @@ def quantile_with_mask( values: np.ndarray, mask: np.ndarray, fill_value, - qs: Union[float, Sequence[float]], + qs, interpolation: str, axis: int, ) -> np.ndarray: From cebaf0148ad41b55d385c16c8cced1b0cf4ac240 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Feb 2021 10:15:26 -0800 Subject: [PATCH 4/6] TST: fixturize ea quantile tests --- pandas/tests/frame/methods/test_quantile.py | 176 +++++++++++--------- 1 file changed, 97 insertions(+), 79 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index cee34648d5e2c..58476a30cf960 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -78,85 +78,6 @@ def test_quantile(self, datetime_frame): expected = Series([3.0, 4.0], index=[0, 1], name=0.5) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("as_dt64tz", [True, False]) - def test_quantile_period(self, frame_or_series, as_dt64tz): - pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") - if as_dt64tz: - pi = pi.to_timestamp("S").tz_localize("US/Central") - - obj = frame_or_series(pi) - - qs = [0.5, 0, 1] - if frame_or_series is Series: - result = obj.quantile(qs) - else: - result = obj.quantile(qs, numeric_only=False) - - expected = Series([pi[4], pi[0], pi[-1]], index=qs, name="A") - expected = frame_or_series(expected) - - tm.assert_equal(result, expected) - - # TODO: tests for axis=1? - # TODO: empty case? might as well do dt64 and td64 here too - @pytest.mark.parametrize("as_dt64tz", [True, False]) - def test_quantile_period_with_nat(self, frame_or_series, as_dt64tz): - pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") - if as_dt64tz: - pi = pi.to_timestamp("S").tz_localize("US/Central") - - obj = frame_or_series(pi) - - obj.iloc[0] = pd.NaT - obj.iloc[-1] = pd.NaT - - qs = [0.5, 0, 1] - if frame_or_series is Series: - result = obj.quantile(qs) - else: - result = obj.quantile(qs, numeric_only=False) - - expected = Series([pi[4], pi[1], pi[-2]], index=qs, name="A") - expected = frame_or_series(expected) - tm.assert_equal(result, expected) - - @pytest.mark.parametrize("as_dt64tz", [True, False]) - def test_quantile_period_all_nat(self, frame_or_series, as_dt64tz): - pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") - if as_dt64tz: - pi = pi.to_timestamp("S").tz_localize("US/Central") - - obj = frame_or_series(pi) - obj.iloc[:] = pd.NaT - - qs = [0.5, 0, 1] - if frame_or_series is Series: - result = obj.quantile(qs) - else: - result = obj.quantile(qs, numeric_only=False) - - expected = Series([pd.NaT, pd.NaT, pd.NaT], dtype=pi.dtype, index=qs, name="A") - expected = frame_or_series(expected) - tm.assert_equal(result, expected) - - def test_quantile_period_scalar(self, frame_or_series): - # scalar qs - pi = pd.period_range("2016-01-01", periods=9, freq="D", name="A") - obj = frame_or_series(pi) - - qs = 0.5 - if frame_or_series is Series: - result = obj.quantile(qs) - else: - result = obj.quantile(qs, numeric_only=False) - - expected = Series({"A": pi[4]}, name=0.5) - if frame_or_series is Series: - expected = expected["A"] - assert result == expected - else: - tm.assert_series_equal(result, expected) - def test_quantile_date_range(self): # GH 2460 @@ -612,3 +533,100 @@ def test_quantile_item_cache(self): ser.values[0] = 99 assert df.iloc[0, 0] == df["A"][0] + + +class TestQuantileExtensionDtype: + # TODO: tests for axis=1? + # TODO: empty case? might as well do dt64 and td64 here too + + @pytest.fixture( + params=[ + pytest.param( + pd.IntervalIndex.from_breaks(range(10)), + marks=pytest.mark.xfail(reason="raises when trying to add Intervals"), + ), + pd.period_range("2016-01-01", periods=9, freq="D"), + pd.date_range("2016-01-01", periods=9, tz="US/Pacific"), + ], + ids=lambda x: str(x.dtype), + ) + def index(self, request): + idx = request.param + idx.name = "A" + return idx + + def compute_quantile(self, obj, qs): + if isinstance(obj, Series): + result = obj.quantile(qs) + else: + result = obj.quantile(qs, numeric_only=False) + return result + + def test_quantile_ea(self, index, frame_or_series): + obj = frame_or_series(index).copy() + + # shuffle our values + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + # expected here assumes len(index) == 9 + expected = Series([index[4], index[0], index[-1]], index=qs, name="A") + expected = frame_or_series(expected) + + tm.assert_equal(result, expected) + + def test_quantile_ea_with_na(self, index, frame_or_series): + obj = frame_or_series(index).copy() + + obj.iloc[0] = index._na_value + obj.iloc[-1] = index._na_value + + # shuffle our values + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + # expected here assumes len(index) == 9 + expected = Series([index[4], index[1], index[-2]], index=qs, name="A") + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_all_na(self, index, frame_or_series): + + obj = frame_or_series(index).copy() + + obj.iloc[:] = index._na_value + + # shuffle our values + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + + qs = [0.5, 0, 1] + result = self.compute_quantile(obj, qs) + + expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) + expected = Series(expected, index=qs) + expected = frame_or_series(expected) + tm.assert_equal(result, expected) + + def test_quantile_ea_scalar(self, index, frame_or_series): + # scalar qs + obj = frame_or_series(index).copy() + + qs = 0.5 + result = self.compute_quantile(obj, qs) + + expected = Series({"A": index[4]}, name=0.5) + if frame_or_series is Series: + expected = expected["A"] + assert result == expected + else: + tm.assert_series_equal(result, expected) From 3b31e5a26a40d38568d9f38975f2cbf4b6601faf Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Feb 2021 10:16:44 -0800 Subject: [PATCH 5/6] shuffle --- pandas/tests/frame/methods/test_quantile.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 58476a30cf960..c9ceb0baa8ecb 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -565,7 +565,7 @@ def compute_quantile(self, obj, qs): def test_quantile_ea(self, index, frame_or_series): obj = frame_or_series(index).copy() - # shuffle our values + # result should be invariant to shuffling indexer = np.arange(len(index), dtype=np.intp) np.random.shuffle(indexer) obj = obj.iloc[indexer] @@ -585,7 +585,7 @@ def test_quantile_ea_with_na(self, index, frame_or_series): obj.iloc[0] = index._na_value obj.iloc[-1] = index._na_value - # shuffle our values + # result should be invariant to shuffling indexer = np.arange(len(index), dtype=np.intp) np.random.shuffle(indexer) obj = obj.iloc[indexer] @@ -604,7 +604,7 @@ def test_quantile_ea_all_na(self, index, frame_or_series): obj.iloc[:] = index._na_value - # shuffle our values + # result should be invariant to shuffling indexer = np.arange(len(index), dtype=np.intp) np.random.shuffle(indexer) obj = obj.iloc[indexer] @@ -621,6 +621,11 @@ def test_quantile_ea_scalar(self, index, frame_or_series): # scalar qs obj = frame_or_series(index).copy() + # result should be invariant to shuffling + indexer = np.arange(len(index), dtype=np.intp) + np.random.shuffle(indexer) + obj = obj.iloc[indexer] + qs = 0.5 result = self.compute_quantile(obj, qs) From d09fc5e4e24e58214041b7fa81cd2d939bb1afd8 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Feb 2021 14:59:26 -0800 Subject: [PATCH 6/6] tests for FloatingArray, IntegerArray --- pandas/core/internals/blocks.py | 18 +++--------------- pandas/tests/frame/methods/test_quantile.py | 8 ++++++++ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4d471a006bad2..b38f3f3d7a87e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1850,25 +1850,13 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: values = np.atleast_2d(values) result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) - ndim = np.ndim(result) if not is_sparse(self.dtype): # shape[0] should be 1 as long as EAs are 1D + assert result.shape == (1, len(qs)), result.shape + result = type(self.values)._from_factorized(result[0], self.values) - if result.ndim == 1: - # i.e. qs was originally a scalar - assert result.shape == (1,), result.shape - result = type(self.values)._from_factorized(result, self.values) - placement = np.arange(len(result)) - - else: - assert result.shape == (1, len(qs)), result.shape - result = type(self.values)._from_factorized(result[0], self.values) - placement = [0] - else: - placement = np.arange(len(result)) - - return make_block(result, placement=placement, ndim=ndim) + return make_block(result, placement=self.mgr_locs, ndim=2) class HybridMixin: diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index c9ceb0baa8ecb..6d6016df52238 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -547,6 +547,14 @@ class TestQuantileExtensionDtype: ), pd.period_range("2016-01-01", periods=9, freq="D"), pd.date_range("2016-01-01", periods=9, tz="US/Pacific"), + pytest.param( + pd.array(np.arange(9), dtype="Int64"), + marks=pytest.mark.xfail(reason="doesnt implement from_factorized"), + ), + pytest.param( + pd.array(np.arange(9), dtype="Float64"), + marks=pytest.mark.xfail(reason="doesnt implement from_factorized"), + ), ], ids=lambda x: str(x.dtype), )