-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: quantile for ExtensionArray #39606
Changes from 7 commits
56a0120
a8dce2e
5d80780
4d2a16d
cebaf01
3b31e5a
5cc4df5
a618368
d09fc5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
|
||
from pandas.core.dtypes.common import is_list_like | ||
|
||
from pandas.core.nanops import nanpercentile | ||
|
||
|
||
def quantile_with_mask(
    values: np.ndarray,
    mask: np.ndarray,
    fill_value,
    qs,
    interpolation: str,
    axis: int,
) -> np.ndarray:
    """
    Compute the quantiles of the given values for each quantile in `qs`.

    Parameters
    ----------
    values : np.ndarray
        For ExtensionArray, this is _values_for_factorize()[0]
    mask : np.ndarray[bool]
        mask = isna(values)
        For ExtensionArray, this is computed before calling
        _values_for_factorize
    fill_value : Scalar
        The value used to fill in entries of the result that are NA.
        For ExtensionArray, this is _values_for_factorize()[1]
    qs : a scalar or list of the quantiles to be computed
    interpolation : str
        Type of interpolation
    axis : int
        Axis along which to compute quantiles.

    Returns
    -------
    np.ndarray
        Laid out as ``len(values) x len(qs)``.  If `qs` was passed as a
        scalar, the trailing length-1 axis is dropped.

    Notes
    -----
    Assumes values is already 2D.  For ExtensionArray this means np.atleast_2d
    has been called on _values_for_factorize()[0]
    """
    is_empty = values.shape[axis] == 0

    orig_scalar = not is_list_like(qs)
    if orig_scalar:
        # make list-like, unpack later
        qs = [qs]

    if is_empty:
        # Nothing to reduce over: every requested quantile is NA.
        # Build the 2D len(values) x len(qs) array of fill values directly.
        flat = np.array([fill_value] * len(qs))
        result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
    else:
        result = nanpercentile(
            values,
            np.array(qs) * 100,
            axis=axis,
            na_value=fill_value,
            mask=mask,
            ndim=values.ndim,
            interpolation=interpolation,
        )

        # np.asarray copies only when it has to.  np.array(..., copy=False)
        # means "never copy" under NumPy >= 2 and raises if a copy is needed.
        # The transpose brings the result to the same len(values) x len(qs)
        # layout that the empty branch produces.
        result = np.asarray(result).T

    if orig_scalar:
        # qs was wrapped in a length-1 list above; undo that here.
        assert result.shape[-1] == 1, result.shape
        result = result[..., 0]
        result = lib.item_from_zerodim(result)

    return result
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,6 +56,7 @@ | |
putmask_smart, | ||
putmask_without_repeat, | ||
) | ||
from pandas.core.array_algos.quantile import quantile_with_mask | ||
from pandas.core.array_algos.replace import ( | ||
compare_or_regex_search, | ||
replace_regex, | ||
|
@@ -79,7 +80,6 @@ | |
is_scalar_indexer, | ||
) | ||
import pandas.core.missing as missing | ||
from pandas.core.nanops import nanpercentile | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Float64Index, Index | ||
|
@@ -1413,31 +1413,11 @@ def quantile( | |
assert axis == 1 # only ever called this way | ||
assert is_list_like(qs) # caller is responsible for this | ||
|
||
values = self.get_values() | ||
|
||
is_empty = values.shape[axis] == 0 | ||
|
||
if is_empty: | ||
# create the array of na_values | ||
# 2d len(values) * len(qs) | ||
result = np.repeat( | ||
np.array([self.fill_value] * len(qs)), len(values) | ||
).reshape(len(values), len(qs)) | ||
else: | ||
# asarray needed for Sparse, see GH#24600 | ||
mask = np.asarray(isna(values)) | ||
result = nanpercentile( | ||
values, | ||
np.array(qs) * 100, | ||
axis=axis, | ||
na_value=self.fill_value, | ||
mask=mask, | ||
ndim=values.ndim, | ||
interpolation=interpolation, | ||
) | ||
fill_value = self.fill_value | ||
values = self.values | ||
mask = np.asarray(isna(values)) | ||
|
||
result = np.array(result, copy=False) | ||
result = result.T | ||
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) | ||
|
||
return make_block(result, placement=self.mgr_locs, ndim=2) | ||
|
||
|
@@ -1868,6 +1848,36 @@ def _unstack(self, unstacker, fill_value, new_placement): | |
] | ||
return blocks, mask | ||
|
||
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: | ||
# asarray needed for Sparse, see GH#24600 | ||
mask = np.asarray(isna(self.values)) | ||
mask = np.atleast_2d(mask) | ||
|
||
values, fill_value = self.values._values_for_factorize() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In general, (eg in geopandas, There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think quantile makes sense for any ordered types, and IIUC values_for_factorize is supposed to preserve ordering. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In theory I think that's not required for factorize (since we don't sort in factorization). In practice not sure if that matters though (can't think of a situation where it wouldn't be orderable). |
||
|
||
values = np.atleast_2d(values) | ||
|
||
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) | ||
ndim = np.ndim(result) | ||
|
||
if not is_sparse(self.dtype): | ||
# shape[0] should be 1 as long as EAs are 1D | ||
|
||
if result.ndim == 1: | ||
# i.e. qs was originally a scalar | ||
assert result.shape == (1,), result.shape | ||
result = type(self.values)._from_factorized(result, self.values) | ||
placement = np.arange(len(result)) | ||
|
||
else: | ||
assert result.shape == (1, len(qs)), result.shape | ||
result = type(self.values)._from_factorized(result[0], self.values) | ||
placement = [0] | ||
else: | ||
placement = np.arange(len(result)) | ||
|
||
return make_block(result, placement=placement, ndim=ndim) | ||
|
||
|
||
class HybridMixin: | ||
""" | ||
|
@@ -2186,22 +2196,6 @@ def fillna( | |
value, limit=limit, inplace=inplace, downcast=downcast | ||
) | ||
|
||
def quantile( | ||
self, qs: Float64Index, interpolation="linear", axis: int = 0 | ||
) -> Block: | ||
assert axis == 1 # only ever called this way | ||
naive = self.values.view("M8[ns]") | ||
|
||
# TODO(EA2D): kludge for 2D block with 1D values | ||
naive = naive.reshape(self.shape) | ||
|
||
blk = self.make_block(naive) | ||
res_blk = blk.quantile(qs, interpolation=interpolation, axis=axis) | ||
|
||
# TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like | ||
aware = self._holder(res_blk.values.ravel(), dtype=self.dtype) | ||
return self.make_block_same_class(aware, ndim=res_blk.ndim) | ||
|
||
def _check_ndim(self, values, ndim): | ||
""" | ||
ndim inference and validation. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -533,3 +533,105 @@ def test_quantile_item_cache(self): | |
ser.values[0] = 99 | ||
|
||
assert df.iloc[0, 0] == df["A"][0] | ||
|
||
|
||
class TestQuantileExtensionDtype: | ||
# TODO: tests for axis=1? | ||
# TODO: empty case? might as well do dt64 and td64 here too | ||
|
||
@pytest.fixture( | ||
params=[ | ||
pytest.param( | ||
pd.IntervalIndex.from_breaks(range(10)), | ||
marks=pytest.mark.xfail(reason="raises when trying to add Intervals"), | ||
), | ||
pd.period_range("2016-01-01", periods=9, freq="D"), | ||
pd.date_range("2016-01-01", periods=9, tz="US/Pacific"), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add eg |
||
], | ||
ids=lambda x: str(x.dtype), | ||
) | ||
def index(self, request): | ||
idx = request.param | ||
idx.name = "A" | ||
return idx | ||
|
||
def compute_quantile(self, obj, qs): | ||
if isinstance(obj, Series): | ||
result = obj.quantile(qs) | ||
else: | ||
result = obj.quantile(qs, numeric_only=False) | ||
return result | ||
|
||
def test_quantile_ea(self, index, frame_or_series): | ||
obj = frame_or_series(index).copy() | ||
|
||
# result should be invariant to shuffling | ||
indexer = np.arange(len(index), dtype=np.intp) | ||
np.random.shuffle(indexer) | ||
obj = obj.iloc[indexer] | ||
|
||
qs = [0.5, 0, 1] | ||
result = self.compute_quantile(obj, qs) | ||
|
||
# expected here assumes len(index) == 9 | ||
expected = Series([index[4], index[0], index[-1]], index=qs, name="A") | ||
expected = frame_or_series(expected) | ||
|
||
tm.assert_equal(result, expected) | ||
|
||
def test_quantile_ea_with_na(self, index, frame_or_series): | ||
obj = frame_or_series(index).copy() | ||
|
||
obj.iloc[0] = index._na_value | ||
obj.iloc[-1] = index._na_value | ||
|
||
# result should be invariant to shuffling | ||
indexer = np.arange(len(index), dtype=np.intp) | ||
np.random.shuffle(indexer) | ||
obj = obj.iloc[indexer] | ||
|
||
qs = [0.5, 0, 1] | ||
result = self.compute_quantile(obj, qs) | ||
|
||
# expected here assumes len(index) == 9 | ||
expected = Series([index[4], index[1], index[-2]], index=qs, name="A") | ||
expected = frame_or_series(expected) | ||
tm.assert_equal(result, expected) | ||
|
||
def test_quantile_ea_all_na(self, index, frame_or_series): | ||
|
||
obj = frame_or_series(index).copy() | ||
|
||
obj.iloc[:] = index._na_value | ||
|
||
# result should be invariant to shuffling | ||
indexer = np.arange(len(index), dtype=np.intp) | ||
np.random.shuffle(indexer) | ||
obj = obj.iloc[indexer] | ||
|
||
qs = [0.5, 0, 1] | ||
result = self.compute_quantile(obj, qs) | ||
|
||
expected = index.take([-1, -1, -1], allow_fill=True, fill_value=index._na_value) | ||
expected = Series(expected, index=qs) | ||
expected = frame_or_series(expected) | ||
tm.assert_equal(result, expected) | ||
|
||
def test_quantile_ea_scalar(self, index, frame_or_series): | ||
# scalar qs | ||
obj = frame_or_series(index).copy() | ||
|
||
# result should be invariant to shuffling | ||
indexer = np.arange(len(index), dtype=np.intp) | ||
np.random.shuffle(indexer) | ||
obj = obj.iloc[indexer] | ||
|
||
qs = 0.5 | ||
result = self.compute_quantile(obj, qs) | ||
|
||
expected = Series({"A": index[4]}, name=0.5) | ||
if frame_or_series is Series: | ||
expected = expected["A"] | ||
assert result == expected | ||
else: | ||
tm.assert_series_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also call this `na_value`? (like nanpercentile does) (I know we use both fill_value and na_value in many places, and somewhat interchangeably, but here I personally find na_value clearer)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about this and decided on fill_value on the theory that "na_value" means "the value that, when we see it, indicates we have an NA" and "fill_value" means "the value that we use when we need to fill in an NA value".
e.g. we get here with fill_value=iNaT, which would be weird to have as an na_value