-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: quantile for ExtensionArray #39606
Changes from 1 commit
56a0120
a8dce2e
5d80780
4d2a16d
cebaf01
3b31e5a
5cc4df5
a618368
d09fc5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import Sequence, Union | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib | ||
|
||
from pandas.core.dtypes.common import is_list_like | ||
|
||
from pandas.core.nanops import nanpercentile | ||
|
||
|
||
def quantile_with_mask(
    values: np.ndarray,
    mask: np.ndarray,
    fill_value,
    qs: Union[float, Sequence[float]],
    interpolation: str,
    axis: int,
) -> np.ndarray:
    """
    Compute the quantiles of the given values for each quantile in `qs`.

    Parameters
    ----------
    values : np.ndarray
        For ExtensionArray, this is _values_for_factorize()[0]
    mask : np.ndarray[bool]
        mask = isna(values)
        For ExtensionArray, this is computed before calling
        _values_for_factorize
    fill_value : Scalar
        The value used to fill NA entries.
        For ExtensionArray, this is _values_for_factorize()[1]
    qs : a scalar or list of the quantiles to be computed
    interpolation : str
        Type of interpolation
    axis : int
        Axis along which to compute quantiles.

    Returns
    -------
    np.ndarray
        The transposed quantile result. When `qs` was passed as a scalar the
        trailing length-1 axis is dropped, and a zero-dimensional result is
        unwrapped via ``lib.item_from_zerodim``.

    Notes
    -----
    Assumes values is already 2D.  For ExtensionArray this means np.atleast_2d
    has been called on _values_for_factorize()[0]
    """
    is_empty = values.shape[axis] == 0
    orig_scalar = not is_list_like(qs)
    if orig_scalar:
        # make list-like, unpack later
        qs = [qs]

    if is_empty:
        # create the array of na_values
        # 2d len(values) * len(qs)
        # NOTE(review): this (len(values), len(qs)) array is transposed below
        # just like the non-empty result -- confirm that is the intended shape.
        flat = np.array([fill_value] * len(qs))
        result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
    else:
        # `mask` is supplied by the caller; quantiles are requested from
        # nanpercentile as percentages, hence the factor of 100.
        result = nanpercentile(
            values,
            np.array(qs) * 100,
            axis=axis,
            na_value=fill_value,
            mask=mask,
            ndim=values.ndim,
            interpolation=interpolation,
        )

    # np.asarray instead of np.array(..., copy=False): under NumPy >= 2.0
    # copy=False raises if a copy cannot be avoided, whereas asarray copies
    # only when needed on every NumPy version.
    result = np.asarray(result)
    result = result.T

    if orig_scalar:
        # qs was wrapped in a one-element list above; undo that here
        assert result.shape[-1] == 1, result.shape
        result = result[..., 0]
        result = lib.item_from_zerodim(result)

    return result
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,6 +56,7 @@ | |
putmask_smart, | ||
putmask_without_repeat, | ||
) | ||
from pandas.core.array_algos.quantile import quantile_with_mask | ||
from pandas.core.array_algos.replace import ( | ||
compare_or_regex_search, | ||
replace_regex, | ||
|
@@ -79,7 +80,6 @@ | |
is_scalar_indexer, | ||
) | ||
import pandas.core.missing as missing | ||
from pandas.core.nanops import nanpercentile | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Index | ||
|
@@ -1390,8 +1390,10 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: | |
Parameters | ||
---------- | ||
qs: a scalar or list of the quantiles to be computed | ||
interpolation: type of interpolation, default 'linear' | ||
axis: axis to compute, default 0 | ||
interpolation : str, default "linear" | ||
Type of interpolation | ||
axis : int, default 0 | ||
Axis along which to compute quantiles. | ||
|
||
Returns | ||
------- | ||
|
@@ -1400,44 +1402,16 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: | |
# We should always have ndim == 2 because Series dispatches to DataFrame | ||
assert self.ndim == 2 | ||
|
||
values = self.get_values() | ||
|
||
is_empty = values.shape[axis] == 0 | ||
orig_scalar = not is_list_like(qs) | ||
if orig_scalar: | ||
# make list-like, unpack later | ||
qs = [qs] | ||
|
||
if is_empty: | ||
# create the array of na_values | ||
# 2d len(values) * len(qs) | ||
result = np.repeat( | ||
np.array([self.fill_value] * len(qs)), len(values) | ||
).reshape(len(values), len(qs)) | ||
else: | ||
# asarray needed for Sparse, see GH#24600 | ||
mask = np.asarray(isna(values)) | ||
result = nanpercentile( | ||
values, | ||
np.array(qs) * 100, | ||
axis=axis, | ||
na_value=self.fill_value, | ||
mask=mask, | ||
ndim=values.ndim, | ||
interpolation=interpolation, | ||
) | ||
fill_value = self.fill_value | ||
values = self.values | ||
mask = np.asarray(isna(values)) | ||
|
||
result = np.array(result, copy=False) | ||
result = result.T | ||
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) | ||
ndim = np.ndim(result) | ||
|
||
if orig_scalar and not lib.is_scalar(result): | ||
# result could be scalar in case with is_empty and self.ndim == 1 | ||
assert result.shape[-1] == 1, result.shape | ||
result = result[..., 0] | ||
result = lib.item_from_zerodim(result) | ||
placement = np.arange(len(result)) | ||
|
||
ndim = np.ndim(result) | ||
return make_block(result, placement=np.arange(len(result)), ndim=ndim) | ||
return make_block(result, placement=placement, ndim=ndim) | ||
|
||
def _replace_coerce( | ||
self, | ||
|
@@ -1866,6 +1840,36 @@ def _unstack(self, unstacker, fill_value, new_placement): | |
] | ||
return blocks, mask | ||
|
||
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: | ||
# asarray needed for Sparse, see GH#24600 | ||
mask = np.asarray(isna(self.values)) | ||
mask = np.atleast_2d(mask) | ||
|
||
values, fill_value = self.values._values_for_factorize() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In general, (eg in geopandas, … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think quantile makes sense for any ordered types, and IIUC values_for_factorize is supposed to preserve ordering. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
In theory I think that's not required for factorize (since we don't sort in factorization). In practice not sure if that matters though (can't think of a situation where it wouldn't be orderable). |
||
|
||
values = np.atleast_2d(values) | ||
|
||
result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) | ||
ndim = np.ndim(result) | ||
|
||
if not is_sparse(self.dtype): | ||
# shape[0] should be 1 as long as EAs are 1D | ||
|
||
if result.ndim == 1: | ||
# i.e. qs was originally a scalar | ||
assert result.shape == (1,), result.shape | ||
result = type(self.values)._from_factorized(result, self.values) | ||
placement = np.arange(len(result)) | ||
|
||
else: | ||
assert result.shape == (1, len(qs)), result.shape | ||
result = type(self.values)._from_factorized(result[0], self.values) | ||
placement = [0] | ||
else: | ||
placement = np.arange(len(result)) | ||
|
||
return make_block(result, placement=placement, ndim=ndim) | ||
|
||
|
||
class HybridMixin: | ||
""" | ||
|
@@ -2184,19 +2188,6 @@ def fillna( | |
value, limit=limit, inplace=inplace, downcast=downcast | ||
) | ||
|
||
def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
    """
    Compute quantiles by round-tripping through an ``M8[ns]`` view.

    The values are viewed as ``M8[ns]`` and reshaped to ``self.shape``,
    wrapped in a block whose ``quantile`` does the actual computation, and
    the raveled result is rebuilt with ``self._holder`` using ``self.dtype``.

    Parameters
    ----------
    qs : passed through to the inner block's ``quantile``
    interpolation : str, default "linear"
        Passed through to the inner block's ``quantile``.
    axis : int, default 0
        Passed through to the inner block's ``quantile``.

    Returns
    -------
    Block
        Built with ``make_block_same_class``, with the ``ndim`` of the
        intermediate result block.
    """
    # TODO(EA2D): kludge for 2D block with 1D values
    naive_2d = self.values.view("M8[ns]").reshape(self.shape)

    naive_result = self.make_block(naive_2d).quantile(
        qs, interpolation=interpolation, axis=axis
    )

    # TODO(EA2D): ravel is kludge for 2D block with 1D values, assumes column-like
    rebuilt = self._holder(naive_result.values.ravel(), dtype=self.dtype)
    return self.make_block_same_class(rebuilt, ndim=naive_result.ndim)
|
||
def _check_ndim(self, values, ndim): | ||
""" | ||
ndim inference and validation. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also call this `na_value`? (like `nanpercentile` does) (I know we use both fill_value and na_value in many places, and somewhat interchangeably, but here I personally find na_value clearer)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about this and decided on fill_value on the theory that "na_value" means "the value that, when we see it, indicates we have an NA" and "fill_value" means "the value that we use when we need to fill in an NA value".
e.g. we get here with fill_value=iNaT, which would be weird to have as an na_value