Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: PandasArray._quantile when empty #46110

Merged
merged 4 commits into from
Feb 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import (
ArrayLike,
Scalar,
Expand Down Expand Up @@ -128,7 +127,10 @@ def _nanpercentile_1d(
values = values[~mask]

if len(values) == 0:
return np.array([na_value] * len(qs), dtype=values.dtype)
# Can't pass dtype=values.dtype here because we might have na_value=np.nan
# with values.dtype=int64; see test_quantile_empty
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)

return np.percentile(values, qs, **{np_percentile_argname: interpolation})

Expand Down Expand Up @@ -173,7 +175,7 @@ def _nanpercentile(
# have float result at this point, not i8
return result.astype(values.dtype)

if not lib.is_scalar(mask) and mask.any():
if mask.any():
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):

_ndarray: np.ndarray

# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly locations in self._ndarray with
# _internal_fill_value.
_internal_fill_value: Any

def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
Expand Down Expand Up @@ -462,18 +468,25 @@ def _quantile(
mask = np.atleast_2d(mask)

arr = np.atleast_2d(self._ndarray)
# TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
fill_value = self._values_for_factorize()[1]
fill_value = self._internal_fill_value

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

result = type(self)._from_factorized(res_values, self)
res_values = self._cast_quantile_result(res_values)
result = self._from_backing_data(res_values)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]

return result

# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
    """
    Hook for adjusting the dtype of a quantile_with_mask result.

    The returned array is what _quantile passes on to _from_backing_data.
    This implementation performs no cast at all.

    Parameters
    ----------
    res_values : np.ndarray
        Result of quantile_with_mask.

    Returns
    -------
    np.ndarray
        ``res_values`` itself, unmodified.
    """
    unchanged = res_values
    return unchanged

# ------------------------------------------------------------------------
# numpy-like methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
# For comparisons, so that numpy uses our implementation of the compare
# ops, which raise
__array_priority__ = 1000
_internal_fill_value = -1
# tolist is not actually deprecated, just suppressed in the __dir__
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
_typ = "categorical"
Expand Down Expand Up @@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original):
original.categories.take(uniques), dtype=original.dtype
)

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
    """
    Coerce the quantile result to the indexer dtype for our categories.
    """
    # The resulting codes need the itemsize that matches the categories.
    categories = self.dtype.categories
    return coerce_indexer_dtype(res_values, categories)

def equals(self, other: object) -> bool:
"""
Returns True if categorical arrays are equal.
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):

_typ = "datetimearray"
_scalar_type = Timestamp
_internal_fill_value = np.datetime64("NaT", "ns")
_recognized_scalars = (datetime, np.datetime64)
_is_recognized_dtype = is_datetime64_any_dtype
_infer_matches = ("datetime", "datetime64", "date")
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class PandasArray(
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: PandasDtype
_internal_fill_value = np.nan

# ------------------------------------------------------------------------
# Constructors
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps):
__array_priority__ = 1000
_typ = "periodarray" # ABCPeriodArray
_scalar_type = Period
_internal_fill_value = np.int64(iNaT)
_recognized_scalars = (Period,)
_is_recognized_dtype = is_period_dtype
_infer_matches = ("period",)
Expand Down Expand Up @@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)

# TODO: alternately could override _quantile like searchsorted
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
    """
    Cast the result of quantile_with_mask back to int64.

    quantile_with_mask may hand back float64 rather than int64, so the
    values are converted here.
    """
    i8_dtype = np.int64
    # copy=False: do not force a copy when a cast is not required.
    casted = res_values.astype(i8_dtype, copy=False)
    return casted

# ------------------------------------------------------------------
# Arithmetic Methods

Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps):

_typ = "timedeltaarray"
_scalar_type = Timedelta
_internal_fill_value = np.timedelta64("NaT", "ns")
_recognized_scalars = (timedelta, np.timedelta64, Tick)
_is_recognized_dtype = is_timedelta64_dtype
_infer_matches = ("timedelta", "timedelta64")
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value):

with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)

def test_quantile_empty(self):
    # the codes of the result must come back with the correct itemsize
    categorical = Categorical(["A", "B"])
    quantiles = Index([0.0, 0.5])

    empty = categorical[:0]
    result = empty._quantile(quantiles, interpolation="linear")
    assert result._codes.dtype == np.int8

    # taking -1 with allow_fill=True builds the expected two-element array
    expected = categorical.take([-1, -1], allow_fill=True)
    tm.assert_extension_array_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/arrays/numpy_/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,14 @@ def test_setitem_preserves_views():
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5


@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
def test_quantile_empty(dtype):
    # quantiles of an empty integer array should be np.nans, not -1s
    empty = PandasArray(np.array([], dtype=dtype))
    quantiles = pd.Index([0.0, 0.5])

    expected = PandasArray(np.array([np.nan, np.nan]))
    result = empty._quantile(quantiles, interpolation="linear")
    tm.assert_extension_array_equal(result, expected)