BUG: PandasArray._quantile when empty (pandas-dev#46110)

yehoshuadimarsky · Jul 13, 2022 · b74545c · b74545c
1 parent 9dd1210
commit b74545c
Show file tree

Hide file tree

Showing 9 changed files with 59 additions and 7 deletions.
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
@@ -2,7 +2,6 @@
 
 import numpy as np
 
-from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
     Scalar,
@@ -128,7 +127,10 @@ def _nanpercentile_1d(
     values = values[~mask]
 
     if len(values) == 0:
-        return np.array([na_value] * len(qs), dtype=values.dtype)
+        # Can't pass dtype=values.dtype here because we might have na_value=np.nan
+        #  with values.dtype=int64; see test_quantile_empty.
+        # Equivalent to 'np.array([na_value] * len(qs))' but much faster.
+        return np.full(len(qs), na_value)
 
     return np.percentile(values, qs, **{np_percentile_argname: interpolation})
 
@@ -173,7 +175,7 @@ def _nanpercentile(
         #  have float result at this point, not i8
         return result.astype(values.dtype)
 
-    if not lib.is_scalar(mask) and mask.any():
+    if mask.any():
         # Caller is responsible for ensuring mask shape match
         assert mask.shape == values.shape
         result = [

diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
@@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):
 
     _ndarray: np.ndarray
 
+    # Scalar used to denote an NA value inside our self._ndarray, e.g. -1
+    #  for Categorical, iNaT for Period. Outside of object dtype,
+    #  self.isna() should be True at exactly the locations in self._ndarray
+    #  that hold _internal_fill_value.
+    _internal_fill_value: Any
+
     def _box_func(self, x):
         """
         Wrap numpy type in our dtype.type if necessary.
@@ -463,18 +469,25 @@ def _quantile(
         mask = np.atleast_2d(mask)
 
         arr = np.atleast_2d(self._ndarray)
-        # TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
-        fill_value = self._values_for_factorize()[1]
+        fill_value = self._internal_fill_value
 
         res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)
-
-        result = type(self)._from_factorized(res_values, self)
+        res_values = self._cast_quantile_result(res_values)
+        result = self._from_backing_data(res_values)
         if self.ndim == 1:
             assert result.shape == (1, len(qs)), result.shape
             result = result[0]
 
         return result
 
+    # TODO: see if we can share this with other dispatch-wrapping methods
+    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
+        """
+        Cast the result of quantile_with_mask to an appropriate dtype
+        to pass to _from_backing_data in _quantile.
+        """
+        return res_values
+
     # ------------------------------------------------------------------------
     # numpy-like methods
 

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
     # For comparisons, so that numpy uses our implementation if the compare
     # ops, which raise
     __array_priority__ = 1000
+    _internal_fill_value = -1
     # tolist is not actually deprecated, just suppressed in the __dir__
     _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
     _typ = "categorical"
@@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original):
             original.categories.take(uniques), dtype=original.dtype
         )
 
+    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
+        # make sure we have correct itemsize for resulting codes
+        res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
+        return res_values
+
     def equals(self, other: object) -> bool:
         """
         Returns True if categorical arrays are equal.

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
 
     _typ = "datetimearray"
     _scalar_type = Timestamp
+    _internal_fill_value = np.datetime64("NaT", "ns")
     _recognized_scalars = (datetime, np.datetime64)
     _is_recognized_dtype = is_datetime64_any_dtype
     _infer_matches = ("datetime", "datetime64", "date")

diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -61,6 +61,7 @@ class PandasArray(
     __array_priority__ = 1000
     _ndarray: np.ndarray
     _dtype: PandasDtype
+    _internal_fill_value = np.nan
 
     # ------------------------------------------------------------------------
     # Constructors

diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps):
     __array_priority__ = 1000
     _typ = "periodarray"  # ABCPeriodArray
     _scalar_type = Period
+    _internal_fill_value = np.int64(iNaT)
     _recognized_scalars = (Period,)
     _is_recognized_dtype = is_period_dtype
     _infer_matches = ("period",)
@@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
             return result.view(self.dtype)  # type: ignore[return-value]
         return super().fillna(value=value, method=method, limit=limit)
 
+    # TODO: alternately could override _quantile like searchsorted
+    def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
+        # quantile_with_mask may return float64 instead of int64, in which
+        #  case we need to cast back
+        return res_values.astype(np.int64, copy=False)
+
     # ------------------------------------------------------------------
     # Arithmetic Methods
 

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps):
 
     _typ = "timedeltaarray"
     _scalar_type = Timedelta
+    _internal_fill_value = np.timedelta64("NaT", "ns")
     _recognized_scalars = (timedelta, np.timedelta64, Tick)
     _is_recognized_dtype = is_timedelta64_dtype
     _infer_matches = ("timedelta", "timedelta64")

diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py
@@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value):
 
         with pytest.raises(ValueError, match=msg):
             cat.sort_values(inplace=value)
+
+    def test_quantile_empty(self):
+        # make sure we have correct itemsize on resulting codes
+        cat = Categorical(["A", "B"])
+        idx = Index([0.0, 0.5])
+        result = cat[:0]._quantile(idx, interpolation="linear")
+        assert result._codes.dtype == np.int8
+
+        expected = cat.take([-1, -1], allow_fill=True)
+        tm.assert_extension_array_equal(result, expected)
diff --git a/pandas/tests/arrays/numpy_/test_numpy.py b/pandas/tests/arrays/numpy_/test_numpy.py
@@ -298,3 +298,14 @@ def test_setitem_preserves_views():
     arr[-1] = 2.5
     view1[-1] = 5
     assert arr[-1] == 5
+
+
+@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
+def test_quantile_empty(dtype):
+    # we should get back np.nans, not -1s
+    arr = PandasArray(np.array([], dtype=dtype))
+    idx = pd.Index([0.0, 0.5])
+
+    result = arr._quantile(idx, interpolation="linear")
+    expected = PandasArray(np.array([np.nan, np.nan]))
+    tm.assert_extension_array_equal(result, expected)