BUG: Use correct ExtensionArray reductions in DataFrame reductions (p…

…andas-dev#35254)
fangchenli · Jul 16, 2020 · 3de5714 · 3de5714
1 parent 5c88f92
commit 3de5714
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 8 deletions.
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -916,6 +916,7 @@ Numeric
 - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
 - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
 - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
+- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)
 
 Conversion
 ^^^^^^^^^^

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -117,6 +117,7 @@
 from pandas.core.arrays import Categorical, ExtensionArray
 from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
 from pandas.core.arrays.sparse import SparseFrameAccessor
+from pandas.core.construction import extract_array
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.indexes import base as ibase
 from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
@@ -8509,7 +8510,14 @@ def _count_level(self, level, axis=0, numeric_only=False):
         return result
 
     def _reduce(
-        self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
+        self,
+        op,
+        name: str,
+        axis=0,
+        skipna=True,
+        numeric_only=None,
+        filter_type=None,
+        **kwds,
     ):
 
         assert filter_type is None or filter_type == "bool", filter_type
@@ -8541,8 +8549,11 @@ def _reduce(
             labels = self._get_agg_axis(axis)
             constructor = self._constructor
 
-        def f(x):
-            return op(x, axis=axis, skipna=skipna, **kwds)
+        def func(values):
+            if is_extension_array_dtype(values.dtype):
+                return extract_array(values)._reduce(name, skipna=skipna, **kwds)
+            else:
+                return op(values, axis=axis, skipna=skipna, **kwds)
 
         def _get_data(axis_matters):
             if filter_type is None:
@@ -8589,7 +8600,7 @@ def blk_func(values):
                 out[:] = coerce_to_dtypes(out.values, df.dtypes)
             return out
 
-        if not self._is_homogeneous_type:
+        if not self._is_homogeneous_type or self._mgr.any_extension_types:
             # try to avoid self.values call
 
             if filter_type is None and axis == 0 and len(self) > 0:
@@ -8609,7 +8620,7 @@ def blk_func(values):
                 from pandas.core.apply import frame_apply
 
                 opa = frame_apply(
-                    self, func=f, result_type="expand", ignore_failures=True
+                    self, func=func, result_type="expand", ignore_failures=True
                 )
                 result = opa.get_result()
                 if result.ndim == self.ndim:
@@ -8621,7 +8632,7 @@ def blk_func(values):
             values = data.values
 
             try:
-                result = f(values)
+                result = func(values)
 
             except TypeError:
                 # e.g. in nanops trying to convert strs to float
@@ -8632,7 +8643,7 @@ def blk_func(values):
 
                 values = data.values
                 with np.errstate(all="ignore"):
-                    result = f(values)
+                    result = func(values)
 
         else:
             if numeric_only:
@@ -8643,7 +8654,7 @@ def blk_func(values):
             else:
                 data = self
                 values = data.values
-            result = f(values)
+            result = func(values)
 
         if filter_type == "bool" and is_object_dtype(values) and axis is None:
             # work around https://github.com/numpy/numpy/issues/10489

diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
@@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
     assert result == expected
 
 
+@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
+def test_dataframe_reductions(op):
+    # https://github.com/pandas-dev/pandas/pull/32867
+    # ensure the integers are not cast to float during reductions
+    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
+    result = df.max()
+    assert isinstance(result["a"], np.int64)
+
+
 # TODO(jreback) - these need testing / are broken
 
 # shift

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
         df = DataFrame([expected])
         result = getattr(df, method)(axis=1)
         tm.assert_series_equal(result, expected)
+
+
+def test_mixed_frame_with_integer_sum():
+    # https://github.com/pandas-dev/pandas/issues/34520
+    df = pd.DataFrame([["a", 1]], columns=list("ab"))
+    df = df.astype({"b": "Int64"})
+    result = df.sum()
+    expected = pd.Series(["a", 1], index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("numeric_only", [True, False, None])
+@pytest.mark.parametrize("method", ["min", "max"])
+def test_minmax_extensionarray(method, numeric_only):
+    # https://github.com/pandas-dev/pandas/issues/32651
+    int64_info = np.iinfo("int64")
+    ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
+    df = DataFrame({"Int64": ser})
+    result = getattr(df, method)(numeric_only=numeric_only)
+    expected = Series(
+        [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
+    )
+    tm.assert_series_equal(result, expected)