Skip to content

Commit

Permalink
BUG: Use correct ExtensionArray reductions in DataFrame reductions (p…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and fangchenli committed Jul 16, 2020
1 parent 5c88f92 commit 3de5714
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 8 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,7 @@ Numeric
- Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`)
- Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`)
- Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`)
- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`)

Conversion
^^^^^^^^^^
Expand Down
27 changes: 19 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import extract_array
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
Expand Down Expand Up @@ -8509,7 +8510,14 @@ def _count_level(self, level, axis=0, numeric_only=False):
return result

def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
self,
op,
name: str,
axis=0,
skipna=True,
numeric_only=None,
filter_type=None,
**kwds,
):

assert filter_type is None or filter_type == "bool", filter_type
Expand Down Expand Up @@ -8541,8 +8549,11 @@ def _reduce(
labels = self._get_agg_axis(axis)
constructor = self._constructor

def f(x):
return op(x, axis=axis, skipna=skipna, **kwds)
def func(values):
if is_extension_array_dtype(values.dtype):
return extract_array(values)._reduce(name, skipna=skipna, **kwds)
else:
return op(values, axis=axis, skipna=skipna, **kwds)

def _get_data(axis_matters):
if filter_type is None:
Expand Down Expand Up @@ -8589,7 +8600,7 @@ def blk_func(values):
out[:] = coerce_to_dtypes(out.values, df.dtypes)
return out

if not self._is_homogeneous_type:
if not self._is_homogeneous_type or self._mgr.any_extension_types:
# try to avoid self.values call

if filter_type is None and axis == 0 and len(self) > 0:
Expand All @@ -8609,7 +8620,7 @@ def blk_func(values):
from pandas.core.apply import frame_apply

opa = frame_apply(
self, func=f, result_type="expand", ignore_failures=True
self, func=func, result_type="expand", ignore_failures=True
)
result = opa.get_result()
if result.ndim == self.ndim:
Expand All @@ -8621,7 +8632,7 @@ def blk_func(values):
values = data.values

try:
result = f(values)
result = func(values)

except TypeError:
# e.g. in nanops trying to convert strs to float
Expand All @@ -8632,7 +8643,7 @@ def blk_func(values):

values = data.values
with np.errstate(all="ignore"):
result = f(values)
result = func(values)

else:
if numeric_only:
Expand All @@ -8643,7 +8654,7 @@ def blk_func(values):
else:
data = self
values = data.values
result = f(values)
result = func(values)

if filter_type == "bool" and is_object_dtype(values) and axis is None:
# work around https://github.com/numpy/numpy/issues/10489
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
assert result == expected


@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
# https://github.com/pandas-dev/pandas/pull/32867
# ensure the integers are not cast to float during reductions
df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
result = df.max()
assert isinstance(result["a"], np.int64)


# TODO(jreback) - these need testing / are broken

# shift
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
df = DataFrame([expected])
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)


def test_mixed_frame_with_integer_sum():
# https://github.com/pandas-dev/pandas/issues/34520
df = pd.DataFrame([["a", 1]], columns=list("ab"))
df = df.astype({"b": "Int64"})
result = df.sum()
expected = pd.Series(["a", 1], index=["a", "b"])
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
# https://github.com/pandas-dev/pandas/issues/32651
int64_info = np.iinfo("int64")
ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
df = DataFrame({"Int64": ser})
result = getattr(df, method)(numeric_only=numeric_only)
expected = Series(
[getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
)
tm.assert_series_equal(result, expected)

0 comments on commit 3de5714

Please sign in to comment.