Skip to content

Commit

Permalink
Backport PR #48782 on branch 1.5.x (REGR: describe raising when result contains NA) (#48793)
Browse files Browse the repository at this point in the history

Backport PR #48782: REGR: describe raising when result contains NA

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl authored Sep 26, 2022
1 parent 486fe15 commit 2dfbe0c
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 3 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ Fixed regressions
- Fixed regression in :meth:`Series.__setitem__` casting ``None`` to ``NaN`` for object dtype (:issue:`48665`)
- Fixed regression in :meth:`DataFrame.loc` when setting values as a :class:`DataFrame` with all ``True`` indexer (:issue:`48701`)
- Fixed regression in :func:`.read_csv` causing an ``EmptyDataError`` when using a UTF-8 file handle that was already read from (:issue:`48646`)
- Fixed regression in :meth:`DataFrame.describe` raising ``TypeError`` when result contains ``NA`` (:issue:`48778`)
- Fixed regression in :meth:`DataFrame.plot` ignoring invalid ``colormap`` for ``kind="scatter"`` (:issue:`48726`)
- Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`)
-
Expand Down
11 changes: 10 additions & 1 deletion pandas/core/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from pandas._libs.tslibs import Timestamp
from pandas._typing import (
DtypeObj,
NDFrameT,
npt,
)
Expand All @@ -34,10 +35,12 @@
is_bool_dtype,
is_complex_dtype,
is_datetime64_any_dtype,
is_extension_array_dtype,
is_numeric_dtype,
is_timedelta64_dtype,
)

import pandas as pd
from pandas.core.reshape.concat import concat

from pandas.io.formats.format import format_percentiles
Expand Down Expand Up @@ -242,7 +245,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
+ [series.max()]
)
# GH#48340 - always return float on non-complex numeric data
dtype = float if is_numeric_dtype(series) and not is_complex_dtype(series) else None
dtype: DtypeObj | None
if is_extension_array_dtype(series):
dtype = pd.Float64Dtype()
elif is_numeric_dtype(series) and not is_complex_dtype(series):
dtype = np.dtype("float")
else:
dtype = None
return Series(d, index=stat_index, name=series.name, dtype=dtype)


Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,15 @@ def test_describe_with_duplicate_columns(self):
ser = df.iloc[:, 0].describe()
expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
tm.assert_frame_equal(result, expected)

def test_ea_with_na(self, any_numeric_ea_dtype):
    """Check that ``DataFrame.describe`` works when its result contains NA.

    Regression test for GH#48778. The frame is built with the dtype
    supplied by the ``any_numeric_ea_dtype`` fixture (presumably each
    nullable numeric extension dtype -- confirm against conftest).

    Column "a" holds a single valid value, so its ``std`` is NA while every
    other statistic is 1.0. Column "b" is entirely NA, so only ``count``
    (0.0) is defined. The summary is expected to come back as ``Float64``.
    """
    # GH#48778

    df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
    result = df.describe()
    expected = DataFrame(
        # Row order follows ``index`` below: count, mean, std, min, 25%, 50%, 75%, max.
        {"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype="Float64",
    )
    tm.assert_frame_equal(result, expected)
12 changes: 10 additions & 2 deletions pandas/tests/series/methods/test_describe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import numpy as np

from pandas.core.dtypes.common import is_complex_dtype
from pandas.core.dtypes.common import (
is_complex_dtype,
is_extension_array_dtype,
)

from pandas import (
Period,
Expand Down Expand Up @@ -154,6 +157,11 @@ def test_datetime_is_numeric_includes_datetime(self):

def test_numeric_result_dtype(self, any_numeric_dtype):
# GH#48340 - describe should always return float on non-complex numeric input
if is_extension_array_dtype(any_numeric_dtype):
dtype = "Float64"
else:
dtype = "complex128" if is_complex_dtype(any_numeric_dtype) else None

ser = Series([0, 1], dtype=any_numeric_dtype)
result = ser.describe()
expected = Series(
Expand All @@ -168,6 +176,6 @@ def test_numeric_result_dtype(self, any_numeric_dtype):
1.0,
],
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
dtype="complex128" if is_complex_dtype(ser) else None,
dtype=dtype,
)
tm.assert_series_equal(result, expected)

0 comments on commit 2dfbe0c

Please sign in to comment.