Skip to content

Commit

Permalink
BUG: regression when applying groupby aggregation on categorical colu…
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesdong1991 authored and TomAugspurger committed Jan 29, 2020
1 parent 05a0b63 commit 3c719f2
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 5 deletions.
48 changes: 48 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,54 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`.
DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
Series([], dtype: float64)
Result dtype inference changes for resample operations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The rules for the result dtype in :meth:`DataFrame.resample` aggregations have changed for extension types (:issue:`31359`).
Previously, pandas would attempt to convert the result back to the original dtype, falling back to the usual
inference rules if that was not possible. Now, pandas will only return a result of the original dtype if the
scalar values in the result are instances of the extension dtype's scalar type.

.. ipython:: python

   df = pd.DataFrame({"A": ['a', 'b']}, dtype='category',
                     index=pd.date_range('2000', periods=2))
   df

*pandas 0.25.x*

.. code-block:: python

   >>> df.resample("2D").agg(lambda x: 'a').A.dtype
   CategoricalDtype(categories=['a', 'b'], ordered=False)

*pandas 1.0.0*

.. ipython:: python

   df.resample("2D").agg(lambda x: 'a').A.dtype

This fixes an inconsistency between ``resample`` and ``groupby``.
This also fixes a potential bug, where the **values** of the result might change
depending on how the results are cast back to the original dtype.

*pandas 0.25.x*

.. code-block:: python

   >>> df.resample("2D").agg(lambda x: 'c')
        A
   0  NaN

*pandas 1.0.0*

.. ipython:: python

   df.resample("2D").agg(lambda x: 'c')

.. _whatsnew_100.api_breaking.python:

Increased minimum version for Python
Expand Down
7 changes: 4 additions & 3 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,9 +813,10 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
# datetime64tz is handled correctly in agg_series,
# so is excluded here.

# return the same type (Series) as our caller
cls = dtype.construct_array_type()
result = try_cast_to_ea(cls, result, dtype=dtype)
if len(result) and isinstance(result[0], dtype.type):
cls = dtype.construct_array_type()
result = try_cast_to_ea(cls, result, dtype=dtype)

elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

Expand Down
11 changes: 11 additions & 0 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,17 @@ def _cython_operation(
if mask.any():
result = result.astype("float64")
result[mask] = np.nan
elif (
how == "add"
and is_integer_dtype(orig_values.dtype)
and is_extension_array_dtype(orig_values.dtype)
):
# We need this to ensure that Series[Int64Dtype].resample().sum()
# remains int64 dtype.
# Two options for avoiding this special case
# 1. mask-aware ops and avoid casting to float with NaN above
# 2. specify the result dtype when calling this method
result = result.astype("int64")

if kind == "aggregate" and self._filter_empty_groups and not counts.all():
assert result.ndim != 2
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,43 @@ def test_lambda_named_agg(func):
tm.assert_frame_equal(result, expected)


def test_aggregate_mixed_types():
    """Aggregate columns to lists when the group keys mix str and int (GH 16916)."""
    frame = pd.DataFrame(
        data=np.array([0] * 9).reshape(3, 3), columns=list("XYZ"), index=list("abc")
    )
    # Two rows share the string key; the third row uses an integer key.
    frame["grouping"] = ["group 1", "group 1", 2]

    # The expected frame lists the integer key first, then the string key,
    # with one list per (group, column) cell.
    want_index = Index([2, "group 1"], dtype="object", name="grouping")
    want_columns = Index(["X", "Y", "Z"], dtype="object")
    want = pd.DataFrame(
        [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]],
        index=want_index,
        columns=want_columns,
    )

    got = frame.groupby("grouping").aggregate(lambda column: column.tolist())
    tm.assert_frame_equal(got, want)


@pytest.mark.xfail(reason="Not implemented.")
def test_aggregate_udf_na_extension_type():
    """A UDF that may return ``pd.NA`` should round-trip to ``Int64`` dtype.

    See https://github.com/pandas-dev/pandas/pull/31359.
    """
    # This is currently failing to cast back to Int64Dtype.
    # The presence of the NA causes two problems:
    # 1. NA is not an instance of Int64Dtype.type (numpy.int64).
    # 2. The presence of an NA forces object dtype, so the non-NA value is
    #    a Python int rather than a NumPy int64. Python ints aren't
    #    instances of numpy.int64.
    def aggfunc(x):
        return 1 if all(x > 2) else pd.NA

    df = pd.DataFrame({"A": pd.array([1, 2, 3])})
    result = df.groupby([1, 1, 2]).agg(aggfunc)

    # Group 1 holds [1, 2]: not all values exceed 2, so aggfunc returns NA.
    # Group 2 holds [3]: every value exceeds 2, so aggfunc returns 1.
    expected = pd.DataFrame({"A": pd.array([pd.NA, 1], dtype="Int64")}, index=[1, 2])
    tm.assert_frame_equal(result, expected)


class TestLambdaMangling:
def test_maybe_mangle_lambdas_passthrough(self):
assert _maybe_mangle_lambdas("mean") == "mean"
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1342,3 +1342,37 @@ def test_series_groupby_categorical_aggregation_getitem():
result = groups["foo"].agg("mean")
expected = groups.agg("mean")["foo"]
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "func, expected_values",
    [(pd.Series.nunique, [1, 1, 2]), (pd.Series.count, [1, 2, 2])],
)
def test_groupby_agg_categorical_columns(func, expected_values):
    """Check ``groupby(...).agg(func)`` on a categorical column (GH 31256).

    ``func`` is the aggregation under test and ``expected_values`` holds the
    per-group results expected for the ``value`` column.
    """
    columns = {
        "id": [0, 1, 2, 3, 4],
        "groups": [0, 1, 1, 2, 2],
        "value": pd.Categorical([0, 0, 0, 0, 1]),
    }
    frame = pd.DataFrame(columns).set_index("id")
    got = frame.groupby("groups").agg(func)

    # The expected frame is built from plain Python ints, indexed by group key.
    group_index = pd.Index([0, 1, 2], name="groups")
    want = pd.DataFrame({"value": expected_values}, index=group_index)
    tm.assert_frame_equal(got, want)


def test_groupby_agg_non_numeric():
    """``agg(pd.Series.nunique)`` and ``nunique()`` agree on a categorical column."""
    categorical = pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"])
    frame = pd.DataFrame({"A": categorical})
    want = pd.DataFrame({"A": [2, 1]}, index=[1, 2])

    # Two spellings of the same aggregation; each is computed and then checked
    # in turn against the same expected frame.
    aggregations = (
        lambda grouped: grouped.agg(pd.Series.nunique),
        lambda grouped: grouped.nunique(),
    )
    for aggregate in aggregations:
        got = aggregate(frame.groupby([1, 2, 1]))
        tm.assert_frame_equal(got, want)
4 changes: 3 additions & 1 deletion pandas/tests/resample/test_datetime_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ def test_resample_integerarray():

result = ts.resample("3T").mean()
expected = Series(
[1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"
[1, 4, 7],
index=pd.date_range("1/1/2000", periods=3, freq="3T"),
dtype="float64",
)
tm.assert_series_equal(result, expected)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/resample/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_resample_categorical_data_with_timedeltaindex():
index=pd.to_timedelta([0, 10], unit="s"),
)
expected = expected.reindex(["Group_obj", "Group"], axis=1)
expected["Group"] = expected["Group_obj"].astype("category")
expected["Group"] = expected["Group_obj"]
tm.assert_frame_equal(result, expected)


Expand Down

0 comments on commit 3c719f2

Please sign in to comment.