
BUG: Grouper specified by key not regarded as in-axis #50414


Merged: 2 commits, Dec 28, 2022
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -932,6 +932,8 @@ Groupby/resample/rolling
- Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` with ``dropna=False`` would drop NA values when the grouper was categorical (:issue:`36327`)
- Bug in :meth:`.SeriesGroupBy.nunique` would incorrectly raise when the grouper was an empty categorical and ``observed=True`` (:issue:`21334`)
- Bug in :meth:`.SeriesGroupBy.nth` would raise when grouper contained NA values after subsetting from a :class:`DataFrameGroupBy` (:issue:`26454`)
+ - Bug in :meth:`DataFrame.groupby` would not include a :class:`.Grouper` specified by ``key`` in the result when ``as_index=False`` (:issue:`50413`)
+ -

Reshaping
^^^^^^^^^
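For context, a minimal sketch of the behavior this whatsnew entry describes; the frame below is illustrative rather than taken from the issue, and the printed output assumes the fix is applied:

```python
import pandas as pd

# Illustrative frame; any column referenced by Grouper(key=...) will do.
df = pd.DataFrame({"b": [1, 1, 2], "c": [3, 4, 5]})

# Group by a Grouper specified via ``key`` with as_index=False.
# Before this fix the "b" column was dropped from the result; with the fix
# the grouping is treated as in-axis and "b" is kept in the output.
result = df.groupby(pd.Grouper(key="b"), as_index=False).sum()
print(result)
#    b  c
# 0  1  7
# 1  2  5
```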
8 changes: 4 additions & 4 deletions pandas/core/groupby/groupby.py
@@ -1014,10 +1014,10 @@ def _set_group_selection(self) -> None:
"""
# This is a no-op for SeriesGroupBy
grp = self.grouper
- if not (
-     grp.groupings is not None
-     and self.obj.ndim > 1
-     and self._group_selection is None
+ if (
+     grp.groupings is None
+     or self.obj.ndim == 1
+     or self._group_selection is not None
):
Member Author:

This leaked into here, but I've always found the old version quite hard to read.

return

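The new guard is the De Morgan dual of the old one, with `self.obj.ndim > 1` negated to `self.obj.ndim == 1` (ndim is only ever 1 or 2 for pandas objects). A quick illustrative check of the equivalence, using stand-in booleans rather than the actual pandas internals:

```python
from itertools import product

# a, b, c stand in for the three conditions tested in _set_group_selection.
for a, b, c in product([True, False], repeat=3):
    old_guard = not (a and b and c)            # old form: negated conjunction
    new_guard = (not a) or (not b) or (not c)  # new form: disjunction of negations
    assert old_guard == new_guard              # De Morgan: the two always agree
```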
2 changes: 1 addition & 1 deletion pandas/core/groupby/grouper.py
@@ -906,7 +906,7 @@ def is_in_obj(gpr) -> bool:
elif isinstance(gpr, Grouper) and gpr.key is not None:
# Add key to exclusions
exclusions.add(gpr.key)
- in_axis = False
+ in_axis = True
else:
in_axis = False

65 changes: 38 additions & 27 deletions pandas/tests/groupby/test_grouping.py
@@ -7,6 +7,7 @@
from pandas import (
CategoricalIndex,
DataFrame,
+ Grouper,
Index,
MultiIndex,
Series,
@@ -168,7 +169,7 @@ def test_grouper_index_types(self, index):
def test_grouper_multilevel_freq(self):

# GH 7885
- # with level and freq specified in a pd.Grouper
+ # with level and freq specified in a Grouper
from datetime import (
date,
timedelta,
@@ -182,20 +183,20 @@ def test_grouper_multilevel_freq(self):
# Check string level
expected = (
df.reset_index()
- .groupby([pd.Grouper(key="foo", freq="W"), pd.Grouper(key="bar", freq="W")])
+ .groupby([Grouper(key="foo", freq="W"), Grouper(key="bar", freq="W")])
.sum()
)
# reset index changes columns dtype to object
expected.columns = Index([0], dtype="int64")

result = df.groupby(
- [pd.Grouper(level="foo", freq="W"), pd.Grouper(level="bar", freq="W")]
+ [Grouper(level="foo", freq="W"), Grouper(level="bar", freq="W")]
).sum()
tm.assert_frame_equal(result, expected)

# Check integer level
result = df.groupby(
- [pd.Grouper(level=0, freq="W"), pd.Grouper(level=1, freq="W")]
+ [Grouper(level=0, freq="W"), Grouper(level=1, freq="W")]
).sum()
tm.assert_frame_equal(result, expected)

@@ -206,11 +207,11 @@ def test_grouper_creation_bug(self):
g = df.groupby("A")
expected = g.sum()

- g = df.groupby(pd.Grouper(key="A"))
+ g = df.groupby(Grouper(key="A"))
result = g.sum()
tm.assert_frame_equal(result, expected)

- g = df.groupby(pd.Grouper(key="A", axis=0))
+ g = df.groupby(Grouper(key="A", axis=0))
result = g.sum()
tm.assert_frame_equal(result, expected)

@@ -220,13 +221,13 @@ def test_grouper_creation_bug(self):
tm.assert_frame_equal(result, expected)

# GH14334
- # pd.Grouper(key=...) may be passed in a list
+ # Grouper(key=...) may be passed in a list
df = DataFrame(
{"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
)
# Group by single column
expected = df.groupby("A").sum()
- g = df.groupby([pd.Grouper(key="A")])
+ g = df.groupby([Grouper(key="A")])
result = g.sum()
tm.assert_frame_equal(result, expected)

@@ -235,17 +236,17 @@ def test_grouper_creation_bug(self):
expected = df.groupby(["A", "B"]).sum()

# Group with two Grouper objects
- g = df.groupby([pd.Grouper(key="A"), pd.Grouper(key="B")])
+ g = df.groupby([Grouper(key="A"), Grouper(key="B")])
result = g.sum()
tm.assert_frame_equal(result, expected)

# Group with a string and a Grouper object
g = df.groupby(["A", pd.Grouper(key="B")])
g = df.groupby(["A", Grouper(key="B")])
result = g.sum()
tm.assert_frame_equal(result, expected)

# Group with a Grouper object and a string
- g = df.groupby([pd.Grouper(key="A"), "B"])
+ g = df.groupby([Grouper(key="A"), "B"])
result = g.sum()
tm.assert_frame_equal(result, expected)

@@ -257,15 +258,15 @@ def test_grouper_creation_bug(self):
names=["one", "two", "three"],
),
)
- result = s.groupby(pd.Grouper(level="three", freq="M")).sum()
+ result = s.groupby(Grouper(level="three", freq="M")).sum()
expected = Series(
[28],
index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"),
)
tm.assert_series_equal(result, expected)

# just specifying a level breaks
- result = s.groupby(pd.Grouper(level="one")).sum()
+ result = s.groupby(Grouper(level="one")).sum()
expected = s.groupby(level="one").sum()
tm.assert_series_equal(result, expected)

@@ -282,18 +283,14 @@ def test_grouper_column_and_index(self):
{"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
index=idx,
)
- result = df_multi.groupby(["B", pd.Grouper(level="inner")]).mean(
-     numeric_only=True
- )
+ result = df_multi.groupby(["B", Grouper(level="inner")]).mean(numeric_only=True)
expected = (
df_multi.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
)
tm.assert_frame_equal(result, expected)

# Test the reverse grouping order
- result = df_multi.groupby([pd.Grouper(level="inner"), "B"]).mean(
-     numeric_only=True
- )
+ result = df_multi.groupby([Grouper(level="inner"), "B"]).mean(numeric_only=True)
expected = (
df_multi.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
)
@@ -302,7 +299,7 @@ def test_grouper_column_and_index(self):
# Grouping a single-index frame by a column and the index should
# be equivalent to resetting the index and grouping by two columns
df_single = df_multi.reset_index("outer")
result = df_single.groupby(["B", pd.Grouper(level="inner")]).mean(
result = df_single.groupby(["B", Grouper(level="inner")]).mean(
numeric_only=True
)
expected = (
@@ -311,7 +308,7 @@ def test_grouper_column_and_index(self):
tm.assert_frame_equal(result, expected)

# Test the reverse grouping order
- result = df_single.groupby([pd.Grouper(level="inner"), "B"]).mean(
+ result = df_single.groupby([Grouper(level="inner"), "B"]).mean(
numeric_only=True
)
expected = (
@@ -368,7 +365,7 @@ def test_grouper_getting_correct_binner(self):
),
)
result = df.groupby(
- [pd.Grouper(level="one"), pd.Grouper(level="two", freq="M")]
+ [Grouper(level="one"), Grouper(level="two", freq="M")]
).sum()
expected = DataFrame(
{"A": [31, 28, 21, 31, 28, 21]},
@@ -646,7 +643,7 @@ def test_list_grouper_with_nat(self):
# GH 14715
df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")})
df.iloc[-1] = pd.NaT
- grouper = pd.Grouper(key="date", freq="AS")
+ grouper = Grouper(key="date", freq="AS")

# Grouper in a list grouping
result = df.groupby([grouper])
@@ -847,7 +844,7 @@ def test_groupby_with_empty(self):
index = pd.DatetimeIndex(())
data = ()
series = Series(data, index, dtype=object)
- grouper = pd.Grouper(freq="D")
+ grouper = Grouper(freq="D")
grouped = series.groupby(grouper)
assert next(iter(grouped), None) is None

@@ -982,7 +979,7 @@ def test_groupby_with_small_elem(self):
{"event": ["start", "start"], "change": [1234, 5678]},
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
)
- grouped = df.groupby([pd.Grouper(freq="M"), "event"])
+ grouped = df.groupby([Grouper(freq="M"), "event"])
assert len(grouped.groups) == 2
assert grouped.ngroups == 2
assert (Timestamp("2014-09-30"), "start") in grouped.groups
@@ -997,7 +994,7 @@ def test_groupby_with_small_elem(self):
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
)
- grouped = df.groupby([pd.Grouper(freq="M"), "event"])
+ grouped = df.groupby([Grouper(freq="M"), "event"])
assert len(grouped.groups) == 2
assert grouped.ngroups == 2
assert (Timestamp("2014-09-30"), "start") in grouped.groups
@@ -1013,7 +1010,7 @@ def test_groupby_with_small_elem(self):
{"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
)
- grouped = df.groupby([pd.Grouper(freq="M"), "event"])
+ grouped = df.groupby([Grouper(freq="M"), "event"])
assert len(grouped.groups) == 3
assert grouped.ngroups == 3
assert (Timestamp("2014-09-30"), "start") in grouped.groups
@@ -1036,3 +1033,17 @@ def test_grouping_string_repr(self):
result = gr.grouper.groupings[0].__repr__()
expected = "Grouping(('A', 'a'))"
assert result == expected


+ def test_grouping_by_key_is_in_axis():
+     # GH#50413 - Groupers specified by key are in-axis
+     df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a")
+     gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False)
+     assert not gb.grouper.groupings[0].in_axis
+     assert gb.grouper.groupings[1].in_axis
+
+     # Currently only in-axis groupings are included in the result when as_index=False;
+     # this is likely to change in the future.
+     result = gb.sum()
+     expected = DataFrame({"b": [1, 2], "c": [7, 5]})
+     tm.assert_frame_equal(result, expected)