Skip to content

Commit 4fd773c

Browse files
committed
BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns
1 parent 44e3c40 commit 4fd773c

File tree

5 files changed

+58
-18
lines changed

5 files changed

+58
-18
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ Groupby/resample/rolling
803803
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
804804
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
805805
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
806+
- Bug in :meth:`DataFrame.groupby` where ``as_index=False`` would modify the grouping column when used with ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` (:issue:`21090`)
806807

807808
Reshaping
808809
^^^^^^^^^

pandas/core/groupby/generic.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,7 +1257,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12571257

12581258
v = values[0]
12591259

1260-
if isinstance(v, (np.ndarray, Index, Series)):
1260+
if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
12611261
if isinstance(v, Series):
12621262
applied_index = self._selected_obj._get_axis(self.axis)
12631263
all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1333,6 +1333,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13331333
result = DataFrame(
13341334
stacked_values.T, index=v.index, columns=key_index
13351335
)
1336+
elif not self.as_index:
1337+
# We add grouping column below, so create a frame here
1338+
result = DataFrame(
1339+
values, index=key_index, columns=[self._selection]
1340+
)
13361341
else:
13371342
# GH#1738: values is list of arrays of unequal lengths
13381343
# fall through to the outer else clause
@@ -1348,6 +1353,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13481353
else:
13491354
result = result._convert(datetime=True)
13501355

1356+
if not self.as_index:
1357+
self._insert_inaxis_grouper_inplace(result)
1358+
13511359
return self._reindex_output(result)
13521360

13531361
# values are not series or array-like but scalars
@@ -1684,9 +1692,11 @@ def _insert_inaxis_grouper_inplace(self, result):
16841692
),
16851693
)
16861694
)
1687-
1695+
columns = result.columns
16881696
for name, lev, in_axis in izip:
1689-
if in_axis:
1697+
# GH #28549
1698+
# When using .apply(-), name will be in columns already
1699+
if in_axis and name not in columns:
16901700
result.insert(0, name, lev)
16911701

16921702
def _wrap_aggregated_output(
@@ -1836,11 +1846,11 @@ def nunique(self, dropna: bool = True):
18361846
5 ham 5 y
18371847
18381848
>>> df.groupby('id').nunique()
1839-
id value1 value2
1849+
value1 value2
18401850
id
1841-
egg 1 1 1
1842-
ham 1 1 2
1843-
spam 1 2 1
1851+
egg 1 1
1852+
ham 1 2
1853+
spam 2 1
18441854
18451855
Check for rows with the same id but conflicting values:
18461856
@@ -1851,7 +1861,7 @@ def nunique(self, dropna: bool = True):
18511861
4 ham 5 x
18521862
5 ham 5 y
18531863
"""
1854-
obj = self._selected_obj
1864+
obj = self._obj_with_exclusions
18551865

18561866
def groupby_series(obj, col=None):
18571867
return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
@@ -1882,6 +1892,9 @@ def groupby_series(obj, col=None):
18821892

18831893
if not self.as_index:
18841894
results.index = ibase.default_index(len(results))
1895+
if results.ndim == 1:
1896+
results = results.to_frame()
1897+
self._insert_inaxis_grouper_inplace(results)
18851898
return results
18861899

18871900
boxplot = boxplot_frame_groupby

pandas/core/groupby/groupby.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -718,11 +718,11 @@ def _make_wrapper(self, name):
718718

719719
# need to setup the selection
720720
# as they are not passed directly but in the grouper
721-
f = getattr(self._selected_obj, name)
721+
f = getattr(self._obj_with_exclusions, name)
722722
if not isinstance(f, types.MethodType):
723723
return self.apply(lambda self: getattr(self, name))
724724

725-
f = getattr(type(self._selected_obj), name)
725+
f = getattr(type(self._obj_with_exclusions), name)
726726
sig = inspect.signature(f)
727727

728728
def wrapper(*args, **kwargs):
@@ -745,7 +745,7 @@ def curried(x):
745745
return self.apply(curried)
746746

747747
try:
748-
return self.apply(curried)
748+
return self._python_apply_general(curried, self._obj_with_exclusions)
749749
except TypeError as err:
750750
if not re.search(
751751
"reduction operation '.*' not allowed for this dtype", str(err)
@@ -836,7 +836,7 @@ def f(g):
836836
# ignore SettingWithCopy here in case the user mutates
837837
with option_context("mode.chained_assignment", None):
838838
try:
839-
result = self._python_apply_general(f)
839+
result = self._python_apply_general(f, self._selected_obj)
840840
except TypeError:
841841
# gh-20949
842842
# try again, with .apply acting as a filtering
@@ -847,12 +847,12 @@ def f(g):
847847
# on a string grouper column
848848

849849
with _group_selection_context(self):
850-
return self._python_apply_general(f)
850+
return self._python_apply_general(f, self._selected_obj)
851851

852852
return result
853853

854-
def _python_apply_general(self, f):
855-
keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
854+
def _python_apply_general(self, f, obj):
855+
keys, values, mutated = self.grouper.apply(f, obj, self.axis)
856856

857857
return self._wrap_applied_output(
858858
keys, values, not_indexed_same=mutated or self.mutated
@@ -1019,7 +1019,7 @@ def _python_agg_general(
10191019
output[key] = maybe_cast_result(result, obj, numeric_only=True)
10201020

10211021
if len(output) == 0:
1022-
return self._python_apply_general(f)
1022+
return self._python_apply_general(f, self._selected_obj)
10231023

10241024
if self.grouper._filter_empty_groups:
10251025

pandas/tests/groupby/test_function.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def test_non_cython_api():
280280
result = g.mad()
281281
tm.assert_frame_equal(result, expected)
282282

283-
expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
283+
expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
284284
result = gni.mad()
285285
tm.assert_frame_equal(result, expected)
286286

@@ -573,6 +573,32 @@ def test_ops_general(op, targop):
573573
tm.assert_frame_equal(result, expected)
574574

575575

576+
def test_ops_not_as_index(reduction_func):
577+
# GH 21090
578+
# Using as_index=False should not modify grouped column
579+
580+
if reduction_func in ("corrwith",):
581+
pytest.skip("Test not applicable")
582+
583+
if reduction_func in ("nth", "ngroup", "size",):
584+
pytest.skip("Skip until behavior is determined (GH #5755)")
585+
586+
if reduction_func in ("sem", "std"):
587+
pytest.skip("Function incorrectly modifies keys (GH #10355)")
588+
589+
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
590+
expected = getattr(df.groupby("a"), reduction_func)().reset_index()
591+
592+
result = getattr(df.groupby("a", as_index=False), reduction_func)()
593+
tm.assert_frame_equal(result, expected)
594+
595+
result = df.groupby("a", as_index=False).agg(reduction_func)
596+
tm.assert_frame_equal(result, expected)
597+
598+
result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
599+
tm.assert_frame_equal(result, expected)
600+
601+
576602
def test_max_nan_bug():
577603
raw = """,Date,app,File
578604
-04-23,2013-04-23 00:00:00,,log080001.log

pandas/tests/groupby/test_whitelist.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
406406
if new_names:
407407
msg = f"""
408408
There are uncatgeorized methods defined on the Grouper class:
409-
{names}.
409+
{new_names}.
410410
411411
Was a new method recently added?
412412

0 commit comments

Comments
 (0)