Skip to content

BUG: Add squeeze keyword to groupby to allow reduction in returned type #3599

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 15, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ pandas 0.11.1
``timedelta64[ns]`` to ``object/int`` (GH3425_)
- Do not allow datetimelike/timedeltalike creation except with valid types
(e.g. cannot pass ``datetime64[ms]``) (GH3423_)
- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. Fixes a regression from 0.10.1;
partial revert of (GH2893_) with (GH3596_)

**Bug Fixes**

Expand Down Expand Up @@ -161,6 +164,7 @@ pandas 0.11.1
.. _GH3594: https://github.com/pydata/pandas/issues/3594
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3610: https://github.com/pydata/pandas/issues/3610
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3435: https://github.com/pydata/pandas/issues/3435


Expand Down
22 changes: 22 additions & 0 deletions doc/source/v0.11.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,26 @@ API changes
p / p
p / 0

- Add ``squeeze`` keyword to ``groupby`` to allow reduction from
DataFrame -> Series if groups are unique. This fixes a regression from 0.10.1:
we are reverting to the prior behavior, so by default ``groupby`` returns
same-shaped objects whether the groups are unique or not. Partial revert of
(GH2893_) with (GH3596_).

.. ipython:: python

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

# squeezing the result frame to a series (because we have unique groups)
df2.groupby("val1", squeeze=True).apply(func)

# no squeezing (the default, and behavior in 0.10.1)
df2.groupby("val1").apply(func)


Enhancements
~~~~~~~~~~~~
- ``pd.read_html()`` can now parse HTML string, files or urls and return dataframes
Expand All @@ -44,5 +64,7 @@ on GitHub for a complete list.
.. _GH3477: https://github.com/pydata/pandas/issues/3477
.. _GH3492: https://github.com/pydata/pandas/issues/3492
.. _GH3499: https://github.com/pydata/pandas/issues/3499
.. _GH2893: https://github.com/pydata/pandas/issues/2893
.. _GH3596: https://github.com/pydata/pandas/issues/3596
.. _GH3590: https://github.com/pydata/pandas/issues/3590
.. _GH3435: https://github.com/pydata/pandas/issues/3435
8 changes: 6 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get(self, key, default=None):
return default

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True):
group_keys=True, squeeze=False):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns
Expand All @@ -131,6 +131,9 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
Sort group keys. Get better performance by turning this off
group_keys : boolean, default True
When calling apply, add group keys to index to identify pieces
squeeze : boolean, default False
reduce the dimensionality of the return type if possible, otherwise
return a consistent type

Examples
--------
Expand All @@ -150,7 +153,8 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
from pandas.core.groupby import groupby
axis = self._get_axis_number(axis)
return groupby(self, by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys)
sort=sort, group_keys=group_keys,
squeeze=squeeze)

def asfreq(self, freq, method=None, how=None, normalize=False):
"""
Expand Down
26 changes: 17 additions & 9 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ class GroupBy(object):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True):
sort=True, group_keys=True, squeeze=False):
self._selection = selection

if isinstance(obj, NDFrame):
Expand All @@ -189,6 +189,7 @@ def __init__(self, obj, keys=None, axis=0, level=None,
self.keys = keys
self.sort = sort
self.group_keys = group_keys
self.squeeze = squeeze

if grouper is None:
grouper, exclusions = _get_grouper(obj, keys, axis=axis,
Expand Down Expand Up @@ -1841,15 +1842,22 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
all_indexed_same = _all_indexes_same([x.index for x in values])
singular_series = len(values) == 1 and applied_index.nlevels == 1

# assign the name to this series
if singular_series:
values[0].name = keys[0]
# GH3596
# provide a reduction (Frame -> Series) if groups are unique
if self.squeeze:

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single values
if singular_series or not all_indexed_same:
# assign the name to this series
if singular_series:
values[0].name = keys[0]

# GH2893
# we have series in the values array, we want to produce a series:
# if any of the sub-series are not indexed the same
# OR we don't have a multi-index and we have only a single values
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

if not all_indexed_same:
return self._concat_objects(keys, values,
not_indexed_same=not_indexed_same)

Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,24 +263,29 @@ def test_groupby_nonobject_dtype(self):

def test_groupby_return_type(self):

# GH2893
# GH2893, return a reduced type
df1 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":2, "val2": 27}, {"val1":2, "val2": 12}])

def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df1.groupby("val1").apply(func)
result = df1.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

df2 = DataFrame([{"val1": 1, "val2" : 20}, {"val1":1, "val2": 19},
{"val1":1, "val2": 27}, {"val1":1, "val2": 12}])
def func(dataf):
return dataf["val2"] - dataf["val2"].mean()

result = df2.groupby("val1").apply(func)
result = df2.groupby("val1", squeeze=True).apply(func)
self.assert_(isinstance(result,Series))

# GH3596, return a consistent type (regression in 0.11 from 0.10.1)
df = DataFrame([[1,1],[1,1]],columns=['X','Y'])
result = df.groupby('X',squeeze=False).count()
self.assert_(isinstance(result,DataFrame))

def test_agg_regression1(self):
grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.agg(np.mean)
Expand Down