Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] groupby failed when using categorical columns with as_index=False #3036

Open
hekaisheng opened this issue May 13, 2022 · 1 comment
Open
Labels
mod: dataframe type: bug Something isn't working

Comments

@hekaisheng
Copy link
Contributor

Describe the bug
`groupby` fails when the group keys are categorical columns and `as_index=False` is passed.

To Reproduce

In [14]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})

In [15]: df = md.DataFrame(a)

In [16]: df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [16], in <cell line: 1>()
----> 1 df.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'}).execute()

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:1308, in agg(groupby, func, method, combine_size, *args, **kwargs)
   1298 use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na)
   1299 agg_op = DataFrameGroupByAgg(
   1300     raw_func=func,
   1301     raw_func_kw=kwargs,
   (...)
   1306     use_inf_as_na=use_inf_as_na,
   1307 )
-> 1308 return agg_op(groupby)

File ~/Documents/mars/mars/core/mode.py:77, in _EnterModeFuncWrapper.__call__.<locals>._inner(*args, **kwargs)
     74 @functools.wraps(func)
     75 def _inner(*args, **kwargs):
     76     with enter_mode(**mode_name_to_value):
---> 77         return func(*args, **kwargs)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:297, in DataFrameGroupByAgg.__call__(self, groupby)
    290     self.output_types = (
    291         [OutputType.dataframe]
    292         if groupby.op.output_types[0] == OutputType.dataframe_groupby
    293         else [OutputType.series]
    294     )
    296 if self.output_types[0] == OutputType.dataframe:
--> 297     return self._call_dataframe(groupby, df)
    298 else:
    299     return self._call_series(groupby, df)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:224, in DataFrameGroupByAgg._call_dataframe(self, groupby, input_df)
    223 def _call_dataframe(self, groupby, input_df):
--> 224     agg_df = build_mock_agg_result(
    225         groupby, self.groupby_params, self.raw_func, **self.raw_func_kw
    226     )
    228     shape = (np.nan, agg_df.shape[1])
    229     index_value = parse_index(agg_df.index, groupby.key, groupby.index_value.key)

File ~/Documents/mars/mars/dataframe/groupby/aggregation.py:141, in build_mock_agg_result(groupby, groupby_params, raw_func, **raw_func_kw)
    134 def build_mock_agg_result(
    135     groupby: GROUPBY_TYPE,
    136     groupby_params: Dict,
    137     raw_func: Callable,
    138     **raw_func_kw,
    139 ):
    140     try:
--> 141         agg_result = groupby.op.build_mock_groupby().aggregate(raw_func, **raw_func_kw)
    142     except ValueError:
    143         if (
    144             groupby_params.get("as_index") or _support_get_group_without_as_index
    145         ):  # pragma: no cover

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    921                 result.columns = result.columns.droplevel(-1)
    923 if not self.as_index:
--> 924     self._insert_inaxis_grouper_inplace(result)
    925     result.index = Index(range(len(result)))
    927 return result

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
   1399 for name, lev, in_axis in zip(
   1400     reversed(self.grouper.names),
   1401     reversed(self.grouper.get_group_levels()),
   (...)
   1404     # GH #28549
   1405     # When using .apply(-), name will be in columns already
   1406     if in_axis and name not in columns:
-> 1407         result.insert(0, name, lev)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4441 if not isinstance(loc, int):
   4442     raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
   4445 self._mgr.insert(loc, column, value)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
   4532     return _reindex_for_setitem(value, self.index)
   4534 if is_list_like(value):
-> 4535     com.require_length_match(value, self.index)
   4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
    553 """
    554 Check the length of data matches the length of the index.
    555 """
    556 if len(data) != len(index):
--> 557     raise ValueError(
    558         "Length of values "
    559         f"({len(data)}) "
    560         "does not match length of index "
    561         f"({len(index)})"
    562     )

ValueError: Length of values (1) does not match length of index (9)

Expected behavior
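The aggregation should complete without raising `ValueError` and return a DataFrame in which the group keys `a` and `b` appear as regular columns alongside the summed `c`.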

Additional context
Add any other context about the problem here.

@hekaisheng hekaisheng added type: bug Something isn't working mod: dataframe labels May 13, 2022
@hekaisheng hekaisheng added this to the v0.10.0a1 milestone May 13, 2022
@hekaisheng hekaisheng changed the title [BUG] groupby using categorical columns failed [BUG] groupby failed when using categorical columns with as_index=False May 13, 2022
@hekaisheng
Copy link
Contributor Author

After digging into it, I found that this is a pandas issue; the same call fails in plain pandas without Mars:

In [20]: a = pd.DataFrame({'a':['a','b', 'c'] * 5, 'b': ['d', 'e', 'f'] * 5, 'c': range(15)}).astype({'a': "category", "b": 'category'})

In [21]: a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [21], in <cell line: 1>()
----> 1 a.groupby(['a', 'b'], as_index=False).agg({'c': 'sum'})

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:924, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    921                 result.columns = result.columns.droplevel(-1)
    923 if not self.as_index:
--> 924     self._insert_inaxis_grouper_inplace(result)
    925     result.index = Index(range(len(result)))
    927 return result

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/groupby/generic.py:1407, in DataFrameGroupBy._insert_inaxis_grouper_inplace(self, result)
   1399 for name, lev, in_axis in zip(
   1400     reversed(self.grouper.names),
   1401     reversed(self.grouper.get_group_levels()),
   (...)
   1404     # GH #28549
   1405     # When using .apply(-), name will be in columns already
   1406     if in_axis and name not in columns:
-> 1407         result.insert(0, name, lev)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4444, in DataFrame.insert(self, loc, column, value, allow_duplicates)
   4441 if not isinstance(loc, int):
   4442     raise TypeError("loc must be int")
-> 4444 value = self._sanitize_column(value)
   4445 self._mgr.insert(loc, column, value)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
   4532     return _reindex_for_setitem(value, self.index)
   4534 if is_list_like(value):
-> 4535     com.require_length_match(value, self.index)
   4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)

File ~/miniconda3/lib/python3.8/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
    553 """
    554 Check the length of data matches the length of the index.
    555 """
    556 if len(data) != len(index):
--> 557     raise ValueError(
    558         "Length of values "
    559         f"({len(data)}) "
    560         "does not match length of index "
    561         f"({len(index)})"
    562     )

ValueError: Length of values (3) does not match length of index (9)

pandas-dev/pandas#46492 describes a workaround.

@qinxuye qinxuye removed this from the v0.10.0a1 milestone May 28, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
mod: dataframe type: bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants