Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make groupby transform-like op order match original data order #8720

Merged
merged 3 commits into from
Aug 4, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,10 @@ def agg(self, func):
else:
raise

if libgroupby._is_all_scan_aggregate(normalized_aggs):
# Scan aggregations return rows in original index order
return self._mimic_pandas_order(result)

# set index names to be group key names
if len(result):
result.index.names = self.grouping.names
Expand Down Expand Up @@ -981,22 +985,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
if not axis == 0:
raise NotImplementedError("Only axis=0 is supported.")

value_column_names = [
x for x in self.obj._column_names if x not in self.grouping.names
]
num_columns_to_shift = len(value_column_names)
value_columns = self.grouping.values
if is_list_like(fill_value):
if not len(fill_value) == num_columns_to_shift:
if not len(fill_value) == len(value_columns._data):
raise ValueError(
"Mismatched number of columns and values to fill."
)
else:
fill_value = [fill_value] * num_columns_to_shift
fill_value = [fill_value] * len(value_columns._data)

value_columns = self.obj._data.select_by_label(value_column_names)
return self.obj.__class__._from_data(
result = self.obj.__class__._from_data(
*self._groupby.shift(Table(value_columns), periods, fill_value)
)
result = self._mimic_pandas_order(result)
Copy link
Contributor

@shwina shwina Jul 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does a multi-column sort, which we can avoid by appending a column 0...N to the dataframe before the groupby and then sorting by that single column later.

Copy link
Contributor Author

@isVoid isVoid Jul 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Catched up offline, this is certainly a good optimization to our current approach. To achieve this we would require libcudf to perform a "no-op" on the sequence column. However a "no-op" wouldn't fit in our current libcudf aggregation framework because they are required to be binary (reduction) ops.

We discussed alternatives but settled upon it's best to just merge what we have so far and raise an issue to track the optimization thoughts with more people joining the dicussion.

return result._copy_type_metadata(value_columns)

def _mimic_pandas_order(
self, result: DataFrameOrSeries
Expand Down
20 changes: 0 additions & 20 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,6 @@ def test_groupby_2keys_scan(nelem, func):
# pd.groupby.cumcount returns a series.
if isinstance(expect_df, pd.Series):
expect_df = expect_df.to_frame("val")
expect_df = expect_df.set_index([pdf["x"], pdf["y"]]).sort_index()

check_dtype = False if func in _index_type_aggs else True
assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
Expand Down Expand Up @@ -1734,10 +1733,6 @@ def test_groupby_shift_row(nelem, shift_perc, direction, fill_value):
)
got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = pd.MultiIndex.from_frame(gdf[["x", "y"]].to_pandas())
assert_groupby_results_equal(
expected[["val", "val2"]], got[["val", "val2"]]
)
Expand Down Expand Up @@ -1776,10 +1771,6 @@ def test_groupby_shift_row_mixed_numerics(
expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)
got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1817,10 +1808,6 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction):
expected = pdf.groupby(["0"]).shift(periods=n_shift)
got = gdf.groupby(["0"]).shift(periods=n_shift)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1880,10 +1867,6 @@ def test_groupby_shift_row_mixed_fill(

got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value)

# Pandas returns shifted column in original row order. We set its index
# to be the key columns, so that `assert_groupby_results_equal` can sort
# rows by key columns to make sure cudf and pandas results matches.
expected.index = gdf["0"].to_pandas()
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down Expand Up @@ -1916,9 +1899,6 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
expected = gdf
got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value)

# Here, the result should be the same as input due to 0-shift, only the
# key orders are different.
expected = expected.set_index("0")
assert_groupby_results_equal(
expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
)
Expand Down