-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: correctly instantiate subclassed DataFrame/Series in groupby apply #45363
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -753,6 +753,7 @@ def apply( | |
zipped = zip(group_keys, splitter) | ||
|
||
for key, group in zipped: | ||
group = group.__finalize__(data, method="groupby") | ||
object.__setattr__(group, "name", key) | ||
|
||
# group might be modified | ||
|
@@ -1000,6 +1001,7 @@ def _aggregate_series_pure_python( | |
splitter = get_splitter(obj, ids, ngroups, axis=0) | ||
|
||
for i, group in enumerate(splitter): | ||
group = group.__finalize__(obj, method="groupby") | ||
res = func(group) | ||
res = libreduction.extract_result(res) | ||
|
||
|
@@ -1243,13 +1245,7 @@ def _chop(self, sdata: Series, slice_obj: slice) -> Series: | |
# fastpath equivalent to `sdata.iloc[slice_obj]` | ||
mgr = sdata._mgr.get_slice(slice_obj) | ||
# __finalize__ not called here, must be applied by caller if applicable | ||
|
||
# fastpath equivalent to: | ||
# `return sdata._constructor(mgr, name=sdata.name, fastpath=True)` | ||
obj = type(sdata)._from_mgr(mgr) | ||
object.__setattr__(obj, "_flags", sdata._flags) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are these still getting pinned? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
object.__setattr__(obj, "_name", sdata._name) | ||
return obj | ||
return sdata._constructor(mgr, name=sdata.name, fastpath=True) | ||
|
||
|
||
class FrameSplitter(DataSplitter): | ||
|
@@ -1261,11 +1257,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: | |
# return sdata.iloc[:, slice_obj] | ||
mgr = sdata._mgr.get_slice(slice_obj, axis=1 - self.axis) | ||
# __finalize__ not called here, must be applied by caller if applicable | ||
|
||
# fastpath equivalent to `return sdata._constructor(mgr)` | ||
obj = type(sdata)._from_mgr(mgr) | ||
object.__setattr__(obj, "_flags", sdata._flags) | ||
return obj | ||
return sdata._constructor(mgr) | ||
|
||
|
||
def get_splitter( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
are we now doing
__finalize__
in all of the places this is called? if so, better to do it here? (and on L1260)?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, there is a comment there about this:
pandas/pandas/core/groupby/ops.py
Line 1263 in 83ea173
(from #37461)
But I don't know the reason that it was originally decided that finalize must be called by the caller. In any case I don't see a reason to not move it to
_chop
now.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#37461 OP says it is for perf to not call it on each iteration. This PR removes that motivation, so go for it!