diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 8161b5c5c2b11..4c7664833d967 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -51,7 +51,7 @@ cdef class _BaseGrouper: # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference # to a 1-d ndarray like datetime / timedelta / period. - object.__setattr__(cached_ityp, '_index_data', islider.buf) + object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() cached_ityp._cache.clear() # e.g. inferred_freq must go object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) @@ -296,7 +296,7 @@ cdef class Slider: Only handles contiguous data for now """ cdef: - ndarray values, buf + ndarray values, buf, orig_buf Py_ssize_t stride, orig_len, orig_stride char *orig_data @@ -308,6 +308,7 @@ cdef class Slider: values = values.copy() self.values = values + self.orig_buf = buf self.buf = buf self.stride = values.strides[0] @@ -315,21 +316,14 @@ cdef class Slider: self.orig_len = self.buf.shape[0] self.orig_stride = self.buf.strides[0] - self.buf.data = self.values.data - self.buf.strides[0] = self.stride - cdef move(self, int start, int end): """ For slicing """ - self.buf.data = self.values.data + self.stride * start - self.buf.shape[0] = end - start + self.buf = self.values[start:end] cdef reset(self): - - self.buf.shape[0] = self.orig_len - self.buf.data = self.orig_data - self.buf.strides[0] = self.orig_stride + self.buf = self.orig_buf class InvalidApply(Exception): @@ -359,7 +353,7 @@ def apply_frame_axis0(object frame, object f, object names, slider.move(starts[i], ends[i]) item_cache.clear() # ugh - chunk = slider.dummy + chunk = slider.frame[starts[i]:ends[i]] object.__setattr__(chunk, 'name', names[i]) try: @@ -406,7 +400,8 @@ cdef class BlockSlider: object frame, dummy, index int nblocks Slider idx_slider - list blocks + list blocks, blk_values + ndarray orig_blklocs, orig_blknos cdef: char **base_ptrs @@ -420,20 +415,27 @@ cdef class BlockSlider: self.dummy = frame[:0] self.index = self.dummy.index - self.blocks = [b.values for b in self.dummy._mgr.blocks] + # GH#35417 attributes we need to restore at each step in case + # the function modified them. + mgr = self.dummy._mgr + self.orig_blklocs = mgr.blklocs + self.orig_blknos = mgr.blknos + self.blocks = [x for x in self.dummy._mgr.blocks] - for x in self.blocks: + self.blk_values = [b.values for b in self.dummy._mgr.blocks] + + for x in self.blk_values: util.set_array_not_contiguous(x) - self.nblocks = len(self.blocks) + self.nblocks = len(self.blk_values) # See the comment in indexes/base.py about _index_data. # We need this for EA-backed indexes that have a reference to a 1-d # ndarray like datetime / timedelta / period. self.idx_slider = Slider( self.frame.index._index_data, self.dummy.index._index_data) - self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) - for i, block in enumerate(self.blocks): + self.base_ptrs = malloc(sizeof(char*) * len(self.blk_values)) + for i, block in enumerate(self.blk_values): self.base_ptrs[i] = (block).data def __dealloc__(self): @@ -444,9 +446,11 @@ cdef class BlockSlider: ndarray arr Py_ssize_t i + self._restore_blocks() + # move blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] + arr.strides[1] * start @@ -459,14 +463,25 @@ cdef class BlockSlider: self.index._engine.clear_mapping() self.index._cache.clear() # e.g. inferred_freq must go + cdef _restore_blocks(self): + """ + Ensure that we have the original blocks, blknos, and blklocs. + """ + mgr = self.dummy._mgr + mgr.blocks = self.blocks + mgr._blklocs = self.orig_blklocs + mgr._blknos = self.orig_blknos + cdef reset(self): cdef: ndarray arr Py_ssize_t i + self._restore_blocks() + # reset blocks for i in range(self.nblocks): - arr = self.blocks[i] + arr = self.blk_values[i] # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b9021b094cd7..67efd8b5dee16 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -404,6 +404,18 @@ def _data(self): # e.g. fastparquet return self._mgr + @property + def _can_use_libreduction(self) -> bool: + # groupby ops can only use libreduction fast-path if we are all-numpy + if self.index._has_complex_internals: + return False + + is_invalid = lambda x: is_extension_array_dtype(x) or x.kind in ["m", "M"] + if self.ndim == 1: + return not is_invalid(self.dtype) + else: + return not self.dtypes.apply(is_invalid).any() + # ---------------------------------------------------------------------- # Axis _stat_axis_number = 0 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bbccd22f2ae85..9e98d66ca2881 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -74,7 +74,14 @@ get_groupby, group_selection_context, ) -from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + MultiIndex, + PeriodIndex, + TimedeltaIndex, + all_indexes_same, +) import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager from pandas.core.series import Series @@ -256,16 +263,25 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) - try: - return self._python_agg_general(func, *args, **kwargs) - except (ValueError, KeyError): - # TODO: KeyError is raised in _python_agg_general, - # see see test_groupby.test_basic - result = self._aggregate_named(func, *args, **kwargs) + if isinstance( + self._selected_obj.index, (DatetimeIndex, TimedeltaIndex, PeriodIndex) + ): + # using _python_agg_general would end up incorrectly patching + # _index_data in reduction.pyx + result = self._aggregate_maybe_named(func, *args, **kwargs) + else: + try: + return self._python_agg_general(func, *args, **kwargs) + except (ValueError, KeyError): + # TODO: KeyError is raised in _python_agg_general, + # see see test_groupby.test_basic + result = self._aggregate_maybe_named(func, *args, **kwargs) - index = Index(sorted(result), name=self.grouper.names[0]) + # name setting -> test_metadata_propagation_indiv + index = self.grouper.result_index + obj = self._selected_obj ret = create_series_with_explicit_dtype( - result, index=index, dtype_if_empty=object + result, index=index, dtype_if_empty=object, name=obj.name ) if not self.as_index: # pragma: no cover @@ -469,14 +485,34 @@ def _get_index() -> Index: ) return self._reindex_output(result) - def _aggregate_named(self, func, *args, **kwargs): + def _aggregate_maybe_named(self, func, *args, **kwargs): + """ + Try the named-aggregator first, then unnamed, which better matches + what libreduction does. + """ + try: + return self._aggregate_named(func, *args, named=True, **kwargs) + except KeyError: + return self._aggregate_named(func, *args, named=False, **kwargs) + + def _aggregate_named(self, func, *args, named: bool = True, **kwargs): result = {} - for name, group in self: - group.name = name + for name, group in self: # TODO: could we have duplicate names? + if named: + group.name = name + output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") + if ( + isinstance(output, Series) + and len(output) == 1 + and name in output.index + ): + # FIXME: kludge for test_resampler_grouper.test_apply + output = output.iloc[0] + else: + raise ValueError("Must produce aggregated value") result[name] = output return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e9525f03368fa..af1f02adf5331 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -156,18 +156,13 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): result_values = None sdata: FrameOrSeries = splitter._get_sorted_data() - if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): - # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 - # if we pass EA instead of ndarray - # TODO: can we have a workaround for EAs backed by ndarray? - pass - - elif ( + if ( com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) and axis == 0 # fast_apply/libreduction doesn't allow non-numpy backed indexes - and not sdata.index._has_complex_internals + # or columns + and sdata._can_use_libreduction ): try: result_values, mutated = splitter.fast_apply(f, sdata, group_keys) @@ -609,15 +604,10 @@ def agg_series(self, obj: Series, func: F): # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) - elif is_extension_array_dtype(obj.dtype): + elif not obj._can_use_libreduction: # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider # In the datetime64tz case it would incorrectly cast to tz-naive - # TODO: can we get a performant workaround for EAs backed by ndarray? - return self._aggregate_series_pure_python(obj, func) - - elif obj.index._has_complex_internals: - # Preempt TypeError in _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9cd28974f9385..62ee5cdbf7586 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4056,7 +4056,9 @@ def _has_complex_internals(self) -> bool: Indicates if an index is not directly backed by a numpy array """ # used to avoid libreduction code paths, which raise or require conversion - return False + return isinstance(self, (ABCMultiIndex, ABCRangeIndex)) or not isinstance( + self._data, np.ndarray + ) def _is_memory_usage_qualified(self) -> bool: """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c798ae0bd4e4d..77a55bced2187 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -343,11 +343,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2f43787919faa..e5055b8262d29 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -394,11 +394,6 @@ def values(self) -> IntervalArray: """ return self._data - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a21a54e4a9be3..561402a79fa27 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1508,11 +1508,6 @@ def _get_level_number(self, level) -> int: ) from err return level - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - @cache_readonly def is_monotonic_increasing(self) -> bool: """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 900d3f9f1866b..cb3efbc26da89 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -247,11 +247,6 @@ def _simple_new(cls, values: PeriodArray, name: Label = None): def values(self): return np.asarray(self) - @property - def _has_complex_internals(self): - # used to avoid libreduction code paths, which raise or require conversion - return True - def _shallow_copy(self, values=None, name: Label = no_default): name = name if name is not no_default else self.name cache = self._cache.copy() if values is None else {} diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index db5c4af9c6f53..93761a186b804 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1002,7 +1002,6 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-34998") def test_apply_with_timezones_aware(): # GH: 27212