diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6bb9753fcea65..b63811e08e182 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,7 +157,7 @@ jobs: pytest pandas/tests/reductions/ --array-manager pytest pandas/tests/generic/test_generic.py --array-manager pytest pandas/tests/arithmetic/ --array-manager - pytest pandas/tests/groupby/aggregate/ --array-manager + pytest pandas/tests/groupby/ --array-manager pytest pandas/tests/reshape/merge --array-manager # indexing subset (temporary since other tests don't pass yet) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f4c69ea9d89db..aaf67fb1be532 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1815,6 +1815,8 @@ def count(self) -> DataFrame: ids, _, ngroups = self.grouper.group_info mask = ids != -1 + using_array_manager = isinstance(data, ArrayManager) + def hfunc(bvalues: ArrayLike) -> ArrayLike: # TODO(2DEA): reshape would not be necessary with 2D EAs if bvalues.ndim == 1: @@ -1824,6 +1826,10 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: masked = mask & ~isna(bvalues) counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + if using_array_manager: + # count_level_2d return (1, N) array for single column + # -> extract 1D array + counted = counted[0, :] return counted new_mgr = data.grouped_reduce(hfunc) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5004d1fe08a5b..d5b9a9806d8d5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -84,6 +84,7 @@ MultiIndex, ensure_index, ) +from pandas.core.internals import ArrayManager from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, @@ -214,6 +215,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # TODO: can we have a workaround for EAs backed by ndarray? pass + elif isinstance(sdata._mgr, ArrayManager): + # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0 + # for now -> relies on BlockManager internals + pass elif ( com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e0447378c4542..44d929c707c87 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -270,15 +270,30 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: ------- ArrayManager """ - # TODO ignore_failures - result_arrays = [func(arr) for arr in self.arrays] + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] + + for i, arr in enumerate(self.arrays): + try: + res = func(arr) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + result_arrays.append(res) + result_indices.append(i) if len(result_arrays) == 0: index = Index([None]) # placeholder else: index = Index(range(result_arrays[0].shape[0])) - return type(self)(result_arrays, [index, self.items]) + if ignore_failures: + columns = self.items[np.array(result_indices, dtype="int64")] + else: + columns = self.items + + return type(self)(result_arrays, [index, columns]) def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: """ diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index de8335738791d..cc036bb484ff9 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -8,6 +8,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Index, @@ -355,7 +357,8 @@ def test_groupby_function_rename(mframe): "cummax", "cummin", "cumprod", - "describe", + # TODO(ArrayManager) quantile + pytest.param("describe", marks=td.skip_array_manager_not_yet_implemented), "rank", "quantile", "diff", diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 639fe308529dc..daf5c71af7488 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -84,6 +86,7 @@ def test_apply_trivial_fail(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used def test_fast_apply(): # make sure that fast apply is correctly called # rather than raising any kind of error @@ -213,6 +216,7 @@ def test_group_apply_once_per_group2(capsys): assert result == expected +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used @pytest.mark.xfail(reason="GH-34998") def test_apply_fast_slow_identical(): # GH 31613 @@ -233,6 +237,7 @@ def fast(group): tm.assert_frame_equal(fast_df, slow_df) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) fast_apply not used @pytest.mark.parametrize( "func", [ @@ -313,6 +318,7 @@ def test_groupby_as_index_apply(df): tm.assert_index_equal(res, ind) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_apply_concat_preserve_names(three_group): grouped = three_group.groupby(["A", "B"]) @@ -1003,9 +1009,10 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-34998") -def test_apply_with_timezones_aware(): +def test_apply_with_timezones_aware(using_array_manager, request): # GH: 27212 + if not using_array_manager: + request.node.add_marker(pytest.mark.xfail(reason="GH-34998")) dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 index_no_tz = pd.DatetimeIndex(dates) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f0356ad90a3ff..a7247c2c04761 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -81,6 +83,7 @@ def get_stats(group): assert result.index.names[0] == "C" +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_basic(): cats = Categorical( @@ -276,7 +279,9 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -def test_observed(observed): +# TODO(ArrayManager) incorrect dtype for mean() +@td.skip_array_manager_not_yet_implemented +def test_observed(observed, using_array_manager): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -535,6 +540,7 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): assert False, msg +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range("2014-01-01", periods=4) @@ -600,6 +606,7 @@ def test_categorical_index(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_describe_categorical_columns(): # GH 11558 cats = CategoricalIndex( @@ -614,6 +621,7 @@ def test_describe_categorical_columns(): tm.assert_categorical_equal(result.stack().columns.values, cats.values) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_unstack_categorical(): # GH11558 (example is taken from the original issue) df = DataFrame( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cab5417e81445..598465a951e0f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -367,6 +367,7 @@ def test_mad(self, gb, gni): result = gni.mad() tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_describe(self, df, gb, gni): # describe expected_index = Index([1, 3], name="A") @@ -923,11 +924,13 @@ def test_is_monotonic_decreasing(in_vals, out_vals): # -------------------------------- +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_apply_describe_bug(mframe): grouped = mframe.groupby(level="first") grouped.describe() # it works! +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_series_describe_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) @@ -937,6 +940,7 @@ def test_series_describe_multikey(): tm.assert_series_equal(result["min"], grouped.min(), check_names=False) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) @@ -951,6 +955,7 @@ def test_series_index_name(df): assert result.index.name == "A" +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() @@ -973,6 +978,7 @@ def test_frame_describe_multikey(tsframe): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 @@ -992,6 +998,7 @@ def test_frame_describe_tupleindex(): df2.groupby("key").describe() +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_frame_describe_unstacked_format(): # GH 4792 prices = { @@ -1018,6 +1025,7 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile @pytest.mark.filterwarnings( "ignore:" "indexing past lexsort depth may impact performance:" diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index afde1daca74c1..8cbb9d2443cb2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -7,6 +7,7 @@ from pandas.compat import IS64 from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -210,6 +211,7 @@ def f(grp): tm.assert_series_equal(result, e) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_pass_args_kwargs(ts, tsframe): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) @@ -364,6 +366,7 @@ def f3(x): df2.groupby("a").apply(f3) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_attr_wrapper(ts): grouped = ts.groupby(lambda x: x.weekday()) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 9c9d1aa881890..2924348e98b56 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -8,6 +10,9 @@ ) import pandas._testing as tm +# TODO(ArrayManager) quantile +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 4956454ef2d4f..c4621d5fc0f8c 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import ( ensure_platform_int, is_timedelta64_dtype, @@ -161,8 +163,13 @@ def test_transform_broadcast(tsframe, ts): assert_fp_equal(res.xs(idx), agged[idx]) -def test_transform_axis_1(request, transformation_func): +def test_transform_axis_1(request, transformation_func, using_array_manager): # GH 36308 + if using_array_manager and transformation_func == "pct_change": + # TODO(ArrayManager) column-wise shift + request.node.add_marker( + pytest.mark.xfail(reason="ArrayManager: shift axis=1 not yet implemented") + ) warn = None if transformation_func == "tshift": warn = FutureWarning @@ -183,6 +190,8 @@ def test_transform_axis_1(request, transformation_func): tm.assert_equal(result, expected) +# TODO(ArrayManager) groupby().transform returns DataFrame backed by BlockManager +@td.skip_array_manager_not_yet_implemented def test_transform_axis_ts(tsframe): # make sure that we are setting the axes