From 048424471278e719684dd2e2ee609cf0ca732185 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 15 Aug 2020 22:24:26 +0000 Subject: [PATCH 01/42] handle dropna=False in _selected_obj, _set_result_index_ordered --- pandas/core/groupby/generic.py | 7 ++++++- pandas/core/groupby/groupby.py | 15 +++++++++++---- pandas/tests/groupby/test_groupby_dropna.py | 9 ++------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b806d9856d20f..248f18567e282 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1677,12 +1677,17 @@ def _gotitem(self, key, ndim: int, subset=None): exclusions=self.exclusions, as_index=self.as_index, observed=self.observed, + dropna=self.dropna, ) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy( - subset, selection=key, grouper=self.grouper, observed=self.observed + subset, + selection=key, + grouper=self.grouper, + observed=self.observed, + dropna=self.dropna, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0047877ef78ee..f34d06b45dbd7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -637,10 +637,12 @@ def _selected_obj(self): if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: - return self.obj[self._group_selection] - return self.obj + result = self.obj[self._group_selection] + result = self.obj else: - return self.obj[self._selection] + result = self.obj[self._selection] + + return result.dropna() if self.dropna else result def _reset_group_selection(self): """ @@ -690,7 +692,12 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + if hasattr(self, "_selected_obj"): + labels = self._selected_obj._get_axis(self.axis) + else: + labels = self.obj._get_axis(self.axis) + + result.set_axis(labels, axis=self.axis, inplace=True) return result def _dir_additions(self): diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index adf62c4723526..824ac91c86cb4 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -165,12 +165,7 @@ def test_groupby_dropna_series_by(dropna, expected): @pytest.mark.parametrize( "dropna,df_expected,s_expected", [ - pytest.param( - True, - pd.DataFrame({"B": [2, 2, 1]}), - pd.Series(data=[2, 2, 1], name="B"), - marks=pytest.mark.xfail(raises=ValueError), - ), + (True, pd.DataFrame({"B": [2, 2, 1]}), pd.Series(data=[2, 2, 1], name="B"),), ( False, pd.DataFrame({"B": [2, 2, 1, 1]}), @@ -179,7 +174,7 @@ def test_groupby_dropna_series_by(dropna, expected): ], ) def test_slice_groupby_then_transform(dropna, df_expected, s_expected): - # GH35014 + # GH35014 & GH35612 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) From a33574476afc37842b7c0ddde0b315d6565e0fe7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 16 Aug 2020 04:54:42 +0000 Subject: [PATCH 02/42] add whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 42f95d88d74ac..7cf671888c2a1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -221,7 +221,7 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) +- Bug in :meth:`SeriesGroupBy.transform` and :meth:`DataFrameGroupBy.transform` now correctly handle missing values (:issue:`35014` and :issue:`35612`) - MultiIndex From 640ec380bfd80414add5467e842c7d2222d88f73 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 16 Aug 2020 06:37:06 +0000 Subject: [PATCH 03/42] rewrote _set_result_index_ordered --- pandas/core/groupby/generic.py | 5 +++-- pandas/core/groupby/groupby.py | 16 +++++----------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 248f18567e282..226213425a6e1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -556,8 +556,9 @@ def _transform_general( if common_dtype is result.dtype: result = maybe_downcast_numeric(result, self._selected_obj.dtype) - result.name = self._selected_obj.name - result.index = self._selected_obj.index + obj = self._selected_obj.dropna() if self.dropna else self._selected_obj + result.name = obj.name + result.index = obj.index return result def _transform_fast(self, result) -> Series: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f34d06b45dbd7..d86ff0d49bef2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -637,12 +637,10 @@ def _selected_obj(self): if self._selection is None or isinstance(self.obj, Series): if self._group_selection is not None: - result = self.obj[self._group_selection] - result = self.obj + return self.obj[self._group_selection] + return self.obj else: - result = self.obj[self._selection] - - return result.dropna() if self.dropna else result + return self.obj[self._selection] def _reset_group_selection(self): """ @@ -692,12 +690,8 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - if hasattr(self, "_selected_obj"): - labels = self._selected_obj._get_axis(self.axis) - else: - labels = self.obj._get_axis(self.axis) - - result.set_axis(labels, axis=self.axis, inplace=True) + obj = self.obj.dropna() if self.dropna else self.obj + result.set_axis(obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self): From 099e30caeb092162c283a91cacb1486a69566429 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 16 Aug 2020 06:38:48 +0000 Subject: [PATCH 04/42] added dropna=False to tests reliant on that --- .../tests/groupby/transform/test_transform.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c09f35526a6bf..e946f0d61b72d 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -309,11 +309,11 @@ def test_transform_multiple(ts): def test_dispatch_transform(tsframe): df = tsframe[::5].reindex(tsframe.index) - grouped = df.groupby(lambda x: x.month) + grouped = df.groupby(lambda x: x.month, dropna=False) filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") - expected = df.groupby(lambda x: x.month).transform(fillit) + expected = df.groupby(lambda x: x.month, dropna=False).transform(fillit) tm.assert_frame_equal(filled, expected) @@ -412,10 +412,10 @@ def nsum(x): return np.nansum(x) results = [ - df.groupby("col1").transform(sum)["col2"], - df.groupby("col1")["col2"].transform(sum), - df.groupby("col1").transform(nsum)["col2"], - df.groupby("col1")["col2"].transform(nsum), + df.groupby("col1", dropna=False).transform(sum)["col2"], + df.groupby("col1", dropna=False)["col2"].transform(sum), + df.groupby("col1", dropna=False).transform(nsum)["col2"], + df.groupby("col1", dropna=False)["col2"].transform(nsum), ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -448,7 +448,9 @@ def test_groupby_transform_with_int(): ) ) with np.errstate(all="ignore"): - result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) + result = df.groupby("A", dropna=False).transform( + lambda x: (x - x.mean()) / x.std() + ) expected = DataFrame( dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) ) @@ -612,8 +614,7 @@ def test_cython_transform_series(op, args, targop): # series for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) + expected = data.groupby(labels, dropna=False).transform(targop) tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) From 8a13d068891e0f66de671c1b57ec0ddd1088ae7e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 16 Aug 2020 07:09:52 +0000 Subject: [PATCH 05/42] delete second reindexing in _transform_general --- pandas/core/groupby/generic.py | 4 +--- pandas/tests/groupby/test_grouping.py | 27 --------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 226213425a6e1..4ae7b87d10dfa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -556,9 +556,7 @@ def _transform_general( if common_dtype is result.dtype: result = maybe_downcast_numeric(result, self._selected_obj.dtype) - obj = self._selected_obj.dropna() if self.dropna else self._selected_obj - result.name = obj.name - result.index = obj.index + result.name = self._selected_obj.name return result def _transform_fast(self, result) -> Series: diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 40b4ce46e550b..e033f2129a73f 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -610,33 +610,6 @@ def test_list_grouper_with_nat(self): expected = {pd.Timestamp("2011-01-01"): 365} tm.assert_dict_equal(result.groups, expected) - @pytest.mark.parametrize( - "func,expected", - [ - ( - "transform", - pd.Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), - ), - ( - "agg", - pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), - ), - ( - "apply", - pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), - ), - ], - ) - def test_evaluate_with_empty_groups(self, func, expected): - # 26208 - # test transform'ing empty groups - # (not testing other agg fns, because they return - # different index objects. - df = pd.DataFrame({1: [], 2: []}) - g = df.groupby(1) - result = getattr(g[2], func)(lambda x: x) - tm.assert_series_equal(result, expected) - def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = pd.Series([], name="name", dtype="float64") From 394feb6a6c75d6c755ea8e808d30f8e0ee92fea8 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 16 Aug 2020 07:36:01 +0000 Subject: [PATCH 06/42] restore + change test_evaluate_with_empty_groups --- pandas/tests/groupby/test_grouping.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e033f2129a73f..6fd928cdd8988 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -610,6 +610,33 @@ def test_list_grouper_with_nat(self): expected = {pd.Timestamp("2011-01-01"): 365} tm.assert_dict_equal(result.groups, expected) + @pytest.mark.parametrize( + "func,expected", + [ + ( + "transform", + pd.Series(name=2, dtype=np.float64, index=pd.Index([], dtype="object")), + ), + ( + "agg", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), + ( + "apply", + pd.Series(name=2, dtype=np.float64, index=pd.Float64Index([], name=1)), + ), + ], + ) + def test_evaluate_with_empty_groups(self, func, expected): + # 26208 + # test transform'ing empty groups + # (not testing other agg fns, because they return + # different index objects. + df = pd.DataFrame({1: [], 2: []}) + g = df.groupby(1) + result = getattr(g[2], func)(lambda x: x) + tm.assert_series_equal(result, expected) + def test_groupby_empty(self): # https://github.com/pandas-dev/pandas/issues/27190 s = pd.Series([], name="name", dtype="float64") From e1cafd46451e0939f00ed6979ba05a09e0984ad7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 17 Aug 2020 06:45:40 +0000 Subject: [PATCH 07/42] remove calls to obj.dropna() --- pandas/core/groupby/generic.py | 9 +++++++-- pandas/core/groupby/groupby.py | 3 +-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4ae7b87d10dfa..ae7d2bf00176b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -538,14 +538,19 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - results.append(klass(res, index=group.index)) + indexer = self._get_index(name) if self.dropna else group.index + results.append(klass(res, index=indexer)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat concatenated = concat(results) - result = self._set_result_index_ordered(concatenated) + + if not self.dropna: + result = self._set_result_index_ordered(concatenated) + else: + result = concatenated.sort_index() else: result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d86ff0d49bef2..0047877ef78ee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -690,8 +690,7 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - obj = self.obj.dropna() if self.dropna else self.obj - result.set_axis(obj._get_axis(self.axis), axis=self.axis, inplace=True) + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self): From 0df329ced381de6084203fcd315f6b116995ea40 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 17 Aug 2020 07:29:32 +0000 Subject: [PATCH 08/42] separate tests for dataframe/series slices --- pandas/tests/groupby/test_groupby_dropna.py | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 824ac91c86cb4..5b53ad837b768 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -163,32 +163,40 @@ def test_groupby_dropna_series_by(dropna, expected): @pytest.mark.parametrize( - "dropna,df_expected,s_expected", + "dropna,expected", [ - (True, pd.DataFrame({"B": [2, 2, 1]}), pd.Series(data=[2, 2, 1], name="B"),), - ( - False, - pd.DataFrame({"B": [2, 2, 1, 1]}), - pd.Series(data=[2, 2, 1, 1], name="B"), - ), + (True, pd.DataFrame({"B": [2, 2, 1]})), + (False, pd.DataFrame({"B": [2, 2, 1, 1]})), ], ) -def test_slice_groupby_then_transform(dropna, df_expected, s_expected): +def test_groupby_dataframe_slice_then_transform(dropna, expected): # GH35014 & GH35612 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) gb = df.groupby("A", dropna=dropna) - res = gb.transform(len) - tm.assert_frame_equal(res, df_expected) + result = gb.transform(len) + tm.assert_frame_equal(result, expected) + + result = gb[["B"]].transform(len) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "dropna,expected", + [ + (True, pd.Series(data=[2, 2, 1], name="B")), + (False, pd.Series(data=[2, 2, 1, 1], name="B")), + ], +) +def test_groupby_series_slice_then_transform_(dropna, expected): + # GH35014 & GH35612 - gb_slice = gb[["B"]] - res = gb_slice.transform(len) - tm.assert_frame_equal(res, df_expected) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=dropna) - gb_slice = gb["B"] - res = gb["B"].transform(len) - tm.assert_series_equal(res, s_expected) + result = gb["B"].transform(len) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( From 77e7fc7fe65612364f4fe5eef23f4f27a0531766 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 21 Aug 2020 05:58:51 +0000 Subject: [PATCH 09/42] fix series indexing --- pandas/core/groupby/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ae7d2bf00176b..94bba958f8f52 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -551,6 +551,7 @@ def _transform_general( result = self._set_result_index_ordered(concatenated) else: result = concatenated.sort_index() + result.index = self._selected_obj.index[result.index.asi8] else: result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if From 16544ea5d7ec1c0ad4555106b0304ca2025432b6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 21 Aug 2020 07:06:21 +0000 Subject: [PATCH 10/42] fix DataFrameGroupBy._transform_general --- pandas/core/groupby/generic.py | 14 ++++++++++---- pandas/tests/groupby/test_apply.py | 2 +- .../tests/groupby/transform/test_transform.py | 19 +++++++++---------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 94bba958f8f52..01cfaa429daa3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1409,7 +1409,9 @@ def _transform_general( else: fast_path, slow_path = self._define_paths(func, *args, **kwargs) + has_nan = False for name, group in gen: + has_nan = has_nan or isna(name) object.__setattr__(group, "name", name) if maybe_use_numba(engine): @@ -1418,9 +1420,8 @@ def _transform_general( if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_func # Return the result as a DataFrame for concatenation later - res = self.obj._constructor( - res, index=group.index, columns=group.columns - ) + indexer = self._get_index(name) if self.dropna else group.index + res = self.obj._constructor(res, index=indexer, columns=group.columns) else: # Try slow path and fast path. try: @@ -1459,7 +1460,12 @@ def _transform_general( other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) - return self._set_result_index_ordered(concatenated) + if not self.dropna or not has_nan: + return self._set_result_index_ordered(concatenated) + else: + concatenated.sort_index(inplace=True) + concatenated.index = obj.index[concatenated.index.asi8] + return concatenated @Substitution(klass="DataFrame") @Appender(_transform_template) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ee38722ffb8ce..388a0ad7e7041 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -404,7 +404,7 @@ def trans2(group): def test_apply_transform(ts): - grouped = ts.groupby(lambda x: x.month) + grouped = ts.groupby(lambda x: x.month, dropna=False) result = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index e946f0d61b72d..c09f35526a6bf 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -309,11 +309,11 @@ def test_transform_multiple(ts): def test_dispatch_transform(tsframe): df = tsframe[::5].reindex(tsframe.index) - grouped = df.groupby(lambda x: x.month, dropna=False) + grouped = df.groupby(lambda x: x.month) filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") - expected = df.groupby(lambda x: x.month, dropna=False).transform(fillit) + expected = df.groupby(lambda x: x.month).transform(fillit) tm.assert_frame_equal(filled, expected) @@ -412,10 +412,10 @@ def nsum(x): return np.nansum(x) results = [ - df.groupby("col1", dropna=False).transform(sum)["col2"], - df.groupby("col1", dropna=False)["col2"].transform(sum), - df.groupby("col1", dropna=False).transform(nsum)["col2"], - df.groupby("col1", dropna=False)["col2"].transform(nsum), + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -448,9 +448,7 @@ def test_groupby_transform_with_int(): ) ) with np.errstate(all="ignore"): - result = df.groupby("A", dropna=False).transform( - lambda x: (x - x.mean()) / x.std() - ) + result = df.groupby("A").transform(lambda x: (x - x.mean()) / x.std()) expected = DataFrame( dict(B=np.nan, C=Series([-1, 0, 1, -1, 0, 1], dtype="float64")) ) @@ -614,7 +612,8 @@ def test_cython_transform_series(op, args, targop): # series for data in [s, s_missing]: - expected = data.groupby(labels, dropna=False).transform(targop) + # print(data.head()) + expected = data.groupby(labels).transform(targop) tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) From 46e5f6617b6a5ca14e49e288dada03e687e9d251 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 21 Aug 2020 19:49:10 +0000 Subject: [PATCH 11/42] move DataFrameGroupBy._transform_general logic to _set_result_index_ordered --- pandas/core/groupby/generic.py | 11 ++++------- pandas/core/groupby/groupby.py | 5 ++++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 01cfaa429daa3..9877f7dde4b37 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1397,6 +1397,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs ): + """ + Transform with a non-str `func`. + """ from pandas.core.reshape.concat import concat applied = [] @@ -1455,17 +1458,11 @@ def _transform_general( applied.append(r) else: applied.append(res) - concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False) - if not self.dropna or not has_nan: - return self._set_result_index_ordered(concatenated) - else: - concatenated.sort_index(inplace=True) - concatenated.index = obj.index[concatenated.index.asi8] - return concatenated + return self._set_result_index_ordered(concatenated) @Substitution(klass="DataFrame") @Appender(_transform_template) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0047877ef78ee..9702173077b84 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -690,7 +690,10 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + result_idx, obj_idx = result.index, self.obj._get_axis(self.axis) + intersection = result_idx.intersection(obj_idx) + indexer = obj_idx if intersection.empty else intersection + result.set_axis(indexer, axis=self.axis, inplace=True) return result def _dir_additions(self): From 6fca785c3f100706fd5b64ed21f2e292597da9f2 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 21 Aug 2020 22:53:24 +0000 Subject: [PATCH 12/42] handle edge case (datetime index, no NaNs) --- pandas/core/groupby/generic.py | 13 +++++++------ pandas/tests/groupby/test_apply.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9877f7dde4b37..58a6b4c75027a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -547,11 +547,14 @@ def _transform_general( concatenated = concat(results) - if not self.dropna: - result = self._set_result_index_ordered(concatenated) - else: + if self.dropna: result = concatenated.sort_index() - result.index = self._selected_obj.index[result.index.asi8] + if len(result.index) < len(self._selected_obj.index): + result.index = self._selected_obj.index[result.index.asi8] + else: + result.index = self._selected_obj.index + else: + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if @@ -1412,9 +1415,7 @@ def _transform_general( else: fast_path, slow_path = self._define_paths(func, *args, **kwargs) - has_nan = False for name, group in gen: - has_nan = has_nan or isna(name) object.__setattr__(group, "name", name) if maybe_use_numba(engine): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 388a0ad7e7041..ee38722ffb8ce 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -404,7 +404,7 @@ def trans2(group): def test_apply_transform(ts): - grouped = ts.groupby(lambda x: x.month, dropna=False) + grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x * 2) expected = grouped.transform(lambda x: x * 2) tm.assert_series_equal(result, expected) From 269516bf024342dfe5f9e993c476a847f612bf53 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 4 Sep 2020 16:03:22 +0000 Subject: [PATCH 13/42] move logic to BaseGrouper and _set_index_ordered --- pandas/core/groupby/generic.py | 18 +++++------------- pandas/core/groupby/groupby.py | 12 ++++++++---- pandas/core/groupby/grouper.py | 4 +++- pandas/core/groupby/ops.py | 10 ++++++++-- .../tests/groupby/transform/test_transform.py | 16 ++++++++-------- 5 files changed, 32 insertions(+), 28 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f5ca52604d0b5..274bd2172c4f2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -540,23 +540,14 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - indexer = self._get_index(name) if self.dropna else group.index - results.append(klass(res, index=indexer)) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat concatenated = concat(results) - - if self.dropna: - result = concatenated.sort_index() - if len(result.index) < len(self._selected_obj.index): - result.index = self._selected_obj.index[result.index.asi8] - else: - result.index = self._selected_obj.index - else: - result = self._set_result_index_ordered(concatenated) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if @@ -1345,8 +1336,9 @@ def _transform_general( if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_func # Return the result as a DataFrame for concatenation later - indexer = self._get_index(name) if self.dropna else group.index - res = self.obj._constructor(res, index=indexer, columns=group.columns) + res = self.obj._constructor( + res, index=group.index, columns=group.columns + ) else: # Try slow path and fast path. try: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a5d2054f0f07d..c812a6d799517 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -691,10 +691,14 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - result_idx, obj_idx = result.index, self.obj._get_axis(self.axis) - intersection = result_idx.intersection(obj_idx) - indexer = obj_idx if intersection.empty else intersection - result.set_axis(indexer, axis=self.axis, inplace=True) + if ( + self.dropna + and self.axis == 0 + and len(result.index) < len(self._selected_obj.index) + ): + result.index = self._selected_obj.index[result.index.asi8] + else: + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3017521c6a065..e138154dd6607 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -815,7 +815,9 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper( + group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna + ) return grouper, exclusions, obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4dd5b7f30e7f0..09c12a8e79ea8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -87,6 +87,7 @@ def __init__( group_keys: bool = True, mutated: bool = False, indexer: Optional[np.ndarray] = None, + dropna: bool = False, ): assert isinstance(axis, Index), axis @@ -97,6 +98,7 @@ def __init__( self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + self.dropna = dropna @property def groupings(self) -> List["grouper.Grouping"]: @@ -124,8 +126,12 @@ def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() - for key, (i, group) in zip(keys, splitter): - yield key, group + if self.dropna: + for key, (i, group) in zip(keys, splitter): + yield key, group.dropna() + else: + for key, (i, group) in zip(keys, splitter): + yield key, group def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c09f35526a6bf..5dd526aa625c7 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -148,8 +148,9 @@ def test_transform_broadcast(tsframe, ts): for col in tsframe: assert_fp_equal(res[col], agged[col]) + print(pd.isna(tsframe)) # group columns - grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1, dropna=False) result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) tm.assert_index_equal(result.columns, tsframe.columns) @@ -313,7 +314,7 @@ def test_dispatch_transform(tsframe): filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") - expected = df.groupby(lambda x: x.month).transform(fillit) + expected = df.groupby(lambda x: x.month, dropna=False).transform(fillit) tm.assert_frame_equal(filled, expected) @@ -412,10 +413,10 @@ def nsum(x): return np.nansum(x) results = [ - df.groupby("col1").transform(sum)["col2"], - df.groupby("col1")["col2"].transform(sum), - df.groupby("col1").transform(nsum)["col2"], - df.groupby("col1")["col2"].transform(nsum), + df.groupby("col1", dropna=False).transform(sum)["col2"], + df.groupby("col1", dropna=False)["col2"].transform(sum), + df.groupby("col1", dropna=False).transform(nsum)["col2"], + df.groupby("col1", dropna=False)["col2"].transform(nsum), ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -612,8 +613,7 @@ def test_cython_transform_series(op, args, targop): # series for data in [s, s_missing]: - # print(data.head()) - expected = data.groupby(labels).transform(targop) + expected = data.groupby(labels, dropna=False).transform(targop) tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) From 2ec491d79f243bf0a9b4f6b023d38d4c7d1b0652 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 4 Sep 2020 17:49:06 +0000 Subject: [PATCH 14/42] add dropna args to tests --- pandas/tests/groupby/transform/test_transform.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 5dd526aa625c7..c0ed68ff2b510 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -148,7 +148,6 @@ def test_transform_broadcast(tsframe, ts): for col in tsframe: assert_fp_equal(res[col], agged[col]) - print(pd.isna(tsframe)) # group columns grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1, dropna=False) result = grouped.transform(np.mean) @@ -182,7 +181,7 @@ def test_transform_axis(tsframe): tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) + grouped = ts.groupby(lambda x: x.weekday(), axis=1, dropna=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) @@ -195,7 +194,7 @@ def test_transform_axis(tsframe): tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) + grouped = ts.groupby(lambda x: x.weekday(), axis=1, dropna=False) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) From 249fc2ac3ae13473cea5bd1966e78bc221c7fc37 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 4 Sep 2020 18:22:55 +0000 Subject: [PATCH 15/42] add dropna=False arg to test --- pandas/tests/indexing/multiindex/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 853b92ea91274..73f0069e07f5f 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -226,7 +226,7 @@ def test_multiindex_assignment(self): columns=col_names, ) df = df.set_index(index_cols).sort_index() - grp = df.groupby(level=index_cols[:4]) + grp = df.groupby(level=index_cols[:4], dropna=False) df["new_col"] = np.nan f_index = np.arange(5) From b1cafad684296fbb74f030c0efa239a062a0e294 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 11 Sep 2020 05:18:31 +0000 Subject: [PATCH 16/42] feedback --- pandas/core/groupby/ops.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 09c12a8e79ea8..f0858a29061df 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -126,12 +126,8 @@ def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() - if self.dropna: - for key, (i, group) in zip(keys, splitter): - yield key, group.dropna() - else: - for key, (i, group) in zip(keys, splitter): - yield key, group + for key, (i, group) in zip(keys, splitter): + yield key, group.dropna() if self.dropna else group def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info From 9abc8c4d917261e5fd238eec5d5dfea25f4c079b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 11 Sep 2020 05:40:26 +0000 Subject: [PATCH 17/42] revert change to test --- pandas/tests/groupby/transform/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c0ed68ff2b510..0ba603d43e621 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -149,7 +149,7 @@ def test_transform_broadcast(tsframe, ts): assert_fp_equal(res[col], agged[col]) # group columns - grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1, dropna=False) + grouped = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = grouped.transform(np.mean) tm.assert_index_equal(result.index, tsframe.index) tm.assert_index_equal(result.columns, tsframe.columns) From c63a24c46004da6bc4b3feadac9eaf21a40cac6c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 11 Sep 2020 13:42:14 +0000 Subject: [PATCH 18/42] feedback --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c16da7ece9386..2d91389805473 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -696,7 +696,7 @@ def _set_result_index_ordered(self, result): and self.axis == 0 and len(result.index) < len(self._selected_obj.index) ): - result.index = self._selected_obj.index[result.index.asi8] + pass else: result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index db7143c43d7a4..08d198abe2ae3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -127,7 +127,7 @@ def get_iterator(self, data: FrameOrSeries, axis: int = 0): splitter = self._get_splitter(data, axis=axis) keys = self._get_group_keys() for key, (i, group) in zip(keys, splitter): - yield key, group.dropna() if self.dropna else group + yield key, group def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info From 9791e1ecb1795dc2a00ab15dce1c94b74011921f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 11 Sep 2020 13:44:55 +0000 Subject: [PATCH 19/42] revert changes to test_transform --- .../tests/groupby/transform/test_transform.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 0ba603d43e621..c09f35526a6bf 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -181,7 +181,7 @@ def test_transform_axis(tsframe): tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1, dropna=False) + grouped = ts.groupby(lambda x: x.weekday(), axis=1) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) @@ -194,7 +194,7 @@ def test_transform_axis(tsframe): tm.assert_frame_equal(result, expected) ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1, dropna=False) + grouped = ts.groupby(lambda x: x.weekday(), axis=1) result = ts - grouped.transform("mean") expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) tm.assert_frame_equal(result, expected) @@ -313,7 +313,7 @@ def test_dispatch_transform(tsframe): filled = grouped.fillna(method="pad") fillit = lambda x: x.fillna(method="pad") - expected = df.groupby(lambda x: x.month, dropna=False).transform(fillit) + expected = df.groupby(lambda x: x.month).transform(fillit) tm.assert_frame_equal(filled, expected) @@ -412,10 +412,10 @@ def nsum(x): return np.nansum(x) results = [ - df.groupby("col1", dropna=False).transform(sum)["col2"], - df.groupby("col1", dropna=False)["col2"].transform(sum), - df.groupby("col1", dropna=False).transform(nsum)["col2"], - df.groupby("col1", dropna=False)["col2"].transform(nsum), + df.groupby("col1").transform(sum)["col2"], + df.groupby("col1")["col2"].transform(sum), + df.groupby("col1").transform(nsum)["col2"], + df.groupby("col1")["col2"].transform(nsum), ] for result in results: tm.assert_series_equal(result, expected, check_names=False) @@ -612,7 +612,8 @@ def test_cython_transform_series(op, args, targop): # series for data in [s, s_missing]: - expected = data.groupby(labels, dropna=False).transform(targop) + # print(data.head()) + expected = data.groupby(labels).transform(targop) tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) From 21a6fbbc70d198978958ba5554843deca870ff4c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 11 Sep 2020 13:46:42 +0000 Subject: [PATCH 20/42] revert changes to test --- pandas/tests/indexing/multiindex/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 73f0069e07f5f..853b92ea91274 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -226,7 +226,7 @@ def test_multiindex_assignment(self): columns=col_names, ) df = df.set_index(index_cols).sort_index() - grp = df.groupby(level=index_cols[:4], dropna=False) + grp = df.groupby(level=index_cols[:4]) df["new_col"] = np.nan f_index = np.arange(5) From 239e16a3d3777c80451c7a8e5d61af4dfcd4b142 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 12 Sep 2020 17:15:35 +0000 Subject: [PATCH 21/42] feedback --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2d91389805473..6e216ec20fd6c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -696,7 +696,7 @@ def _set_result_index_ordered(self, result): and self.axis == 0 and len(result.index) < len(self._selected_obj.index) ): - pass + result.index = self._selected_obj.index[result.index] else: result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result From 0cdea22618723e4f4e2f1a29f98eab8084db9183 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 12 Sep 2020 17:42:44 +0000 Subject: [PATCH 22/42] add non-RangeIndex test cases --- pandas/tests/groupby/test_groupby_dropna.py | 51 +++++++++++++-------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 92530f3c3e416..56d4bd23b025c 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -163,39 +163,52 @@ def test_groupby_dropna_series_by(dropna, expected): @pytest.mark.parametrize( - "dropna,expected", + "dropna,input_index,expected_data,expected_index", [ - (True, pd.DataFrame({"B": [2, 2, 1]})), - (False, pd.DataFrame({"B": [2, 2, 1, 1]})), + (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)), + (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")), + ( + True, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), + {"B": [2, 2, 1]}, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"] + ), + ), + (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)), + (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")), + ( + False, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), + {"B": [2, 2, 1, 1]}, + pd.MultiIndex.from_tuples( + [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] + ), + ), ], ) -def test_groupby_dataframe_slice_then_transform(dropna, expected): +def test_groupby_dataframe_slice_then_transform( + dropna, input_index, expected_data, expected_index +): # GH35014 & GH35612 - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index) gb = df.groupby("A", dropna=dropna) result = gb.transform(len) + expected = pd.DataFrame(expected_data, index=expected_index) tm.assert_frame_equal(result, expected) result = gb[["B"]].transform(len) + expected = pd.DataFrame(expected_data, index=expected_index) tm.assert_frame_equal(result, expected) - -@pytest.mark.parametrize( - "dropna,expected", - [ - (True, pd.Series(data=[2, 2, 1], name="B")), - (False, pd.Series(data=[2, 2, 1, 1], name="B")), - ], -) -def test_groupby_series_slice_then_transform_(dropna, expected): - # GH35014 & GH35612 - - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) - gb = df.groupby("A", dropna=dropna) - result = gb["B"].transform(len) + expected = pd.Series(expected_data["B"], index=expected_index, name="B") tm.assert_series_equal(result, expected) From ca2f898a082fc559aecd79b761ca8f37e8ee703d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 18 Sep 2020 17:04:09 +0000 Subject: [PATCH 23/42] add comments/clean up _set_result_index_ordered --- pandas/core/groupby/groupby.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 85d458ccb8496..ef9c3b6c28c2d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -689,12 +689,17 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - if ( + # result.index is a standard index => may need to restore original index + + if ( # check if rows were dropped self.dropna - and self.axis == 0 - and len(result.index) < len(self._selected_obj.index) + and not self.axis # if self.axis == 1 rows are never dropped + and len(result) < len(self._selected_obj) # rows dropped iff NaNs present ): - result.index = self._selected_obj.index[result.index] + # use result.index to select from index of original object + original_index = self._selected_obj.index[result.index] + + result.index = original_index else: result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result From e15df1acd5d338a4a6a081671313f58a8db507bc Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 19 Sep 2020 05:16:32 +0000 Subject: [PATCH 24/42] revert accidental change to _transform_general signature --- pandas/core/groupby/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ce8485e724942..862fa3a27d8df 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1277,9 +1277,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self._reindex_output(result) - def _transform_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _transform_general(self, func, *args, **kwargs): """ Transform with a non-str `func`. """ From 342540f2b2e31f5f861d7ad71203f26844be7b58 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 19 Sep 2020 06:28:56 +0000 Subject: [PATCH 25/42] rewrite _set_result_index_ordered using auxiliary method --- pandas/core/groupby/groupby.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ef9c3b6c28c2d..d98512b51fb07 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -678,7 +678,7 @@ def _set_group_selection(self): self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - def _set_result_index_ordered(self, result): + def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: # set the result index on the passed values object and # return the new object, xref 8046 @@ -689,19 +689,22 @@ def _set_result_index_ordered(self, result): result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) + return self._restore_grouper_index(result) + + def _restore_grouper_index(self, result: FrameOrSeries) -> FrameOrSeries: + # GH 35612 # result.index is a standard index => may need to restore original index - if ( # check if rows were dropped - self.dropna - and not self.axis # if self.axis == 1 rows are never dropped - and len(result) < len(self._selected_obj) # rows dropped iff NaNs present - ): + rows_dropped: bool = len(result) < len(self._selected_obj) + # TODO: address case of axis==1, dropped columns + + if self.dropna and not self.axis and rows_dropped: # use result.index to select from index of original object original_index = self._selected_obj.index[result.index] - result.index = original_index else: result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result def _dir_additions(self): From 90e687bd29c5e0473b6f9c7ffc23d6868a4f54f6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 19 Sep 2020 06:29:26 +0000 Subject: [PATCH 26/42] rewrite whatsnew note --- doc/source/whatsnew/v1.2.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5ed213ceebc32..8c1c2d908fc28 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -298,7 +298,8 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`SeriesGroupBy.transform` and :meth:`DataFrameGroupBy.transform` now correctly handle missing values (:issue:`35014` and :issue:`35612`) +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) +- Bug in :class:`pd.Grouper` now correctly propagates `dropna` argument and :meth:`DataFrameGroupBy.transform` now correctly handle missing values for `dropna=True`(:issue:`35612`) - MultiIndex From 531414fa6d0834b314b580c6ec44099bf32a3ac1 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 21 Sep 2020 22:32:14 +0000 Subject: [PATCH 27/42] feedback --- pandas/core/groupby/groupby.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d98512b51fb07..d86e31fd8d85e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -689,23 +689,27 @@ def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: result.set_axis(index, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - return self._restore_grouper_index(result) + def restore_grouper_index(result): + # xref 35612 + # result.index is a standard index => may need to restore original index - def _restore_grouper_index(self, result: FrameOrSeries) -> FrameOrSeries: - # GH 35612 - # result.index is a standard index => may need to restore original index + rows_dropped: bool = len(result) < len(self._selected_obj) + # TODO: address case of axis==1, dropped columns - rows_dropped: bool = len(result) < len(self._selected_obj) - # TODO: address case of axis==1, dropped columns + if self.dropna and self.axis == 0 and rows_dropped: + # use result.index to select from index of original object + original_index = self._selected_obj.index[result.index] + result.index = original_index + else: + result.set_axis( + self.obj._get_axis(self.axis), axis=self.axis, inplace=True + ) - if self.dropna and not self.axis and rows_dropped: - # use result.index to select from index of original object - original_index = self._selected_obj.index[result.index] - result.index = original_index - else: - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result - return result + return restore_grouper_index(result) + + return self._restore_grouper_index(result) def _dir_additions(self): return self.obj._dir_additions() | self._apply_allowlist From 2fcfda044cae80b4b6fa49fcd52978baad3c3e87 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 22 Sep 2020 02:03:22 +0000 Subject: [PATCH 28/42] rename variables (possibly) for clarity --- pandas/core/groupby/groupby.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d86e31fd8d85e..45d1f38af4643 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -678,7 +678,9 @@ def _set_group_selection(self): self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: + def _set_result_index_ordered( + self, factorized_result: FrameOrSeries + ) -> FrameOrSeries: # set the result index on the passed values object and # return the new object, xref 8046 @@ -686,30 +688,28 @@ def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: # shortcut if we have an already ordered grouper if not self.grouper.is_monotonic: index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) - result.set_axis(index, axis=self.axis, inplace=True) - result = result.sort_index(axis=self.axis) + factorized_result.set_axis(index, axis=self.axis, inplace=True) + factorized_result = factorized_result.sort_index(axis=self.axis) - def restore_grouper_index(result): + def restore_grouper_index(factorized_result): # xref 35612 - # result.index is a standard index => may need to restore original index + # factorized_result has a standard index => restore original index - rows_dropped: bool = len(result) < len(self._selected_obj) + rows_dropped: bool = len(factorized_result) < len(self._selected_obj) # TODO: address case of axis==1, dropped columns if self.dropna and self.axis == 0 and rows_dropped: # use result.index to select from index of original object - original_index = self._selected_obj.index[result.index] - result.index = original_index + original_index = self._selected_obj.index[factorized_result.index] + factorized_result.index = original_index else: - result.set_axis( + factorized_result.set_axis( self.obj._get_axis(self.axis), axis=self.axis, inplace=True ) - return result - - return restore_grouper_index(result) + return factorized_result - return self._restore_grouper_index(result) + return restore_grouper_index(factorized_result) def _dir_additions(self): return self.obj._dir_additions() | self._apply_allowlist From 62caeb616cb4814cc5b2de36997356a309820946 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 26 Sep 2020 15:16:43 +0000 Subject: [PATCH 29/42] simplify _set_result_index_ordered --- pandas/core/groupby/groupby.py | 37 ++++++++++++---------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fde772934d096..d09cad86e5712 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -679,38 +679,27 @@ def _set_group_selection(self): self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - def _set_result_index_ordered( - self, factorized_result: FrameOrSeries - ) -> FrameOrSeries: + def _set_result_index_ordered(self, result): # set the result index on the passed values object and # return the new object, xref 8046 # the values/counts are repeated according to the group index # shortcut if we have an already ordered grouper if not self.grouper.is_monotonic: - index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) - factorized_result.set_axis(index, axis=self.axis, inplace=True) - factorized_result = factorized_result.sort_index(axis=self.axis) - - def restore_grouper_index(factorized_result): - # xref 35612 - # factorized_result has a standard index => restore original index - - rows_dropped: bool = len(factorized_result) < len(self._selected_obj) - # TODO: address case of axis==1, dropped columns - - if self.dropna and self.axis == 0 and rows_dropped: - # use result.index to select from index of original object - original_index = self._selected_obj.index[factorized_result.index] - factorized_result.index = original_index - else: - factorized_result.set_axis( - self.obj._get_axis(self.axis), axis=self.axis, inplace=True - ) + # row order is scrambled => sort the rows by position in original index + original_positions = Index( + np.concatenate(self._get_indices(self.grouper.result_index)) + ) + result.set_axis(original_positions, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) - return factorized_result + if len(original_positions) < len(self._selected_obj): + sorted_indexer = result.index + result.index = self._selected_obj.index[sorted_indexer] + return result - return restore_grouper_index(factorized_result) + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result def _dir_additions(self): return self.obj._dir_additions() | self._apply_allowlist From deb1b09056c07889845735c80a7bc9aacdad117f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 26 Sep 2020 15:19:35 +0000 Subject: [PATCH 30/42] minimize diff --- pandas/core/groupby/generic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f7624dfdd0ca4..80085b4cf01b7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1295,9 +1295,6 @@ def _wrap_applied_output_series( return self._reindex_output(result) def _transform_general(self, func, *args, **kwargs): - """ - Transform with a non-str `func`. - """ from pandas.core.reshape.concat import concat applied = [] @@ -1340,6 +1337,7 @@ def _transform_general(self, func, *args, **kwargs): applied.append(r) else: applied.append(res) + concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 concatenated = concat(applied, axis=self.axis, verify_integrity=False) From b6fd41cc82738c0333a166790c1fedabda8ea030 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 5 Oct 2020 04:19:06 +0000 Subject: [PATCH 31/42] add type hints to _set_result_index_ordered --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6334829342fb9..efea76ff6ee6a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -679,7 +679,7 @@ def _set_group_selection(self): self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - def _set_result_index_ordered(self, result): + def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: # set the result index on the passed values object and # return the new object, xref 8046 From 983bb8ef11ef6eaae8b9b5114708d3befc130a53 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 9 Oct 2020 17:04:03 +0000 Subject: [PATCH 32/42] revert type hints --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index efea76ff6ee6a..6334829342fb9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -679,7 +679,7 @@ def _set_group_selection(self): self._group_selection = ax.difference(Index(groupers), sort=False).tolist() self._reset_cache("_selected_obj") - def _set_result_index_ordered(self, result: FrameOrSeries) -> FrameOrSeries: + def _set_result_index_ordered(self, result): # set the result index on the passed values object and # return the new object, xref 8046 From 9e6a1302e03c71b939ff451be7b587262f3340db Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 30 Oct 2020 14:13:02 +0000 Subject: [PATCH 33/42] BUG: fix merge mistake --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6c74e1521eeeb..5a5815973f2b7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -615,7 +615,7 @@ def test_list_grouper_with_nat(self): [ ( "transform", - Series(name=2, dtype=np.float64, index=pd.RangeIndex(0, 0, 1)), + Series(name=2, dtype=np.float64, index=pd.Index([])), ), ( "agg", From 8be535c055ac17b7ff6be64e67957d515d008266 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 31 Oct 2020 17:30:10 +0000 Subject: [PATCH 34/42] feedback: rewrite _set_result_index_ordered --- pandas/core/groupby/groupby.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 07cbfcb68d15d..f25222d9fe7ed 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -717,20 +717,27 @@ def _set_result_index_ordered( # the values/counts are repeated according to the group index # shortcut if we have an already ordered grouper - if not self.grouper.is_monotonic: - # row order is scrambled => sort the rows by position in original index - original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) - ) - result.set_axis(original_positions, axis=self.axis, inplace=True) - result = result.sort_index(axis=self.axis) + if self.grouper.is_monotonic: + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + return result + + # row order is scrambled => sort the rows by position in original index + original_positions = Index( + np.concatenate(self._get_indices(self.grouper.result_index)) + ) + result.set_axis(original_positions, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) + + rows_dropped = len(result.index) < len(self._selected_obj) - if len(original_positions) < len(self._selected_obj): - sorted_indexer = result.index - result.index = self._selected_obj.index[sorted_indexer] - return result + # get index by slicing original index according to original positions + # slice drops attrs => use set_axis when no rows were dropped + if rows_dropped: + sorted_indexer = result.index + result.index = self._selected_obj.index[sorted_indexer] + else: + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result def _dir_additions(self) -> Set[str]: From 85d2165fc88f9eedafdfb5e779ab43fe6473ce43 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 31 Oct 2020 17:59:30 +0000 Subject: [PATCH 35/42] CI: fix pd namespace usage --- pandas/tests/groupby/test_grouping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index eb03ef06a93e3..c8ea77e17d86d 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -613,7 +613,7 @@ def test_list_grouper_with_nat(self): [ ( "transform", - Series(name=2, dtype=np.float64, index=pd.Index([])), + Series(name=2, dtype=np.float64, index=Index([])), ), ( "agg", From 1e7ab914d546ec768c49931d626a47d2469aa092 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 5 Nov 2020 17:41:59 +0000 Subject: [PATCH 36/42] DOC: fix use of pd namespace in whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b1470f0657f82..010ee3019f477 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -470,7 +470,7 @@ Missing ^^^^^^^ - Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) -- Bug in :class:`pd.Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handle missing values for ``dropna=True``(:issue:`35612`) +- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handle missing values for ``dropna=True``(:issue:`35612`) - MultiIndex From 96b5af4d8504a316f577fd162879469473fbbbae Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 5 Nov 2020 17:42:51 +0000 Subject: [PATCH 37/42] DOC: fix typo in whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 010ee3019f477..269a0ea296bf4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -470,7 +470,7 @@ Missing ^^^^^^^ - Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) -- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handle missing values for ``dropna=True``(:issue:`35612`) +- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True``(:issue:`35612`) - MultiIndex From f5a163563796372397b71b0c32c25b9e010b5b15 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 07:56:17 +0000 Subject: [PATCH 38/42] REF (feedback): _set_result_index_ordered --- pandas/core/groupby/groupby.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dd3b4945f1ea1..653dde35542df 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -716,28 +716,21 @@ def _set_result_index_ordered( # set the result index on the passed values object and # return the new object, xref 8046 - # the values/counts are repeated according to the group index - # shortcut if we have an already ordered grouper if self.grouper.is_monotonic: + # shortcut if we have an already ordered grouper result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) - return result - - # row order is scrambled => sort the rows by position in original index - original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) - ) - result.set_axis(original_positions, axis=self.axis, inplace=True) - result = result.sort_index(axis=self.axis) - - rows_dropped = len(result.index) < len(self._selected_obj) + else: + # row order is scrambled => sort the rows by position in original index + original_positions = Index( + np.concatenate(self._get_indices(self.grouper.result_index)) + ) + result.set_axis(original_positions, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) - # get index by slicing original index according to original positions - # slice drops attrs => use set_axis when no rows were dropped - if rows_dropped: + # get index by slicing original index according to original positions + # slice drops attrs => use set_axis when no rows were dropped sorted_indexer = result.index result.index = self._selected_obj.index[sorted_indexer] - else: - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result From bd1abf93bddafb3c6c9f06f8472bf9577413de75 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 06:44:56 +0000 Subject: [PATCH 39/42] DOC: @rhshadrach fix --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 269a0ea296bf4..3e41e924564f6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -470,7 +470,7 @@ Missing ^^^^^^^ - Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) -- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True``(:issue:`35612`) +- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`) - MultiIndex From a789b6a0b7f7b4fc0d19c32015a7d35c6e99cda2 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 07:41:20 +0000 Subject: [PATCH 40/42] REF: restore casing on rows_dropped --- pandas/core/groupby/groupby.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 653dde35542df..caadef160d7ec 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -719,18 +719,24 @@ def _set_result_index_ordered( if self.grouper.is_monotonic: # shortcut if we have an already ordered grouper result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) - else: - # row order is scrambled => sort the rows by position in original index - original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) - ) - result.set_axis(original_positions, axis=self.axis, inplace=True) - result = result.sort_index(axis=self.axis) + return result + + # row order is scrambled => sort the rows by position in original index + original_positions = Index( + np.concatenate(self._get_indices(self.grouper.result_index)) + ) + result.set_axis(original_positions, axis=self.axis, inplace=True) + result = result.sort_index(axis=self.axis) + + dropped_rows = len(result.index) < len(self.obj.index) + if dropped_rows: # get index by slicing original index according to original positions # slice drops attrs => use set_axis when no rows were dropped sorted_indexer = result.index result.index = self._selected_obj.index[sorted_indexer] + else: + result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) return result From faf6570ae1065575f52ca970303c27c14dd55c9f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 13 Dec 2020 12:27:35 -0500 Subject: [PATCH 41/42] move whatsnew to 1.3 --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.3.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9e5ce143b15ac..0430a9ffc09f6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -701,7 +701,6 @@ Missing ^^^^^^^ - Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) -- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`) - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) - Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`) - diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 90f611c55e710..2bf83171487a3 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -134,6 +134,7 @@ Indexing Missing ^^^^^^^ +- Bug in :class:`Grouper` now correctly propagates ``dropna`` argument and :meth:`DataFrameGroupBy.transform` now correctly handles missing values for ``dropna=True`` (:issue:`35612`) - - From 24bb112fe5ee9ba8cd767899a4ab0fb823ae8d99 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 13 Dec 2020 12:30:55 -0500 Subject: [PATCH 42/42] minimize diff --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b50ae777a3276..bc7f5b8174573 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -701,7 +701,7 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) +- Bug in :meth:`.SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) - Bug in :meth:`Series.nunique` with ``dropna=True`` was returning incorrect results when both ``NA`` and ``None`` missing values were present (:issue:`37566`) - Bug in :meth:`Series.interpolate` where kwarg ``limit_area`` and ``limit_direction`` had no effect when using methods ``pad`` and ``backfill`` (:issue:`31048`) -