From b839af99d73b78240aa6d9a999e7776a1fd85eef Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 11 Feb 2023 16:37:13 +0100 Subject: [PATCH 1/6] ENH: Optimize CoW for fillna with ea dtypes --- pandas/core/internals/blocks.py | 11 +++-- pandas/tests/copy_view/test_interp_fillna.py | 46 ++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 60d022a0c7964..e6cf23ac9aa5f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1752,9 +1752,14 @@ def fillna( downcast=downcast, using_cow=using_cow, ) - new_values = self.values.fillna(value=value, method=None, limit=limit) - nb = self.make_block_same_class(new_values) - return nb._maybe_downcast([nb], downcast) + if using_cow and self._can_hold_na and not self.values.isna().any(): + refs = self.refs + new_values = self.values + else: + refs = None + new_values = self.values.fillna(value=value, method=None, limit=limit) + nb = self.make_block_same_class(new_values, refs=refs) + return nb._maybe_downcast([nb], downcast, using_cow=using_cow) @cache_readonly def shape(self) -> Shape: diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 32a36b9690465..59676ccea1ca6 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + NA, DataFrame, Interval, NaT, @@ -232,3 +233,48 @@ def test_fillna_interval_inplace_reference(using_copy_on_write): assert np.shares_memory( get_array(ser, "a").left.values, get_array(view, "a").left.values ) + + +def test_fillna_ea_noop_shares_memory( + using_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + df2 = df.fillna(100) + + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not df2._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + tm.assert_frame_equal(df_orig, df) + + df2.iloc[0, 1] = 100 + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert df2._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(1) + tm.assert_frame_equal(df_orig, df) + + +def test_fillna_inplace_ea_noop_shares_memory( + using_copy_on_write, any_numeric_ea_and_arrow_dtype +): + df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype) + df_orig = df.copy() + view = df[:] + df.fillna(100, inplace=True) + + assert not np.shares_memory(get_array(df, "a"), get_array(view, "a")) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(view, "b")) + assert not df._mgr._has_no_reference(1) + assert not view._mgr._has_no_reference(1) + else: + assert not np.shares_memory(get_array(df, "b"), get_array(view, "b")) + df.iloc[0, 1] = 100 + tm.assert_frame_equal(df_orig, view) From 8b3335494d220650b87ea7b9c3808f985728870a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Feb 2023 23:42:40 +0100 Subject: [PATCH 2/6] Fix tests --- pandas/tests/extension/base/methods.py | 11 +++++++---- pandas/tests/extension/json/test_json.py | 4 ++++ pandas/tests/extension/test_datetime.py | 7 +++---- pandas/tests/extension/test_interval.py | 7 +++---- pandas/tests/extension/test_numpy.py | 8 ++++---- pandas/tests/extension/test_period.py | 7 +++---- pandas/tests/extension/test_sparse.py | 15 +++++++++++---- 7 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 85c2febefb6ce..23eae360c3e70 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -240,23 +240,26 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) - def test_fillna_copy_frame(self, data_missing): + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) filled_val = df.iloc[0, 0] result = df.fillna(filled_val) - assert df.A.values is not result.A.values + if using_copy_on_write: + assert df.A.values is result.A.values + else: + assert df.A.values is not result.A.values - def test_fillna_copy_series(self, data_missing, no_op_with_cow: bool = False): + def test_fillna_copy_series(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) ser = pd.Series(arr) filled_val = ser[0] result = ser.fillna(filled_val) - if no_op_with_cow: + if using_copy_on_write: assert ser._values is result._values else: assert ser._values is not result._values diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 2e5e2fc77d6c4..58ed282776061 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -301,6 +301,10 @@ def test_searchsorted(self, data_for_sorting): def test_equals(self, data, na_value, as_series): super().test_equals(data, na_value, as_series) + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + super().test_fillna_copy_frame(data_missing, using_copy_on_write) + class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 0ad2de9e834fa..9f1c854fd92d0 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,10 +116,9 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) + @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + super().test_fillna_copy_frame(data_missing, using_copy_on_write) class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 3cf24d848e453..c4bf903bcda46 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,10 +132,9 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) + @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + super().test_fillna_copy_frame(data_missing, using_copy_on_write) class TestMissing(BaseInterval, base.BaseMissingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 9cf7a08357720..edab050bf6f96 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -250,14 +250,14 @@ def test_shift_fill_value(self, data): super().test_shift_fill_value(data) @skip_nested - def test_fillna_copy_frame(self, data_missing): + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): # The "scalar" for this array isn't a scalar. - super().test_fillna_copy_frame(data_missing) + super().test_fillna_copy_frame(data_missing, using_copy_on_write) @skip_nested - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self, data_missing, using_copy_on_write): # The "scalar" for this array isn't a scalar. - super().test_fillna_copy_series(data_missing) + super().test_fillna_copy_series(data_missing, using_copy_on_write) @skip_nested def test_searchsorted(self, data_for_sorting, as_series): diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 1ecd279e1f34b..5af312b2dbf6a 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,10 +105,9 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): - super().test_fillna_copy_series( - data_missing, no_op_with_cow=using_copy_on_write - ) + @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + super().test_fillna_copy_frame(data_missing, using_copy_on_write) class TestInterface(BasePeriodTests, base.BaseInterfaceTests): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 745911871694c..2aeb2af567ea0 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -272,7 +272,7 @@ def test_fillna_frame(self, data_missing): class TestMethods(BaseSparseTests, base.BaseMethodsTests): _combine_le_expected_dtype = "Sparse[bool]" - def test_fillna_copy_frame(self, data_missing): + def test_fillna_copy_frame(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}, copy=False) @@ -280,17 +280,24 @@ def test_fillna_copy_frame(self, data_missing): result = df.fillna(filled_val) if hasattr(df._mgr, "blocks"): - assert df.values.base is not result.values.base + if using_copy_on_write: + assert df.values.base is result.values.base + else: + assert df.values.base is not result.values.base assert df.A._values.to_dense() is arr.to_dense() - def test_fillna_copy_series(self, data_missing): + def test_fillna_copy_series(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) ser = pd.Series(arr) filled_val = ser[0] result = ser.fillna(filled_val) - assert ser._values is not result._values + if using_copy_on_write: + assert ser._values is result._values + + else: + assert ser._values is not result._values assert ser._values.to_dense() is arr.to_dense() @pytest.mark.xfail(reason="Not Applicable") From 4c1d310499499eade05f460de1dbd39125135c9a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 15:27:28 +0100 Subject: [PATCH 3/6] Fix tests --- pandas/tests/groupby/test_raises.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 76ba4c974b3fd..8851b7669932d 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -303,7 +303,9 @@ def test_groupby_raises_datetime_np(how, by, groupby_series, groupby_func_np): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) -def test_groupby_raises_category(how, by, groupby_series, groupby_func): +def test_groupby_raises_category( + how, by, groupby_series, groupby_func, using_copy_on_write +): # GH#50749 df = DataFrame( { @@ -370,7 +372,9 @@ def test_groupby_raises_category(how, by, groupby_series, groupby_func): TypeError, r"Cannot setitem on a Categorical with a new category \(0\), " + "set the categories first", - ), + ) + if not using_copy_on_write + else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (None, ""), "idxmin": (None, ""), @@ -491,7 +495,7 @@ def test_groupby_raises_category_np(how, by, groupby_series, groupby_func_np): @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_category_on_category( - how, by, groupby_series, groupby_func, observed + how, by, groupby_series, groupby_func, observed, using_copy_on_write ): # GH#50749 df = DataFrame( @@ -562,7 +566,9 @@ def test_groupby_raises_category_on_category( TypeError, r"Cannot setitem on a Categorical with a new category \(0\), " + "set the categories first", - ), + ) + if not using_copy_on_write + else (None, ""), # no-op with CoW "first": (None, ""), "idxmax": (ValueError, "attempt to get argmax of an empty sequence") if empty_groups From 618223e8c4f3e1482bd7b624e27e6ac876d288d0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 15:27:53 +0100 Subject: [PATCH 4/6] Optimize --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ecbb7672875e5..1fd8b67332616 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1741,7 +1741,7 @@ def fillna( downcast=downcast, using_cow=using_cow, ) - if using_cow and self._can_hold_na and not self.values.isna().any(): + if using_cow and self._can_hold_na and not self.values._hasna: refs = self.refs new_values = self.values else: From 3d518c6d5b3ea831ea1c09c171ab100ba903aa2a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 16 Feb 2023 15:44:16 +0100 Subject: [PATCH 5/6] Adjust tests --- pandas/tests/extension/base/methods.py | 20 +++++++++----------- pandas/tests/extension/conftest.py | 13 ++++++++++++- pandas/tests/extension/json/test_json.py | 4 ++-- pandas/tests/extension/test_datetime.py | 4 ---- pandas/tests/extension/test_interval.py | 4 ---- pandas/tests/extension/test_numpy.py | 8 ++++---- pandas/tests/extension/test_period.py | 4 ---- 7 files changed, 27 insertions(+), 30 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 23eae360c3e70..d13f2508a9eaf 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -240,30 +240,28 @@ def test_factorize_empty(self, data): tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) df = pd.DataFrame({"A": arr}) + df_orig = df.copy() filled_val = df.iloc[0, 0] result = df.fillna(filled_val) - if using_copy_on_write: - assert df.A.values is result.A.values - else: - assert df.A.values is not result.A.values + result.iloc[0, 0] = filled_val + + tm.assert_extension_array_equal(df.A.values, df_orig.A.values) - def test_fillna_copy_series(self, data_missing, using_copy_on_write): + def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) ser = pd.Series(arr) + ser_orig = ser.copy() filled_val = ser[0] result = ser.fillna(filled_val) + result.iloc[0] = filled_val - if using_copy_on_write: - assert ser._values is result._values - else: - assert ser._values is not result._values - assert ser._values is arr + tm.assert_extension_array_equal(ser.values, ser_orig.values) def test_fillna_length_mismatch(self, data_missing): msg = "Length of 'value' does not match." diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index 3827ba234cfd8..0d14128e3bebf 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,7 +2,10 @@ import pytest -from pandas import Series +from pandas import ( + Series, + options, +) @pytest.fixture @@ -193,3 +196,11 @@ def invalid_scalar(data): If the array can hold any item (i.e. object dtype), then use pytest.skip. """ return object.__new__(object) + + +@pytest.fixture +def using_copy_on_write() -> bool: + """ + Fixture to check if Copy-on-Write is enabled. + """ + return options.mode.copy_on_write and options.mode.data_manager == "block" diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 58ed282776061..37a7d78d3aa3d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -302,8 +302,8 @@ def test_equals(self, data, na_value, as_series): super().test_equals(data, na_value, as_series) @pytest.mark.skip("fill-value is interpreted as a dict of values") - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - super().test_fillna_copy_frame(data_missing, using_copy_on_write) + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) class TestCasting(BaseJSON, base.BaseCastingTests): diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 9f1c854fd92d0..92796c604333d 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,10 +116,6 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass - @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - super().test_fillna_copy_frame(data_missing, using_copy_on_write) - class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index c4bf903bcda46..0f916cea9d518 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -132,10 +132,6 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) - @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - super().test_fillna_copy_frame(data_missing, using_copy_on_write) - class TestMissing(BaseInterval, base.BaseMissingTests): # Index.fillna only accepts scalar `value`, so we have to xfail all diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index edab050bf6f96..9cf7a08357720 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -250,14 +250,14 @@ def test_shift_fill_value(self, data): super().test_shift_fill_value(data) @skip_nested - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): + def test_fillna_copy_frame(self, data_missing): # The "scalar" for this array isn't a scalar. - super().test_fillna_copy_frame(data_missing, using_copy_on_write) + super().test_fillna_copy_frame(data_missing) @skip_nested - def test_fillna_copy_series(self, data_missing, using_copy_on_write): + def test_fillna_copy_series(self, data_missing): # The "scalar" for this array isn't a scalar. - super().test_fillna_copy_series(data_missing, using_copy_on_write) + super().test_fillna_copy_series(data_missing) @skip_nested def test_searchsorted(self, data_for_sorting, as_series): diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 5af312b2dbf6a..cb1ebd87875e1 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,10 +105,6 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) - @pytest.mark.skip("NDArrayBacked EAs always make shallow copies") - def test_fillna_copy_frame(self, data_missing, using_copy_on_write): - super().test_fillna_copy_frame(data_missing, using_copy_on_write) - class TestInterface(BasePeriodTests, base.BaseInterfaceTests): pass From 26abefc1826c5a77c3e4f14909b57d35c0632af4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 15 Feb 2023 12:00:39 +0100 Subject: [PATCH 6/6] Adjust --- pandas/tests/extension/base/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index d13f2508a9eaf..ca867ffb77296 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -250,7 +250,7 @@ def test_fillna_copy_frame(self, data_missing): result.iloc[0, 0] = filled_val - tm.assert_extension_array_equal(df.A.values, df_orig.A.values) + self.assert_frame_equal(df, df_orig) def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) @@ -261,7 +261,7 @@ def test_fillna_copy_series(self, data_missing): result = ser.fillna(filled_val) result.iloc[0] = filled_val - tm.assert_extension_array_equal(ser.values, ser_orig.values) + self.assert_series_equal(ser, ser_orig) def test_fillna_length_mismatch(self, data_missing): msg = "Length of 'value' does not match."