diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ec7b8b375abe5..bb8def5fb0d47 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -381,6 +381,7 @@ Other Removals - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting frequencies in :class:`Minute`, :class:`Milli`, :class:`Micro`, :class:`Nano` (:issue:`57627`) - Enforced deprecation of strings ``T``, ``L``, ``U``, and ``N`` denoting units in :class:`Timedelta` (:issue:`57627`) - Enforced deprecation of the behavior of :func:`concat` when ``len(keys) != len(objs)`` would truncate to the shorter of the two. Now this raises a ``ValueError`` (:issue:`43485`) +- Enforced deprecation of the behavior of :meth:`DataFrame.replace` and :meth:`Series.replace` with :class:`CategoricalDtype` that would introduce new categories (:issue:`58270`) - Enforced deprecation of values "pad", "ffill", "bfill", and "backfill" for :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`57869`) - Enforced deprecation removing :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Enforced silent-downcasting deprecation for :ref:`all relevant methods <whatsnew_220.silent_downcasting>` (:issue:`54710`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c656e4bf1e20c..18b52f741370f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -10,7 +10,6 @@ cast, overload, ) -import warnings import numpy as np @@ -23,7 +22,6 @@ ) from pandas._libs.arrays import NDArrayBacked from pandas.compat.numpy import function as nv -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -2673,62 +2671,6 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) - @overload - def _replace(self, *, 
to_replace, value, inplace: Literal[False] = ...) -> Self: ... - - @overload - def _replace(self, *, to_replace, value, inplace: Literal[True]) -> None: ... - - def _replace(self, *, to_replace, value, inplace: bool = False) -> Self | None: - from pandas import Index - - orig_dtype = self.dtype - - inplace = validate_bool_kwarg(inplace, "inplace") - cat = self if inplace else self.copy() - - mask = isna(np.asarray(value)) - if mask.any(): - removals = np.asarray(to_replace)[mask] - removals = cat.categories[cat.categories.isin(removals)] - new_cat = cat.remove_categories(removals) - NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) - - ser = cat.categories.to_series() - ser = ser.replace(to_replace=to_replace, value=value) - - all_values = Index(ser) - - # GH51016: maintain order of existing categories - idxr = cat.categories.get_indexer_for(all_values) - locs = np.arange(len(ser)) - locs = np.where(idxr == -1, locs, idxr) - locs = locs.argsort() - - new_categories = ser.take(locs) - new_categories = new_categories.drop_duplicates(keep="first") - index_categories = Index(new_categories) - new_codes = recode_for_categories( - cat._codes, all_values, index_categories, copy=False - ) - new_dtype = CategoricalDtype(index_categories, ordered=self.dtype.ordered) - NDArrayBacked.__init__(cat, new_codes, new_dtype) - - if new_dtype != orig_dtype: - warnings.warn( - # GH#55147 - "The behavior of Series.replace (and DataFrame.replace) with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. 
" - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if not inplace: - return cat - return None - # ------------------------------------------------------------------------ # String methods interface def _str_map( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3614d43425a09..6bb335bca12b3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -100,7 +100,6 @@ ) from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( - Categorical, DatetimeArray, ExtensionArray, IntervalArray, @@ -696,14 +695,6 @@ def replace( # go through replace_list values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=to_replace, value=value, inplace=True) - return [blk] - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. 
@@ -803,14 +794,6 @@ def replace_list( """ values = self.values - if isinstance(values, Categorical): - # TODO: avoid special-casing - # GH49404 - blk = self._maybe_copy(inplace) - values = cast(Categorical, blk.values) - values._replace(to_replace=src_list, value=dest_list, inplace=True) - return [blk] - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index 3c677142846d7..7f3e8d3ed6e6e 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -6,106 +6,66 @@ @pytest.mark.parametrize( - "to_replace,value,expected,flip_categories", + "to_replace,value,expected", [ # one-to-one - (1, 2, [2, 2, 3], False), - (1, 4, [4, 2, 3], False), - (4, 1, [1, 2, 3], False), - (5, 6, [1, 2, 3], False), + (4, 1, [1, 2, 3]), + (3, 1, [1, 2, 1]), # many-to-one - ([1], 2, [2, 2, 3], False), - ([1, 2], 3, [3, 3, 3], False), - ([1, 2], 4, [4, 4, 3], False), - ((1, 2, 4), 5, [5, 5, 3], False), - ((5, 6), 2, [1, 2, 3], False), - ([1], [2], [2, 2, 3], False), - ([1, 4], [5, 2], [5, 2, 3], False), - # GH49404: overlap between to_replace and value - ([1, 2, 3], [2, 3, 4], [2, 3, 4], False), - # GH50872, GH46884: replace with null - (1, None, [None, 2, 3], False), - (1, pd.NA, [None, 2, 3], False), - # check_categorical sorts categories, which crashes on mixed dtypes - (3, "4", [1, 2, "4"], False), - ([1, 2, "3"], "5", ["5", "5", 3], True), + ((5, 6), 2, [1, 2, 3]), + ((3, 2), 1, [1, 1, 1]), ], ) -@pytest.mark.filterwarnings( - "ignore:.*with CategoricalDtype is deprecated:FutureWarning" -) -def test_replace_categorical_series(to_replace, value, expected, flip_categories): +def test_replace_categorical_series(to_replace, value, expected): # GH 31720 - ser = pd.Series([1, 2, 3], dtype="category") result = ser.replace(to_replace, value) - 
expected = pd.Series(expected, dtype="category") - ser.replace(to_replace, value, inplace=True) - - if flip_categories: - expected = expected.cat.set_categories(expected.cat.categories[::-1]) - - tm.assert_series_equal(expected, result, check_category_order=False) - tm.assert_series_equal(expected, ser, check_category_order=False) + expected = pd.Series(Categorical(expected, categories=[1, 2, 3])) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "to_replace, value, result, expected_error_msg", + "to_replace,value", [ - ("b", "c", ["a", "c"], "Categorical.categories are different"), - ("c", "d", ["a", "b"], None), - # https://github.com/pandas-dev/pandas/issues/33288 - ("a", "a", ["a", "b"], None), - ("b", None, ["a", None], "Categorical.categories length are different"), + # one-to-one + (3, 5), + # many-to-one + ((3, 2), 5), ], ) -def test_replace_categorical(to_replace, value, result, expected_error_msg): - # GH#26988 - cat = Categorical(["a", "b"]) - expected = Categorical(result) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if expected_error_msg is not None else None - with tm.assert_produces_warning(warn, match=msg): - result = pd.Series(cat, copy=False).replace(to_replace, value)._values +def test_replace_categorical_series_new_category_raises(to_replace, value): + # GH 31720 + ser = pd.Series([1, 2, 3], dtype="category") + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + ser.replace(to_replace, value) - tm.assert_categorical_equal(result, expected) - if to_replace == "b": # the "c" test is supposed to be unchanged - with pytest.raises(AssertionError, match=expected_error_msg): - # ensure non-inplace call does not affect original - tm.assert_categorical_equal(cat, expected) - ser = pd.Series(cat, copy=False) - with tm.assert_produces_warning(warn, match=msg): - ser.replace(to_replace, value, inplace=True) - 
tm.assert_categorical_equal(cat, expected) +def test_replace_maintain_ordering(): + # GH51016 + dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) + ser = pd.Series([0, 1, 2], dtype=dtype) + result = ser.replace(0, 2) + expected = pd.Series([2, 1, 2], dtype=dtype) + tm.assert_series_equal(expected, result, check_category_order=True) def test_replace_categorical_ea_dtype(): # GH49404 - cat = Categorical(pd.array(["a", "b"], dtype="string")) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + cat = Categorical(pd.array(["a", "b", "c"], dtype="string")) + result = pd.Series(cat).replace(["a", "b"], ["c", "c"])._values + expected = Categorical( + pd.array(["c"] * 3, dtype="string"), + categories=pd.array(["a", "b", "c"], dtype="string"), ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values - expected = Categorical(pd.array(["c", pd.NA], dtype="string")) tm.assert_categorical_equal(result, expected) -def test_replace_maintain_ordering(): - # GH51016 - dtype = pd.CategoricalDtype([0, 1, 2], ordered=True) - ser = pd.Series([0, 1, 2], dtype=dtype) - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace(0, 2) - expected_dtype = pd.CategoricalDtype([1, 2], ordered=True) - expected = pd.Series([2, 1, 2], dtype=expected_dtype) - tm.assert_series_equal(expected, result, check_category_order=True) +def test_replace_categorical_ea_dtype_different_cats_raises(): + # GH49404 + cat = Categorical(pd.array(["a", "b"], dtype="string")) + with pytest.raises( + TypeError, match="Cannot setitem on a Categorical with a new category" + ): + pd.Series(cat).replace(["a", "b"], ["c", pd.NA]) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 63254f1244a2e..2eb88923c0087 100644 --- 
a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -129,18 +129,14 @@ def test_replace_to_replace_wrong_dtype(): def test_replace_list_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") arr = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) + + df.replace(["c"], value="a", inplace=True) assert np.shares_memory(arr.codes, get_array(df, "a").codes) assert df._mgr._has_no_reference(0) df_orig = df.copy() - with tm.assert_produces_warning(FutureWarning, match=msg): - df2 = df.replace(["b"], value="a") + df.replace(["b"], value="a") + df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"})) assert not np.shares_memory(arr.codes, get_array(df2, "a").codes) tm.assert_frame_equal(df, df_orig) @@ -150,13 +146,7 @@ def test_replace_list_inplace_refs_categorical(): df = DataFrame({"a": ["a", "b", "c"]}, dtype="category") view = df[:] df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - df.replace(["c"], value="a", inplace=True) - assert not np.shares_memory(get_array(view, "a").codes, get_array(df, "a").codes) + df.replace(["c"], value="a", inplace=True) tm.assert_frame_equal(df_orig, view) @@ -195,56 +185,34 @@ def test_replace_inplace_reference_no_op(to_replace): @pytest.mark.parametrize("to_replace", [1, [1]]) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace_reference(val, to_replace): +def test_replace_categorical_inplace_reference(to_replace): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = 
FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=to_replace, value=val, inplace=True) - + df.replace(to_replace=to_replace, value=1, inplace=True) assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical_inplace(val): +def test_replace_categorical_inplace(): df = DataFrame({"a": Categorical([1, 2, 3])}) arr_a = get_array(df, "a") - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df.replace(to_replace=1, value=val, inplace=True) + df.replace(to_replace=1, value=1, inplace=True) assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) - expected = DataFrame({"a": Categorical([val, 2, 3])}) + expected = DataFrame({"a": Categorical([1, 2, 3])}) tm.assert_frame_equal(df, expected) -@pytest.mark.parametrize("val", [1, 1.5]) -def test_replace_categorical(val): +def test_replace_categorical(): df = DataFrame({"a": Categorical([1, 2, 3])}) df_orig = df.copy() - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" - ) - warn = FutureWarning if val == 1.5 else None - with tm.assert_produces_warning(warn, match=msg): - df2 = df.replace(to_replace=1, value=val) + df2 = df.replace(to_replace=1, value=1) assert df._mgr._has_no_reference(0) assert df2._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index fb7ba2b7af38a..3fcc4aaa6960f 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1171,38 +1171,6 @@ def test_replace_with_empty_dictlike(self, mix_abc): 
tm.assert_frame_equal(df, df.replace({"b": {}})) tm.assert_frame_equal(df, df.replace(Series({"b": {}}))) - @pytest.mark.parametrize( - "replace_dict, final_data", - [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])], - ) - def test_categorical_replace_with_dict(self, replace_dict, final_data): - # GH 26988 - df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") - - final_data = np.array(final_data) - - a = pd.Categorical(final_data[:, 0], categories=[3, 2]) - - ex_cat = [3, 2] if replace_dict["b"] == 1 else [1, 3] - b = pd.Categorical(final_data[:, 1], categories=ex_cat) - - expected = DataFrame({"a": a, "b": b}) - msg2 = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg2): - result = df.replace(replace_dict, 3) - tm.assert_frame_equal(result, expected) - msg = ( - r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " - "different" - ) - with pytest.raises(AssertionError, match=msg): - # ensure non-inplace call does not affect original - tm.assert_frame_equal(df, expected) - with tm.assert_produces_warning(FutureWarning, match=msg2): - return_value = df.replace(replace_dict, 3, inplace=True) - assert return_value is None - tm.assert_frame_equal(df, expected) - @pytest.mark.parametrize( "df, to_replace, exp", [ @@ -1300,6 +1268,30 @@ def test_replace_ea_ignore_float(self, frame_or_series, value): result = obj.replace(1.0, 0.0) tm.assert_equal(expected, result) + @pytest.mark.parametrize( + "replace_dict, final_data", + [({"a": 1, "b": 1}, [[2, 2], [2, 2]]), ({"a": 1, "b": 2}, [[2, 1], [2, 2]])], + ) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category") + + final_data = np.array(final_data) + + a = pd.Categorical(final_data[:, 0], categories=[1, 2]) + b = pd.Categorical(final_data[:, 1], categories=[1, 2]) + + expected = DataFrame({"a": a, "b": b}) + result = 
df.replace(replace_dict, 2) + tm.assert_frame_equal(result, expected) + msg = r"DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " "different" + with pytest.raises(AssertionError, match=msg): + # ensure non-inplace call does not affect original + tm.assert_frame_equal(df, expected) + return_value = df.replace(replace_dict, 2, inplace=True) + assert return_value is None + tm.assert_frame_equal(df, expected) + def test_replace_value_category_type(self): """ Test for #23305: to ensure category dtypes are maintained @@ -1345,15 +1337,17 @@ def test_replace_value_category_type(self): ) # replace values in input dataframe - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"d": "z"}) + ) + input_df = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"obj1": "obj9"}) + ) + result = input_df.apply( + lambda x: x.astype("category").cat.rename_categories({"cat2": "catX"}) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - input_df = input_df.replace("d", "z") - input_df = input_df.replace("obj1", "obj9") - result = input_df.replace("cat2", "catX") + result = result.astype({"col1": "int64", "col3": "float64", "col5": "object"}) tm.assert_frame_equal(result, expected) def test_replace_dict_category_type(self): @@ -1378,12 +1372,11 @@ def test_replace_dict_category_type(self): ) # replace values in input dataframe using a dict - msg = ( - r"The behavior of Series\.replace \(and DataFrame.replace\) " - "with CategoricalDtype" + result = input_df.apply( + lambda x: x.cat.rename_categories( + {"a": "z", "obj1": "obj9", "cat1": "catX"} + ) ) - with tm.assert_produces_warning(FutureWarning, match=msg): - result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py 
index 0a79bcea679a7..90654df155cf0 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -370,9 +370,7 @@ def test_replace_mixed_types_with_string(self): def test_replace_categorical(self, categorical, numeric): # GH 24971, GH#23305 ser = pd.Series(pd.Categorical(categorical, categories=["A", "B"])) - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = ser.replace({"A": 1, "B": 2}) + result = ser.cat.rename_categories({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present @@ -380,16 +378,13 @@ def test_replace_categorical(self, categorical, numeric): expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result, check_categorical=False) - @pytest.mark.parametrize( - "data, data_exp", [(["a", "b", "c"], ["b", "b", "c"]), (["a"], ["b"])] - ) - def test_replace_categorical_inplace(self, data, data_exp): + def test_replace_categorical_inplace(self): # GH 53358 + data = ["a", "b", "c"] + data_exp = ["b", "b", "c"] result = pd.Series(data, dtype="category") - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result.replace(to_replace="a", value="b", inplace=True) - expected = pd.Series(data_exp, dtype="category") + result.replace(to_replace="a", value="b", inplace=True) + expected = pd.Series(pd.Categorical(data_exp, categories=data)) tm.assert_series_equal(result, expected) def test_replace_categorical_single(self): @@ -404,25 +399,10 @@ def test_replace_categorical_single(self): expected = expected.cat.remove_unused_categories() assert c[2] != "foo" - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - result = c.replace(c[2], "foo") + result = c.cat.rename_categories({c.values[2]: "foo"}) 
tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[2], "foo", inplace=True) - assert return_value is None - tm.assert_series_equal(expected, c) - - first_value = c[0] - msg = "with CategoricalDtype is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = c.replace(c[1], c[0], inplace=True) - assert return_value is None - assert c[0] == c[1] == first_value # test replacing with existing value - def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError