From 91ffff995c77d7e71ca59e657fbfceaca25ef1a2 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Dec 2021 11:50:56 -0800 Subject: [PATCH 1/7] REF: get regex logic out of Block.replace --- pandas/core/generic.py | 15 ++++++++++++--- pandas/core/internals/array_manager.py | 10 ++++++++-- pandas/core/internals/blocks.py | 23 +++++++++-------------- pandas/core/internals/managers.py | 12 +++++++++--- 4 files changed, 38 insertions(+), 22 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1e5b0a107615e..359d89ce664c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -118,6 +118,7 @@ nanops, ) import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import should_use_regex from pandas.core.arrays import ExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com @@ -6688,9 +6689,17 @@ def replace( return self._replace_columnwise(mapping, inplace, regex) elif not is_list_like(value): # NA -> 0 - new_data = self._mgr.replace( - to_replace=to_replace, value=value, inplace=inplace, regex=regex - ) + regex = should_use_regex(regex, to_replace) + if regex: + new_data = self._mgr.replace_regex( + to_replace=to_replace, + value=value, + inplace=inplace, + ) + else: + new_data = self._mgr.replace( + to_replace=to_replace, value=value, inplace=inplace + ) else: raise TypeError( f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}' diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 93a9e8fbcb1ad..a817341367592 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -413,11 +413,17 @@ def _convert(arr): return self.apply(_convert) - def replace(self: T, value, **kwargs) -> T: + def replace_regex(self: T, **kwargs): + return self.apply_with_block("_replace_regex", **kwargs) + + def replace(self: T, to_replace, value, inplace: bool) -> T: + inplace = validate_bool_kwarg(inplace, "inplace") 
assert np.ndim(value) == 0, value # TODO "replace" is right now implemented on the blocks, we should move # it to general array algos so it can be reused here - return self.apply_with_block("replace", value=value, **kwargs) + return self.apply_with_block( + "replace", value=value, to_replace=to_replace, inplace=inplace + ) def replace_list( self: T, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7056a34c73008..da2ff58ea3d0d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -640,14 +640,11 @@ def replace( to_replace, value, inplace: bool = False, - regex: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new - blocks here this is just a call to putmask. regex is not used here. - It is used in ObjectBlocks. It is here for API compatibility. + blocks here this is just a call to putmask. """ - inplace = validate_bool_kwarg(inplace, "inplace") # Note: the checks we do in NDFrame.replace ensure we never get # here with listlike to_replace or value, as those cases @@ -661,11 +658,6 @@ def replace( blk.values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] - regex = should_use_regex(regex, to_replace) - - if regex: - return self._replace_regex(to_replace, value, inplace=inplace) - if not self._can_hold_element(to_replace): # We cannot hold `to_replace`, so we know immediately that # replacing it is a no-op. 
@@ -691,13 +683,12 @@ def replace( to_replace=to_replace, value=value, inplace=True, - regex=regex, ) else: # split so that we only upcast where necessary return self.split_and_operate( - type(self).replace, to_replace, value, inplace=True, regex=regex + type(self).replace, to_replace, value, inplace=True ) @final @@ -756,10 +747,14 @@ def replace_list( values = self.values # TODO: dont special-case Categorical - if isinstance(values, Categorical) and len(algos.unique(dest_list)) == 1: + if ( + isinstance(values, Categorical) + and len(algos.unique(dest_list)) == 1 + and not regex + ): # We likely got here by tiling value inside NDFrame.replace, # so un-tile here - return self.replace(src_list, dest_list[0], inplace, regex) + return self.replace(src_list, dest_list[0], inplace) # Exclude anything that we know we won't contain pairs = [ @@ -866,7 +861,7 @@ def _replace_coerce( convert=False, mask=mask, ) - return self.replace(to_replace, value, inplace=inplace, regex=False) + return self.replace(to_replace, value, inplace=inplace) return [self] # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cb18c6cccbc60..5ebc0292f24b4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -432,12 +432,18 @@ def convert( timedelta=timedelta, ) - def replace(self: T, to_replace, value, inplace: bool, regex: bool) -> T: - assert np.ndim(value) == 0, value + def replace(self: T, to_replace, value, inplace: bool) -> T: + inplace = validate_bool_kwarg(inplace, "inplace") + # NDFrame.replace ensures the not-is_list_likes here + assert not is_list_like(to_replace) + assert not is_list_like(value) return self.apply( - "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex + "replace", to_replace=to_replace, value=value, inplace=inplace ) + def replace_regex(self, **kwargs): + return self.apply("_replace_regex", **kwargs) + 
def replace_list( self: T, src_list: list[Any], From ddd71b6f383e7df6bd5a2c6dc048ae4ae62932fd Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Dec 2021 20:09:58 -0800 Subject: [PATCH 2/7] BUG: nullable dtypes not retained by replace --- pandas/core/array_algos/replace.py | 3 +- pandas/core/internals/blocks.py | 47 +++++------- .../tests/arrays/categorical/test_replace.py | 12 +-- pandas/tests/frame/methods/test_replace.py | 4 - pandas/tests/series/methods/test_replace.py | 75 ++++++++++++++----- 5 files changed, 78 insertions(+), 63 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 4d1fb8f33e5ad..e26bb9fb6ebad 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -80,7 +80,8 @@ def _check_comparison_types( f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" ) - if not regex: + if not regex or not should_use_regex(regex, b): + # TODO: should use missing.mask_missing? op = lambda x: operator.eq(x, b) else: op = np.vectorize( diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index da2ff58ea3d0d..abbebcefc7a87 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -640,6 +640,8 @@ def replace( to_replace, value, inplace: bool = False, + # mask may be pre-computed if we're called from replace_list + mask: npt.NDArray[np.bool_] | None = None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -665,7 +667,8 @@ def replace( # replace_list instead of replace. return [self] if inplace else [self.copy()] - mask = missing.mask_missing(values, to_replace) + if mask is None: + mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. 
@@ -683,6 +686,7 @@ def replace( to_replace=to_replace, value=value, inplace=True, + mask=mask, ) else: @@ -746,16 +750,6 @@ def replace_list( """ values = self.values - # TODO: dont special-case Categorical - if ( - isinstance(values, Categorical) - and len(algos.unique(dest_list)) == 1 - and not regex - ): - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace) - # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -844,25 +838,18 @@ def _replace_coerce( ------- List[Block] """ - if mask.any(): - if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - putmask_inplace(nb.values, mask, value) - return [nb] - else: - regex = should_use_regex(regex, to_replace) - if regex: - return self._replace_regex( - to_replace, - value, - inplace=inplace, - convert=False, - mask=mask, - ) - return self.replace(to_replace, value, inplace=inplace) - return [self] + if should_use_regex(regex, to_replace): + return self._replace_regex( + to_replace, + value, + inplace=inplace, + convert=False, + mask=mask, + ) + else: + return self.replace( + to_replace=to_replace, value=value, inplace=inplace, mask=mask + ) # --------------------------------------------------------------------- diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index fe12e7c7571ea..a50b1eddd99be 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -1,4 +1,3 @@ -import numpy as np import pytest import pandas as pd @@ -20,10 +19,8 @@ ([1, 2], 4, [4, 4, 3], False), ((1, 2, 4), 5, [5, 5, 3], False), ((5, 6), 2, [1, 2, 3], False), - # many-to-many, handled outside of Categorical and results in separate dtype - # except for cases with only 1 unique entry in `value` - ([1], [2], [2, 2, 
3], True), - ([1, 4], [5, 2], [5, 2, 3], True), + ([1], [2], [2, 2, 3], False), + ([1, 4], [5, 2], [5, 2, 3], False), # check_categorical sorts categories, which crashes on mixed dtypes (3, "4", [1, 2, "4"], False), ([1, 2, "3"], "5", ["5", "5", 3], True), @@ -31,7 +28,6 @@ ) def test_replace_categorical_series(to_replace, value, expected, flip_categories): # GH 31720 - stays_categorical = not isinstance(value, list) or len(pd.unique(value)) == 1 ser = pd.Series([1, 2, 3], dtype="category") result = ser.replace(to_replace, value) @@ -41,10 +37,6 @@ def test_replace_categorical_series(to_replace, value, expected, flip_categories if flip_categories: expected = expected.cat.set_categories(expected.cat.categories[::-1]) - if not stays_categorical: - # the replace call loses categorical dtype - expected = pd.Series(np.asarray(expected)) - tm.assert_series_equal(expected, result, check_category_order=False) tm.assert_series_equal(expected, ser, check_category_order=False) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 28e28490c73b9..44489ffd5761a 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1382,10 +1382,6 @@ def test_replace_value_category_type(self): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail( - reason="category dtype gets changed to object type after replace, see #35268", - raises=AssertionError, - ) def test_replace_dict_category_type(self): """ Test to ensure category dtypes are maintained diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 3a55062af618f..22f55af69dbfd 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import IntervalArray class TestSeriesReplace: @@ -148,20 +149,22 @@ def test_replace_with_single_list(self): 
tm.assert_series_equal(s, ser) def test_replace_mixed_types(self): - s = pd.Series(np.arange(5), dtype="int64") + ser = pd.Series(np.arange(5), dtype="int64") def check_replace(to_rep, val, expected): - sc = s.copy() - r = s.replace(to_rep, val) + sc = ser.copy() + result = ser.replace(to_rep, val) return_value = sc.replace(to_rep, val, inplace=True) assert return_value is None - tm.assert_series_equal(expected, r) + tm.assert_series_equal(expected, result) tm.assert_series_equal(expected, sc) - # MUST upcast to float - e = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) + # 3.0 can still be held in our int64 series, so we do not upcast + # Note this matches what we get with the scalars 3 and 3.0 tr, v = [3], [3.0] - check_replace(tr, v, e) + check_replace(tr, v, ser) + # Note this matches what we get with the scalars 3 and 3.0 + check_replace(tr[0], v[0], ser) # MUST upcast to float e = pd.Series([0, 1, 2, 3.5, 4]) @@ -258,9 +261,9 @@ def test_replace2(self): def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): # GH 32621 - s = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) - expected = pd.Series(["1", "2", np.nan]) - result = s.replace({"one": "1", "two": "2"}) + ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) + expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) + result = ser.replace({"one": "1", "two": "2"}) tm.assert_series_equal(expected, result) def test_replace_with_empty_dictlike(self): @@ -305,17 +308,17 @@ def test_replace_mixed_types_with_string(self): "categorical, numeric", [ (pd.Categorical(["A"], categories=["A", "B"]), [1]), - (pd.Categorical(("A",), categories=["A", "B"]), [1]), - (pd.Categorical(("A", "B"), categories=["A", "B"]), [1, 2]), + (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) def test_replace_categorical(self, categorical, numeric): - # GH 24971 - # Do not check if dtypes are equal due to a known issue that - # Categorical.replace sometimes 
coerces to object (GH 23305) - s = pd.Series(categorical) - result = s.replace({"A": 1, "B": 2}) - expected = pd.Series(numeric) + # GH 24971, GH#23305 + ser = pd.Series(categorical) + result = ser.replace({"A": 1, "B": 2}) + expected = pd.Series(numeric).astype("category") + if 2 not in expected.cat.categories: + # i.e. categories should be [1, 2] even if there are no "B"s present + expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result) def test_replace_categorical_single(self): @@ -514,3 +517,39 @@ def test_pandas_replace_na(self): result = ser.replace(regex_mapping, regex=True) exp = pd.Series(["CC", "CC", "CC-REPL", "DD", "CC", "", pd.NA], dtype="string") tm.assert_series_equal(result, exp) + + @pytest.mark.parametrize( + "dtype, input_data, to_replace, expected_data", + [ + ("bool", [True, False], {True: False}, [False, False]), + ("int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("Int64", [1, 2], {1: 10, 2: 20}, [10, 20]), + ("float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("Float64", [1.1, 2.2], {1.1: 10.1, 2.2: 20.5}, [10.1, 20.5]), + ("string", ["one", "two"], {"one": "1", "two": "2"}, ["1", "2"]), + ( + pd.IntervalDtype("int64"), + IntervalArray([pd.Interval(1, 2), pd.Interval(2, 3)]), + {pd.Interval(1, 2): pd.Interval(10, 20)}, + IntervalArray([pd.Interval(10, 20), pd.Interval(2, 3)]), + ), + ( + pd.IntervalDtype("float64"), + IntervalArray([pd.Interval(1.0, 2.7), pd.Interval(2.8, 3.1)]), + {pd.Interval(1.0, 2.7): pd.Interval(10.6, 20.8)}, + IntervalArray([pd.Interval(10.6, 20.8), pd.Interval(2.8, 3.1)]), + ), + ( + pd.PeriodDtype("M"), + [pd.Period("2020-05", freq="M")], + {pd.Period("2020-05", freq="M"): pd.Period("2020-06", freq="M")}, + [pd.Period("2020-06", freq="M")], + ), + ], + ) + def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): + # GH#33484 + ser = pd.Series(input_data, dtype=dtype) + result = ser.replace(to_replace) + expected = pd.Series(expected_data, dtype=dtype) + 
tm.assert_series_equal(result, expected) From 56c8b8494a0926a01a5c4c5a1888157da7829048 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Dec 2021 08:47:05 -0800 Subject: [PATCH 3/7] catch warning --- pandas/tests/indexing/test_coercion.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 2366dd39c25f2..9213c420a9a00 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1177,6 +1177,7 @@ def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer): assert obj.dtype == from_key result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key @@ -1197,7 +1198,21 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key - result = obj.replace(replacer) + warn = None + rep_ser = pd.Series(replacer) + if ( + isinstance(obj.dtype, pd.DatetimeTZDtype) + and isinstance(rep_ser.dtype, pd.DatetimeTZDtype) + and obj.dtype != rep_ser.dtype + ): + # mismatched tz DatetimeArray behavior will change to cast + # for setitem-like methods with mismatched tzs + warn = FutureWarning + + msg = "explicitly cast to object" + with tm.assert_produces_warning(warn, match=msg): + result = obj.replace(replacer) + exp = pd.Series(self.rep[to_key], index=index, name="yyy") assert exp.dtype == to_key From f3d9b4096f025b3e9ba9cff82b8f9ccab3a092c2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Dec 2021 11:38:38 -0800 Subject: [PATCH 4/7] whatsnew, GH ref --- doc/source/whatsnew/v1.4.0.rst | 4 ++-- pandas/tests/frame/methods/test_replace.py | 1 + pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/series/methods/test_replace.py | 6 +++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst 
b/doc/source/whatsnew/v1.4.0.rst index caf3a4281561f..d46669cf943fe 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -529,7 +529,7 @@ Other Deprecations - Deprecated silent dropping of columns that raised a ``TypeError`` in :class:`Series.transform` and :class:`DataFrame.transform` when used with a dictionary (:issue:`43740`) - Deprecated silent dropping of columns that raised a ``TypeError``, ``DataError``, and some cases of ``ValueError`` in :meth:`Series.aggregate`, :meth:`DataFrame.aggregate`, :meth:`Series.groupby.aggregate`, and :meth:`DataFrame.groupby.aggregate` when used with a list (:issue:`43740`) - Deprecated casting behavior when setting timezone-aware value(s) into a timezone-aware :class:`Series` or :class:`DataFrame` column when the timezones do not match. Previously this cast to object dtype. In a future version, the values being inserted will be converted to the series or column's existing timezone (:issue:`37605`) -- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`) +- Deprecated casting behavior when passing an item with mismatched-timezone to :meth:`DatetimeIndex.insert`, :meth:`DatetimeIndex.putmask`, :meth:`DatetimeIndex.where` :meth:`DatetimeIndex.fillna`, :meth:`Series.mask`, :meth:`Series.where`, :meth:`Series.fillna`, :meth:`Series.shift`, :meth:`Series.replace`, :meth:`Series.reindex` (and :class:`DataFrame` column analogues). In the past this has cast to object dtype. 
In a future version, these will cast the passed item to the index or series's timezone (:issue:`37605`, :issue:`44940`) - Deprecated the 'errors' keyword argument in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, and meth:`DataFrame.mask`; in a future version the argument will be removed (:issue:`44294`) - Deprecated the ``prefix`` keyword argument in :func:`read_csv` and :func:`read_table`, in a future version the argument will be removed (:issue:`43396`) - Deprecated :meth:`PeriodIndex.astype` to ``datetime64[ns]`` or ``DatetimeTZDtype``, use ``obj.to_timestamp(how).tz_localize(dtype.tz)`` instead (:issue:`44398`) @@ -837,7 +837,7 @@ ExtensionArray - Bug in :func:`array` incorrectly raising when passed a ``ndarray`` with ``float16`` dtype (:issue:`44715`) - Bug in calling ``np.sqrt`` on :class:`BooleanArray` returning a malformed :class:`FloatingArray` (:issue:`44715`) - Bug in :meth:`Series.where` with ``ExtensionDtype`` when ``other`` is a NA scalar incompatible with the series dtype (e.g. 
``NaT`` with a numeric dtype) incorrectly casting to a compatible NA value (:issue:`44697`) -- +- Fixed bug in :meth:`Series.replace` with ``FloatDtype``, ``string[python]``, or ``string[pyarrow]`` dtype not being preserved when possible (:issue:`33484`) Styler ^^^^^^ diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 44489ffd5761a..a772b745e47d3 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1387,6 +1387,7 @@ def test_replace_dict_category_type(self): Test to ensure category dtypes are maintained after replace with dict values """ + # GH#35268, GH#44940 # create input dataframe input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 9213c420a9a00..014f0f5933387 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1206,7 +1206,7 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) and obj.dtype != rep_ser.dtype ): # mismatched tz DatetimeArray behavior will change to cast - # for setitem-like methods with mismatched tzs + # for setitem-like methods with mismatched tzs GH#44940 warn = FutureWarning msg = "explicitly cast to object" diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 22f55af69dbfd..78129439952da 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -159,8 +159,7 @@ def check_replace(to_rep, val, expected): tm.assert_series_equal(expected, result) tm.assert_series_equal(expected, sc) - # 3.0 can still be held in our int64 series, so we do not upcast - # Note this matches what we get with the scalars 3 and 3.0 + # 3.0 can still be held in our int64 series, so we do not upcast GH#44940 tr, v = [3], [3.0] check_replace(tr, v, ser) # Note this 
matches what we get with the scalars 3 and 3.0 @@ -260,7 +259,7 @@ def test_replace2(self): assert (ser[20:30] == -1).all() def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype): - # GH 32621 + # GH 32621, GH#44940 ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype) expected = pd.Series(["1", "2", np.nan], dtype=nullable_string_dtype) result = ser.replace({"one": "1", "two": "2"}) @@ -318,6 +317,7 @@ def test_replace_categorical(self, categorical, numeric): expected = pd.Series(numeric).astype("category") if 2 not in expected.cat.categories: # i.e. categories should be [1, 2] even if there are no "B"s present + # GH#44940 expected = expected.cat.add_categories(2) tm.assert_series_equal(expected, result) From 569db46eb50f7300e67bcb1663f826a5a6e4220a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Dec 2021 22:12:04 -0800 Subject: [PATCH 5/7] TST: closes #40732 --- pandas/tests/series/methods/test_replace.py | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 78129439952da..971861dfe8812 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -553,3 +553,29 @@ def test_replace_dtype(self, dtype, input_data, to_replace, expected_data): result = ser.replace(to_replace) expected = pd.Series(expected_data, dtype=dtype) tm.assert_series_equal(result, expected) + + def test_replace_string_dtype(self): + # GH#40732, GH#44940 + ser = pd.Series(["one", "two", np.nan], dtype="string") + res = ser.replace({"one": "1", "two": "2"}) + expected = pd.Series(["1", "2", np.nan], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_nullable_numeric(self): + # GH#40732, GH#44940 + + floats = pd.Series([1.0, 2.0, 3.999, 4.4], dtype=pd.Float64Dtype()) + assert floats.replace({1.0: 9}).dtype == floats.dtype + assert floats.replace(1.0, 9).dtype == 
floats.dtype + assert floats.replace({1.0: 9.0}).dtype == floats.dtype + assert floats.replace(1.0, 9.0).dtype == floats.dtype + + res = floats.replace(to_replace=[1.0, 2.0], value=[9.0, 10.0]) + assert res.dtype == floats.dtype + + ints = pd.Series([1, 2, 3, 4], dtype=pd.Int64Dtype()) + assert ints.replace({1: 9}).dtype == ints.dtype + assert ints.replace(1, 9).dtype == ints.dtype + assert ints.replace({1: 9.0}).dtype == ints.dtype + assert ints.replace(1, 9.0).dtype == ints.dtype + # FIXME: ints.replace({1: 9.5}) raises bc of incorrect _can_hold_element From 2351cbbb4a294b0431af464516d1150bd836c18f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Dec 2021 22:27:18 -0800 Subject: [PATCH 6/7] tests for more closed issues --- pandas/tests/series/methods/test_replace.py | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 971861dfe8812..b7d6c498d1e0b 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -561,6 +561,31 @@ def test_replace_string_dtype(self): expected = pd.Series(["1", "2", np.nan], dtype="string") tm.assert_series_equal(res, expected) + # GH#31644 + ser2 = pd.Series(["A", np.nan], dtype="string") + res2 = ser2.replace("A", "B") + expected2 = pd.Series(["B", np.nan], dtype="string") + tm.assert_series_equal(res2, expected2) + + ser3 = pd.Series(["A", "B"], dtype="string") + res3 = ser3.replace("A", pd.NA) + expected3 = pd.Series([pd.NA, "B"], dtype="string") + tm.assert_series_equal(res3, expected3) + + def test_replace_string_dtype_list_to_replace(self): + # GH#41215, GH#44940 + ser = pd.Series(["abc", "def"], dtype="string") + res = ser.replace(["abc", "any other string"], "xyz") + expected = pd.Series(["xyz", "def"], dtype="string") + tm.assert_series_equal(res, expected) + + def test_replace_string_dtype_regex(self): + # GH#31644 + ser = pd.Series(["A", "B"], dtype="string") + 
res = ser.replace(r".", "C", regex=True) + expected = pd.Series(["C", "C"], dtype="string") + tm.assert_series_equal(res, expected) + def test_replace_nullable_numeric(self): # GH#40732, GH#44940 From b7679d363ec206e44f69ae861a68e2d0f5b86a15 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Dec 2021 22:31:19 -0800 Subject: [PATCH 7/7] test for closed issue --- pandas/tests/frame/methods/test_replace.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a772b745e47d3..1bfc00f8d31ac 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -624,6 +624,14 @@ def test_replace_mixed3(self): expected.iloc[1, 1] = m[1] tm.assert_frame_equal(result, expected) + def test_replace_nullable_int_with_string_doesnt_cast(self): + # GH#25438 don't cast df['a'] to float64 + df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]}) + df["a"] = df["a"].astype("Int64") + + res = df.replace("", np.nan) + tm.assert_series_equal(res["a"], df["a"]) + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) def test_replace_with_nullable_column(self, dtype): # GH-44499