From e6a3f40d56334f1c248e0c4774dd27295f6701bf Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 20 Nov 2020 22:20:26 -0500 Subject: [PATCH 01/13] ENH: add dtype validation for value in StringArray.fillna --- pandas/core/arrays/string_.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b297e7c2b13b..d55b1350a85cb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,8 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - # TODO: validate dtype + if not isinstance(value, str): + raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) def astype(self, dtype, copy=True): From 0a85704f66cabe2f275a3edb28908db2bdbcdec8 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 20 Nov 2020 22:26:50 -0500 Subject: [PATCH 02/13] TST: test fillna args --- pandas/tests/arrays/string_/test_string.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 07e9484994c26..faa2ef2b053c8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -422,6 +422,16 @@ def test_reduce_missing(skipna, dtype): assert pd.isna(result) +def test_fillna_args(): + arr = pd.array(["a", pd.NA], dtype="string") + arr.fillna(value="b") + + arr = pd.array(["a", "b"], dtype="string") + msg = r"1 is not a valid fill value; must be a string" + with pytest.raises(TypeError, match=msg): + arr.fillna(value=1) + + @td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(dtype): # protocol added in 0.15.0 From 48146144cc92e7507b82f22bb5936fec4ac9f2c6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 20 Nov 2020 22:20:26 -0500 Subject: [PATCH 03/13] ENH: add dtype validation for value in StringArray.fillna --- pandas/core/arrays/string_.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 3b297e7c2b13b..d55b1350a85cb 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,8 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - # TODO: validate dtype + if not isinstance(value, str): + raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) def astype(self, dtype, copy=True): From 7051e970f0d812d7ea003216c27e1f9ef63238df Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Fri, 20 Nov 2020 22:26:50 -0500 Subject: [PATCH 04/13] TST: test fillna args --- pandas/tests/arrays/string_/test_string.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 07e9484994c26..faa2ef2b053c8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -422,6 +422,16 @@ def test_reduce_missing(skipna, dtype): assert pd.isna(result) +def test_fillna_args(): + arr = pd.array(["a", pd.NA], dtype="string") + arr.fillna(value="b") + + arr = pd.array(["a", "b"], dtype="string") + msg = r"1 is not a valid fill value; must be a string" + with pytest.raises(TypeError, match=msg): + arr.fillna(value=1) + + @td.skip_if_no("pyarrow", min_version="0.15.0") def test_arrow_array(dtype): # protocol added in 0.15.0 From df8e18515d053d8c16c56fb171b83ac9c00dd0d2 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 21 Nov 2020 05:27:56 +0000 Subject: [PATCH 05/13] feedback: broaden validation to np.str_ + allow NA fill value --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d55b1350a85cb..6cc5004483423 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,7 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if not isinstance(value, str): + if not (isinstance(value, (str, np.str_)) or isna(value)): raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) From 43a6646dc6c8428ebfe9424ea59974f29ec33e97 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 21 Nov 2020 05:47:04 +0000 Subject: [PATCH 06/13] TST: add tests with np._str, missing value --- pandas/tests/arrays/string_/test_string.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index faa2ef2b053c8..92f13c5346cb4 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -423,10 +423,18 @@ def test_reduce_missing(skipna, dtype): def test_fillna_args(): + # GH 37987 + arr = pd.array(["a", pd.NA], dtype="string") - arr.fillna(value="b") - arr = pd.array(["a", "b"], dtype="string") + res = arr.fillna(value="b") + expected = pd.array(["a", "b"], dtype="string") + tm.assert_extension_array_equal(res, expected) + + res = arr.fillna(value=np.str_("b")) + expected = pd.array(["a", "b"], dtype="string") + tm.assert_extension_array_equal(res, expected) + msg = r"1 is not a valid fill value; must be a string" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) From 616a6c7d158193ba75678c5cf44e7f8a32d36339 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 21 Nov 2020 05:49:59 +0000 Subject: [PATCH 07/13] skip check if no value provided --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 6cc5004483423..eb4fb6fcee75c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,7 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if not (isinstance(value, (str, np.str_)) or isna(value)): + if value is not None and not (isinstance(value, (str, np.str_)) or isna(value)): raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) From a43de6173d85b371c9eea9ec065ee1a7e2c01335 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 21 Nov 2020 17:26:18 +0000 Subject: [PATCH 08/13] feedback: remove np.str_ check --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index eb4fb6fcee75c..7ccb2172d4a9a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,7 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if value is not None and not (isinstance(value, (str, np.str_)) or isna(value)): + if value is not None and not (isinstance(value, str) or isna(value)): raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) From 24cb928857ee206be228e672c191788f2a21d51b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 21 Nov 2020 19:28:31 +0000 Subject: [PATCH 09/13] add is_string_dtype check --- pandas/core/arrays/string_.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7ccb2172d4a9a..8595e0bdb7286 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -283,7 +283,9 @@ def __setitem__(self, key, value): super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if value is not None and not (isinstance(value, str) or isna(value)): + if value is not None and not ( + isinstance(value, str) or is_string_dtype(value) or isna(value) + ): raise TypeError(f"{value} is not a valid fill value; must be a string") return super().fillna(value, method, limit) From b6587917aa14eb746b91c3d50451cee1f9e08087 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 24 Nov 2020 00:31:31 -0500 Subject: [PATCH 10/13] rewrite using _validate_setitem_value --- pandas/core/arrays/base.py | 5 ++++ pandas/core/arrays/string_.py | 34 ++++++++++++---------- pandas/tests/arrays/string_/test_string.py | 4 +-- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 448025e05422d..1364ab0ef5181 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,6 +8,7 @@ """ from __future__ import annotations +import abc import operator from typing import ( Any, @@ -607,6 +608,10 @@ def argmax(self): """ return nargminmax(self, "argmax") + @abc.abstractmethod + def _validate_setitem_value(self, value): + pass + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 8595e0bdb7286..d4cfea2f6706a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -254,19 +254,8 @@ def _values_for_factorize(self): arr[mask] = -1 return arr, -1 - def __setitem__(self, key, value): - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract PandasArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) + def _validate_setitem_value(self, value): scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items if scalar_value: if isna(value): value = StringDtype.na_value @@ -279,14 +268,27 @@ def __setitem__(self, key, value): value = np.asarray(value, dtype=object) if len(value) and not lib.is_string_array(value, skipna=True): raise ValueError("Must provide strings.") + return value + + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + if isinstance(value, type(self)): + # extract_array doesn't extract PandasArray subclasses + value = value._ndarray + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + value = self._validate_setitem_value(value) super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if value is not None and not ( - isinstance(value, str) or is_string_dtype(value) or isna(value) - ): - raise TypeError(f"{value} is not a valid fill value; must be a string") + if value is not None: + value = self._validate_setitem_value(value) return super().fillna(value, method, limit) def astype(self, dtype, copy=True): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 92f13c5346cb4..9a1634380aaba 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -435,8 +435,8 @@ def test_fillna_args(): expected = pd.array(["a", "b"], dtype="string") tm.assert_extension_array_equal(res, expected) - msg = r"1 is not a valid fill value; must be a string" - with pytest.raises(TypeError, match=msg): + msg = "Cannot set non-string value '1' into a StringArray." + with pytest.raises(ValueError, match=msg): arr.fillna(value=1) From 135f55a1a8ef79370bbac0ba7ce7214e6c60b63c Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 24 Nov 2020 11:11:43 -0500 Subject: [PATCH 11/13] revert changes to base class --- pandas/core/arrays/base.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1364ab0ef5181..448025e05422d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,7 +8,6 @@ """ from __future__ import annotations -import abc import operator from typing import ( Any, @@ -608,10 +607,6 @@ def argmax(self): """ return nargminmax(self, "argmax") - @abc.abstractmethod - def _validate_setitem_value(self, value): - pass - def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. From 719d3cdbcab448bf7953563fa65c58ba6ddce480 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 25 Nov 2020 09:19:32 -0500 Subject: [PATCH 12/13] feedback: revert changes to string_.py --- pandas/core/arrays/string_.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index d4cfea2f6706a..57c16de6fe396 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -254,8 +254,19 @@ def _values_for_factorize(self): arr[mask] = -1 return arr, -1 - def _validate_setitem_value(self, value): + def __setitem__(self, key, value): + value = extract_array(value, extract_numpy=True) + if isinstance(value, type(self)): + # extract_array doesn't extract PandasArray subclasses + value = value._ndarray + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + # validate new items if scalar_value: if isna(value): value = StringDtype.na_value @@ -268,27 +279,10 @@ def _validate_setitem_value(self, value): value = np.asarray(value, dtype=object) if len(value) and not lib.is_string_array(value, skipna=True): raise ValueError("Must provide strings.") - return value - - def __setitem__(self, key, value): - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract PandasArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - value = self._validate_setitem_value(value) super().__setitem__(key, value) def fillna(self, value=None, method=None, limit=None): - if value is not None: - value = self._validate_setitem_value(value) return super().fillna(value, method, limit) def astype(self, dtype, copy=True): From 0e84df86a8a1f0f2a719ce51cee05622e8632845 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 25 Nov 2020 10:45:10 -0500 Subject: [PATCH 13/13] remove fillna override from StringArray --- pandas/core/arrays/string_.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 57c16de6fe396..e75305e55348c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -282,9 +282,6 @@ def __setitem__(self, key, value): super().__setitem__(key, value) - def fillna(self, value=None, method=None, limit=None): - return super().fillna(value, method, limit) - def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, StringDtype):