diff --git a/doc/source/whatsnew/v2.3.2.rst b/doc/source/whatsnew/v2.3.2.rst index 3bb74da4bd6f4..dc1927087a66b 100644 --- a/doc/source/whatsnew/v2.3.2.rst +++ b/doc/source/whatsnew/v2.3.2.rst @@ -22,6 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ +- Fix :meth:`~Series.str.isdigit` to correctly recognize unicode superscript + characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`) - Fix :meth:`~DataFrame.to_json` with ``orient="table"`` to correctly use the "string" type in the JSON Table Schema for :class:`StringDtype` columns (:issue:`61889`) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index b2c1e07b23a1e..d4f4c5bdea0a0 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -15,6 +15,7 @@ from pandas.compat import ( HAS_PYARROW, pa_version_under17p0, + pa_version_under21p0, ) if HAS_PYARROW: @@ -267,6 +268,12 @@ def _str_isdecimal(self): return self._convert_bool_result(result) def _str_isdigit(self): + if pa_version_under21p0: + # https://github.com/pandas-dev/pandas/issues/61466 + res_list = self._apply_elementwise(str.isdigit) + return self._convert_bool_result( + pa.chunked_array(res_list, type=pa.bool_()) + ) result = pc.utf8_is_digit(self._pa_array) return self._convert_bool_result(result) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 21e6e2efbe778..b78ea3a9bf883 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3602,16 +3602,26 @@ def casefold(self): Series.str.isupper : Check whether all characters are uppercase. Series.str.istitle : Check whether all characters are titlecase. - Examples - -------- + Notes + ----- Similar to ``str.isdecimal`` but also includes special digits, like superscripted and subscripted digits in unicode. + The exact behavior of this method, i.e. 
which unicode characters are + considered as digits, depends on the backend used for string operations, + and there can be small differences. + For example, Python considers the ³ superscript character as a digit, but + not the ⅕ fraction character, while PyArrow considers both as digits. For + simple (ascii) decimal numbers, the behaviour is consistent. + + Examples + -------- + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True - 1 False - 2 False + 1 True + 2 True 3 False dtype: bool """ diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 24f7245f032ed..fc0dd23334706 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -7,6 +7,7 @@ import numpy as np import pytest +from pandas.compat import pa_version_under21p0 from pandas.errors import Pandas4Warning from pandas import ( @@ -15,6 +16,7 @@ Index, MultiIndex, Series, + StringDtype, option_context, ) import pandas._testing as tm @@ -249,8 +251,9 @@ def test_ismethods(method, expected, any_string_dtype): @pytest.mark.parametrize( "method, expected", [ - ("isnumeric", [False, True, True, False, True, True, False]), - ("isdecimal", [False, True, False, False, False, True, False]), + ("isnumeric", [False, True, True, True, False, True, True, False]), + ("isdecimal", [False, True, False, False, False, False, True, False]), + ("isdigit", [False, True, True, False, False, False, True, False]), ], ) def test_isnumeric_unicode(method, expected, any_string_dtype): @@ -259,19 +262,35 @@ def test_isnumeric_unicode(method, expected, any_string_dtype): # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY # 0xFF13: 3 Em 3 # noqa: RUF003 ser = Series( - ["A", "3", "¼", "★", "፸", "3", "four"], # noqa: RUF001 + ["A", "3", "³", "¼", "★", "፸", "3", "four"], # noqa: RUF001 dtype=any_string_dtype, ) expected_dtype = ( "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) expected = Series(expected, dtype=expected_dtype) + if ( + 
method == "isdigit" + and isinstance(ser.dtype, StringDtype) + and ser.dtype.storage == "pyarrow" + and not pa_version_under21p0 + ): + # known difference in behavior between python and pyarrow unicode handling + # pyarrow 21+ considers ¼ and ፸ as digits, while python does not + expected.iloc[3] = True + expected.iloc[5] = True + result = getattr(ser.str, method)() tm.assert_series_equal(result, expected) # compare with standard library - expected = [getattr(item, method)() for item in ser] - assert list(result) == expected + # (only for non-pyarrow storage given the above differences) + if any_string_dtype == "object" or ( + isinstance(any_string_dtype, StringDtype) + and any_string_dtype.storage == "python" + ): + expected = [getattr(item, method)() for item in ser] + assert list(result) == expected @pytest.mark.parametrize(