diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d71160cdbc369..b38e95c86d8cd 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -147,6 +147,7 @@ Other API changes - The ``other`` argument in :meth:`DataFrame.mask` and :meth:`Series.mask` now defaults to ``no_default`` instead of ``np.nan`` consistent with :meth:`DataFrame.where` and :meth:`Series.where`. Entries will be filled with the corresponding NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension dtypes). (:issue:`49111`) - When creating a :class:`Series` with a object-dtype :class:`Index` of datetime objects, pandas no longer silently converts the index to a :class:`DatetimeIndex` (:issue:`39307`, :issue:`23598`) - :meth:`Series.unique` with dtype "timedelta64[ns]" or "datetime64[ns]" now returns :class:`TimedeltaArray` or :class:`DatetimeArray` instead of ``numpy.ndarray`` (:issue:`49176`) +- :func:`pandas.api.types.is_string_dtype` now only returns ``True`` for array-likes with ``dtype=object`` when the elements are inferred to be strings (:issue:`15585`) - Passing a sequence containing ``datetime`` objects and ``date`` objects to :class:`Series` constructor will return with ``object`` dtype instead of ``datetime64[ns]`` dtype, consistent with :class:`Index` behavior (:issue:`49341`) - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - @@ -393,7 +394,7 @@ Conversion Strings ^^^^^^^ -- +- Bug in :func:`pandas.api.types.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`) - Interval diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3c2aa1f6bab5d..a7b8e720ad8e2 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -500,6 +500,9 @@ def is_string_dtype(arr_or_dtype) -> 
bool: """ Check whether the provided array or dtype is of the string dtype. + If an array is passed with an object dtype, the elements must be + inferred as strings. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -518,21 +521,23 @@ def is_string_dtype(arr_or_dtype) -> bool: True >>> is_string_dtype(int) False - >>> >>> is_string_dtype(np.array(['a', 'b'])) True >>> is_string_dtype(pd.Series([1, 2])) False + >>> is_string_dtype(pd.Series([1, 2], dtype=object)) + False """ - # TODO: gh-15585: consider making the checks stricter. - def condition(dtype) -> bool: - return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + if hasattr(arr_or_dtype, "dtype") and get_dtype(arr_or_dtype).kind == "O": + return is_all_strings(arr_or_dtype) - def is_excluded_dtype(dtype) -> bool: - """ - These have kind = "O" but aren't string dtypes so need to be explicitly excluded - """ - return isinstance(dtype, (PeriodDtype, IntervalDtype, CategoricalDtype)) + def condition(dtype) -> bool: + if is_string_or_object_np_dtype(dtype): + return True + try: + return dtype == "string" + except TypeError: + return False return _is_dtype(arr_or_dtype, condition) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 589e2e04d668a..c8a3c992248ad 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -290,6 +290,15 @@ def test_is_string_dtype(): assert com.is_string_dtype(pd.StringDtype()) +@pytest.mark.parametrize( + "data", + [[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)], +) +def test_is_string_dtype_arraylike_with_object_elements_not_strings(data): + # GH 15585 + assert not com.is_string_dtype(pd.Series(data)) + + def test_is_string_dtype_nullable(nullable_string_dtype): assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype)) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 
ea4443010c6a6..32a9246264d69 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -45,10 +45,10 @@ def test_is_dtype_other_input(self, dtype): assert dtype.is_dtype([1, 2, 3]) is False def test_is_not_string_type(self, dtype): - return not is_string_dtype(dtype) + assert not is_string_dtype(dtype) def test_is_not_object_type(self, dtype): - return not is_object_dtype(dtype) + assert not is_object_dtype(dtype) def test_eq_with_str(self, dtype): assert dtype == dtype.name diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 148059a6a16f3..d6a5557c89f14 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -26,6 +26,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_object_dtype from pandas.core.arrays.numpy_ import PandasArray from pandas.core.internals import blocks from pandas.tests.extension import base @@ -218,6 +219,14 @@ def test_check_dtype(self, data, request): ) super().test_check_dtype(data) + def test_is_not_object_type(self, dtype, request): + if dtype.numpy_dtype == "object": + # Different from BaseDtypeTests.test_is_not_object_type + # because PandasDtype(object) is an object type + assert is_object_dtype(dtype) + else: + super().test_is_not_object_type(dtype) + class TestGetitem(BaseNumPyTests, base.BaseGetitemTests): @skip_nested diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index c5aebb282bafa..8cbd4342ea13f 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,6 +26,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.types import is_string_dtype from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype from pandas.tests.extension import base @@ -106,6 +107,11 @@ def test_eq_with_str(self, dtype): assert dtype == f"string[{dtype.storage}]" 
super().test_eq_with_str(dtype) + def test_is_not_string_type(self, dtype): + # Different from BaseDtypeTests.test_is_not_string_type + # because StringDtype is a string type + assert is_string_dtype(dtype) + class TestInterface(base.BaseInterfaceTests): def test_view(self, data, request):