From 74e4a1c52317915200d4b2e6d76278bedbe8f7f3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 24 Jan 2023 11:58:24 -0800 Subject: [PATCH 1/2] BUG: is_string_dtype returns True for ArrowDtype(pa.string()) --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/core/arrays/arrow/dtype.py | 3 +++ pandas/tests/extension/test_arrow.py | 9 ++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 2d9d78da130ef..add1d7edfa777 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1021,7 +1021,7 @@ Conversion Strings ^^^^^^^ -- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` (:issue:`15585`) +- Bug in :func:`pandas.api.dtypes.is_string_dtype` that would not return ``True`` for :class:`StringDtype` or :class:`ArrowDtype` with ``pyarrow.string()`` (:issue:`15585`) - Bug in converting string dtypes to "datetime64[ns]" or "timedelta64[ns]" incorrectly raising ``TypeError`` (:issue:`36153`) - diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index f5f87bea83b8f..3e3213b48670f 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -95,6 +95,9 @@ def name(self) -> str: # type: ignore[override] @cache_readonly def numpy_dtype(self) -> np.dtype: """Return an instance of the related numpy dtype""" + if pa.types.is_string(self.pyarrow_dtype): + # pa.string().to_pandas_dtype() = object which we don't want + return np.dtype(str) try: return np.dtype(self.pyarrow_dtype.to_pandas_dtype()) except (NotImplementedError, TypeError): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 2467471e3643e..1f877ebd54431 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -46,6 +46,7 @@ is_integer_dtype, is_numeric_dtype, is_signed_integer_dtype, + is_string_dtype, is_unsigned_integer_dtype, ) from pandas.tests.extension import base @@ -730,7 +731,6 @@ def test_get_common_dtype(self, dtype, request): and (pa_dtype.unit != "ns" or pa_dtype.tz is not None) ) or (pa.types.is_duration(pa_dtype) and pa_dtype.unit != "ns") - or pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype) ): request.node.add_marker( @@ -743,6 +743,13 @@ def test_get_common_dtype(self, dtype, request): ) super().test_get_common_dtype(dtype) + def test_is_not_string_type(self, dtype): + pa_dtype = dtype.pyarrow_dtype + if pa.types.is_string(pa_dtype): + assert is_string_dtype(dtype) + else: + super().test_is_not_string_type(dtype) + class TestBaseIndex(base.BaseIndexTests): pass From b2519bd422325f249bbb7e8c5d5d70803f12c9f4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 24 Jan 2023 15:57:29 -0800 Subject: [PATCH 2/2] adjust groupby test --- pandas/tests/extension/test_arrow.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 1f877ebd54431..10ec9910a8686 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -652,6 +652,24 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping, request): ): super().test_groupby_extension_agg(as_index, data_for_grouping) + def test_in_numeric_groupby(self, data_for_grouping): + if is_string_dtype(data_for_grouping.dtype): + df = pd.DataFrame( + { + "A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping, + "C": [1, 1, 1, 1, 1, 1, 1, 1], + } + ) + + expected = pd.Index(["C"]) + with pytest.raises(TypeError, match="does not support"): + df.groupby("A").sum().columns + result = df.groupby("A").sum(numeric_only=True).columns + tm.assert_index_equal(result, expected) + else: + super().test_in_numeric_groupby(data_for_grouping) + class TestBaseDtype(base.BaseDtypeTests): def test_construct_from_string_own_name(self, dtype, request):