Skip to content

Commit a2fb11e

Browse files
String dtype: use 'str' string alias and representation for NaN-variant of the dtype (#59388)
1 parent 7147203 commit a2fb11e

File tree

79 files changed

+305
-191
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

79 files changed

+305
-191
lines changed

pandas/_testing/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import numpy as np
1414

15+
from pandas._config import using_string_dtype
1516
from pandas._config.localization import (
1617
can_set_locale,
1718
get_locales,
@@ -106,7 +107,10 @@
106107
ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]
107108

108109
COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
109-
STRING_DTYPES: list[Dtype] = [str, "str", "U"]
110+
if using_string_dtype():
111+
STRING_DTYPES: list[Dtype] = [str, "U"]
112+
else:
113+
STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef]
110114
COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
111115

112116
DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]

pandas/core/arrays/arrow/array.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer):
575575
if isinstance(item, np.ndarray):
576576
if not len(item):
577577
# Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
578-
if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
578+
if (
579+
isinstance(self._dtype, StringDtype)
580+
and self._dtype.storage == "pyarrow"
581+
):
579582
# TODO(infer_string) should this be large_string?
580583
pa_dtype = pa.string()
581584
else:

pandas/core/arrays/string_.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from typing import (
55
TYPE_CHECKING,
66
Any,
7-
ClassVar,
87
Literal,
98
cast,
109
)
@@ -118,9 +117,12 @@ class StringDtype(StorageExtensionDtype):
118117
string[pyarrow]
119118
"""
120119

121-
# error: Cannot override instance variable (previously declared on
122-
# base class "StorageExtensionDtype") with class variable
123-
name: ClassVar[str] = "string" # type: ignore[misc]
120+
@property
121+
def name(self) -> str: # type: ignore[override]
122+
if self._na_value is libmissing.NA:
123+
return "string"
124+
else:
125+
return "str"
124126

125127
#: StringDtype().na_value uses pandas.NA except the implementation that
126128
# follows NumPy semantics, which uses nan.
@@ -137,7 +139,7 @@ def __init__(
137139
) -> None:
138140
# infer defaults
139141
if storage is None:
140-
if using_string_dtype() and na_value is not libmissing.NA:
142+
if na_value is not libmissing.NA:
141143
if HAS_PYARROW:
142144
storage = "pyarrow"
143145
else:
@@ -170,11 +172,19 @@ def __init__(
170172
self.storage = storage
171173
self._na_value = na_value
172174

175+
def __repr__(self) -> str:
176+
if self._na_value is libmissing.NA:
177+
return f"{self.name}[{self.storage}]"
178+
else:
179+
# TODO add more informative repr
180+
return self.name
181+
173182
def __eq__(self, other: object) -> bool:
174183
# we need to override the base class __eq__ because na_value (NA or NaN)
175184
# cannot be checked with normal `==`
176185
if isinstance(other, str):
177-
if other == self.name:
186+
# TODO should dtype == "string" work for the NaN variant?
187+
if other == "string" or other == self.name: # noqa: PLR1714
178188
return True
179189
try:
180190
other = self.construct_from_string(other)
@@ -231,6 +241,8 @@ def construct_from_string(cls, string) -> Self:
231241
)
232242
if string == "string":
233243
return cls()
244+
elif string == "str" and using_string_dtype():
245+
return cls(na_value=np.nan)
234246
elif string == "string[python]":
235247
return cls(storage="python")
236248
elif string == "string[pyarrow]":

pandas/core/frame.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4807,7 +4807,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
48074807
-----
48084808
* To select all *numeric* types, use ``np.number`` or ``'number'``
48094809
* To select strings you must use the ``object`` dtype, but note that
4810-
this will return *all* object dtype columns
4810+
this will return *all* object dtype columns. With
4811+
``pd.options.future.infer_string`` enabled, using ``"str"`` will
4812+
work to select all string columns.
48114813
* See the `numpy dtype hierarchy
48124814
<https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
48134815
* To select datetimes, use ``np.datetime64``, ``'datetime'`` or

pandas/core/interchange/utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
135135
if format_str is not None:
136136
return format_str
137137

138-
if lib.is_np_dtype(dtype, "M"):
138+
if isinstance(dtype, pd.StringDtype):
139+
# TODO(infer_string) this should be LARGE_STRING for pyarrow storage,
140+
# but current tests don't cover this distinction
141+
return ArrowCTypes.STRING
142+
143+
elif lib.is_np_dtype(dtype, "M"):
139144
# Selecting the first char of resolution string:
140145
# dtype.str -> '<M8[ns]' -> 'n'
141146
resolution = np.datetime_data(dtype)[0][0]

pandas/tests/apply/test_numba.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis):
110110

111111
with pytest.raises(
112112
ValueError,
113-
match="Column b must have a numeric dtype. Found 'object|string' instead",
113+
match="Column b must have a numeric dtype. Found 'object|str' instead",
114114
):
115115
df.apply(f, engine="numba", axis=apply_axis)
116116

pandas/tests/apply/test_series_apply.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def test_apply_categorical(by_row, using_infer_string):
224224
result = ser.apply(lambda x: "A")
225225
exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
226226
tm.assert_series_equal(result, exp)
227-
assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]"
227+
assert result.dtype == object if not using_infer_string else "str"
228228

229229

230230
@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])

pandas/tests/arrays/boolean/test_astype.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import pandas._testing as tm
66

77

8-
def test_astype():
8+
def test_astype(using_infer_string):
99
# with missing values
1010
arr = pd.array([True, False, None], dtype="boolean")
1111

@@ -20,8 +20,14 @@ def test_astype():
2020
tm.assert_numpy_array_equal(result, expected)
2121

2222
result = arr.astype("str")
23-
expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
24-
tm.assert_numpy_array_equal(result, expected)
23+
if using_infer_string:
24+
expected = pd.array(
25+
["True", "False", None], dtype=pd.StringDtype(na_value=np.nan)
26+
)
27+
tm.assert_extension_array_equal(result, expected)
28+
else:
29+
expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
30+
tm.assert_numpy_array_equal(result, expected)
2531

2632
# no missing values
2733
arr = pd.array([True, False, True], dtype="boolean")

pandas/tests/arrays/categorical/test_astype.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_astype(self, ordered):
8888
expected = np.array(cat)
8989
tm.assert_numpy_array_equal(result, expected)
9090

91-
msg = r"Cannot cast object|string dtype to float64"
91+
msg = r"Cannot cast object|str dtype to float64"
9292
with pytest.raises(ValueError, match=msg):
9393
cat.astype(float)
9494

pandas/tests/arrays/categorical/test_repr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def test_print(self, using_infer_string):
2222
if using_infer_string:
2323
expected = [
2424
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
25-
"Categories (3, string): [a < b < c]",
25+
"Categories (3, str): [a < b < c]",
2626
]
2727
else:
2828
expected = [

0 commit comments

Comments
 (0)