pandas-dev
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
Lines changed: 5 additions & 1 deletion
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
Lines changed: 4 additions & 1 deletion
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
Lines changed: 18 additions & 6 deletions
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
Lines changed: 3 additions & 1 deletion
diff --git a/pandas/core/interchange/utils.py b/pandas/core/interchange/utils.py
Lines changed: 6 additions & 1 deletion
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
Lines changed: 1 addition & 1 deletion
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
Lines changed: 1 addition & 1 deletion
diff --git a/pandas/tests/arrays/boolean/test_astype.py b/pandas/tests/arrays/boolean/test_astype.py
Lines changed: 9 additions & 3 deletions
diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py
Lines changed: 1 addition & 1 deletion
diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
Lines changed: 1 addition & 1 deletion
@@ -12,6 +12,7 @@
 
 import numpy as np
 
+from pandas._config import using_string_dtype
 from pandas._config.localization import (
     can_set_locale,
     get_locales,
@@ -106,7 +107,10 @@
 ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]
 
 COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
-STRING_DTYPES: list[Dtype] = [str, "str", "U"]
+if using_string_dtype():
+    STRING_DTYPES: list[Dtype] = [str, "U"]
+else:
+    STRING_DTYPES: list[Dtype] = [str, "str", "U"]  # type: ignore[no-redef]
 COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
 
 DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]
 
@@ -575,7 +575,10 @@ def __getitem__(self, item: PositionalIndexer):
         if isinstance(item, np.ndarray):
             if not len(item):
                 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
-                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
+                if (
+                    isinstance(self._dtype, StringDtype)
+                    and self._dtype.storage == "pyarrow"
+                ):
                     # TODO(infer_string) should this be large_string?
                     pa_dtype = pa.string()
                 else:
 
@@ -4,7 +4,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    ClassVar,
     Literal,
     cast,
 )
@@ -118,9 +117,12 @@ class StringDtype(StorageExtensionDtype):
     string[pyarrow]
     """
 
-    # error: Cannot override instance variable (previously declared on
-    # base class "StorageExtensionDtype") with class variable
-    name: ClassVar[str] = "string"  # type: ignore[misc]
+    @property
+    def name(self) -> str:  # type: ignore[override]
+        if self._na_value is libmissing.NA:
+            return "string"
+        else:
+            return "str"
 
     #: StringDtype().na_value uses pandas.NA except the implementation that
     # follows NumPy semantics, which uses nan.
@@ -137,7 +139,7 @@ def __init__(
     ) -> None:
         # infer defaults
         if storage is None:
-            if using_string_dtype() and na_value is not libmissing.NA:
+            if na_value is not libmissing.NA:
                 if HAS_PYARROW:
                     storage = "pyarrow"
                 else:
@@ -170,11 +172,19 @@ def __init__(
         self.storage = storage
         self._na_value = na_value
 
+    def __repr__(self) -> str:
+        if self._na_value is libmissing.NA:
+            return f"{self.name}[{self.storage}]"
+        else:
+            # TODO add more informative repr
+            return self.name
+
     def __eq__(self, other: object) -> bool:
         # we need to override the base class __eq__ because na_value (NA or NaN)
         # cannot be checked with normal `==`
         if isinstance(other, str):
-            if other == self.name:
+            # TODO should dtype == "string" work for the NaN variant?
+            if other == "string" or other == self.name:  # noqa: PLR1714
                 return True
             try:
                 other = self.construct_from_string(other)
@@ -231,6 +241,8 @@ def construct_from_string(cls, string) -> Self:
             )
         if string == "string":
             return cls()
+        elif string == "str" and using_string_dtype():
+            return cls(na_value=np.nan)
         elif string == "string[python]":
             return cls(storage="python")
         elif string == "string[pyarrow]":
 
@@ -4807,7 +4807,9 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
         -----
         * To select all *numeric* types, use ``np.number`` or ``'number'``
         * To select strings you must use the ``object`` dtype, but note that
-          this will return *all* object dtype columns
+          this will return *all* object dtype columns. With
+          ``pd.options.future.infer_string`` enabled, using ``"str"`` will
+          work to select all string columns.
         * See the `numpy dtype hierarchy
           <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
         * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
 
@@ -135,7 +135,12 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
     if format_str is not None:
         return format_str
 
-    if lib.is_np_dtype(dtype, "M"):
+    if isinstance(dtype, pd.StringDtype):
+        # TODO(infer_string) this should be LARGE_STRING for pyarrow storage,
+        # but current tests don't cover this distinction
+        return ArrowCTypes.STRING
+
+    elif lib.is_np_dtype(dtype, "M"):
         # Selecting the first char of resolution string:
         # dtype.str -> '<M8[ns]' -> 'n'
         resolution = np.datetime_data(dtype)[0][0]
 
@@ -110,7 +110,7 @@ def test_numba_unsupported_dtypes(apply_axis):
 
     with pytest.raises(
         ValueError,
-        match="Column b must have a numeric dtype. Found 'object|string' instead",
+        match="Column b must have a numeric dtype. Found 'object|str' instead",
     ):
         df.apply(f, engine="numba", axis=apply_axis)
 
 
@@ -224,7 +224,7 @@ def test_apply_categorical(by_row, using_infer_string):
     result = ser.apply(lambda x: "A")
     exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
     tm.assert_series_equal(result, exp)
-    assert result.dtype == object if not using_infer_string else "string[pyarrow_numpy]"
+    assert result.dtype == object if not using_infer_string else "str"
 
 
 @pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])
 
@@ -5,7 +5,7 @@
 import pandas._testing as tm
 
 
-def test_astype():
+def test_astype(using_infer_string):
     # with missing values
     arr = pd.array([True, False, None], dtype="boolean")
 
@@ -20,8 +20,14 @@ def test_astype():
     tm.assert_numpy_array_equal(result, expected)
 
     result = arr.astype("str")
-    expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
-    tm.assert_numpy_array_equal(result, expected)
+    if using_infer_string:
+        expected = pd.array(
+            ["True", "False", None], dtype=pd.StringDtype(na_value=np.nan)
+        )
+        tm.assert_extension_array_equal(result, expected)
+    else:
+        expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
+        tm.assert_numpy_array_equal(result, expected)
 
     # no missing values
     arr = pd.array([True, False, True], dtype="boolean")
 
@@ -88,7 +88,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
 
-        msg = r"Cannot cast object|string dtype to float64"
+        msg = r"Cannot cast object|str dtype to float64"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
 
 
@@ -22,7 +22,7 @@ def test_print(self, using_infer_string):
         if using_infer_string:
             expected = [
                 "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-                "Categories (3, string): [a < b < c]",
+                "Categories (3, str): [a < b < c]",
             ]
         else:
             expected = [
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@ def test_print(self, using_infer_string):`
`22`	`22`	`if using_infer_string:`
`23`	`23`	`expected = [`
`24`	`24`	`"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",`
`25`		`- "Categories (3, string): [a < b < c]",`
	`25`	`+ "Categories (3, str): [a < b < c]",`
`26`	`26`	`]`
`27`	`27`	`else:`
`28`	`28`	`expected = [`