Skip to content

Commit 3428a7f

Browse files
Switch default string storage from python to pyarrow (if installed) also for NA-variant of the StringDtype (#62118)
1 parent f6d5094 commit 3428a7f

File tree

14 files changed

+59
-44
lines changed

14 files changed

+59
-44
lines changed

pandas/_libs/parsers.pyx

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@ import warnings
1010

1111
from pandas.util._exceptions import find_stack_level
1212

13-
from pandas import StringDtype
13+
from pandas import (
14+
ArrowDtype,
15+
StringDtype,
16+
)
1417
from pandas.core.arrays import (
1518
ArrowExtensionArray,
1619
BooleanArray,
@@ -43,7 +46,6 @@ from libc.string cimport (
4346
strncpy,
4447
)
4548

46-
4749
import numpy as np
4850

4951
cimport numpy as cnp
@@ -1452,7 +1454,13 @@ def _maybe_upcast(
14521454

14531455
elif arr.dtype == np.object_:
14541456
if use_dtype_backend:
1455-
dtype = StringDtype()
1457+
if dtype_backend == "pyarrow":
1458+
# using the StringDtype below would use large_string by default
1459+
# so stick to pyarrow's default string type here
1460+
import pyarrow as pa
1461+
dtype = ArrowDtype(pa.string())
1462+
else:
1463+
dtype = StringDtype()
14561464
cls = dtype.construct_array_type()
14571465
arr = cls._from_sequence(arr, dtype=dtype)
14581466

pandas/core/arrays/string_.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@
7575

7676
from pandas.io.formats import printing
7777

78+
if HAS_PYARROW:
79+
import pyarrow as pa
80+
import pyarrow.compute as pc
81+
7882
if TYPE_CHECKING:
7983
from collections.abc import MutableMapping
8084

@@ -128,10 +132,10 @@ class StringDtype(StorageExtensionDtype):
128132
Examples
129133
--------
130134
>>> pd.StringDtype()
131-
<StringDtype(storage='python', na_value=<NA>)>
132-
133-
>>> pd.StringDtype(storage="pyarrow")
134135
<StringDtype(na_value=<NA>)>
136+
137+
>>> pd.StringDtype(storage="python")
138+
<StringDtype(storage='python', na_value=<NA>)>
135139
"""
136140

137141
@property
@@ -156,16 +160,11 @@ def __init__(
156160
) -> None:
157161
# infer defaults
158162
if storage is None:
159-
if na_value is not libmissing.NA:
160-
storage = get_option("mode.string_storage")
161-
if storage == "auto":
162-
if HAS_PYARROW:
163-
storage = "pyarrow"
164-
else:
165-
storage = "python"
166-
else:
167-
storage = get_option("mode.string_storage")
168-
if storage == "auto":
163+
storage = get_option("mode.string_storage")
164+
if storage == "auto":
165+
if HAS_PYARROW:
166+
storage = "pyarrow"
167+
else:
169168
storage = "python"
170169

171170
if storage == "pyarrow_numpy":
@@ -343,7 +342,15 @@ def __from_arrow__(
343342
Construct StringArray from pyarrow Array/ChunkedArray.
344343
"""
345344
if self.storage == "pyarrow":
346-
from pandas.core.arrays.string_arrow import ArrowStringArray
345+
from pandas.core.arrays.string_arrow import (
346+
ArrowStringArray,
347+
_chk_pyarrow_available,
348+
)
349+
350+
_chk_pyarrow_available()
351+
352+
if not pa.types.is_large_string(array.type):
353+
array = pc.cast(array, pa.large_string())
347354

348355
return ArrowStringArray(array, dtype=self)
349356

@@ -612,7 +619,7 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]
612619
Examples
613620
--------
614621
>>> pd.array(["This is", "some text", None, "data."], dtype="string")
615-
<StringArray>
622+
<ArrowStringArray>
616623
['This is', 'some text', <NA>, 'data.']
617624
Length: 4, dtype: string
618625
@@ -624,15 +631,15 @@ class StringArray(BaseStringArray, NumpyExtensionArray): # type: ignore[misc]
624631
['1', 1]
625632
Length: 2, dtype: object
626633
>>> pd.array(["1", 1], dtype="string")
627-
<StringArray>
634+
<ArrowStringArray>
628635
['1', '1']
629636
Length: 2, dtype: string
630637
631638
However, instantiating StringArrays directly with non-strings will raise an error.
632639
633640
For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
634641
635-
>>> pd.array(["a", None, "c"], dtype="string") == "a"
642+
>>> pd.array(["a", None, "c"], dtype="string[python]") == "a"
636643
<BooleanArray>
637644
[True, <NA>, False]
638645
Length: 3, dtype: boolean

pandas/core/construction.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,14 +229,14 @@ def array(
229229
Length: 2, dtype: Float64
230230
231231
>>> pd.array(["a", None, "c"])
232-
<StringArray>
232+
<ArrowStringArray>
233233
['a', <NA>, 'c']
234234
Length: 3, dtype: string
235235
236-
>>> with pd.option_context("string_storage", "pyarrow"):
236+
>>> with pd.option_context("string_storage", "python"):
237237
... arr = pd.array(["a", None, "c"])
238238
>>> arr
239-
<ArrowStringArray>
239+
<StringArray>
240240
['a', <NA>, 'c']
241241
Length: 3, dtype: string
242242

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -736,9 +736,7 @@ def test_interval(self):
736736

737737
def test_categorical_extension_array_nullable(self, nulls_fixture):
738738
# GH:
739-
arr = pd.arrays.StringArray._from_sequence(
740-
[nulls_fixture] * 2, dtype=pd.StringDtype()
741-
)
739+
arr = pd.array([nulls_fixture] * 2, dtype=pd.StringDtype())
742740
result = Categorical(arr)
743741
assert arr.dtype == result.categories.dtype
744742
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))

pandas/tests/arrays/string_/test_string_arrow.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import pytest
66

7+
from pandas.compat import HAS_PYARROW
78
import pandas.util._test_decorators as td
89

910
import pandas as pd
@@ -25,10 +26,10 @@ def test_eq_all_na():
2526
tm.assert_extension_array_equal(result, expected)
2627

2728

28-
def test_config(string_storage, using_infer_string):
29+
def test_config(string_storage):
2930
# with the default string_storage setting
3031
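# "pyarrow" if pyarrow is installed, otherwise "python"
# NOTE(review): the assert below parses as
# `(storage == "pyarrow") if HAS_PYARROW else "python"`; without pyarrow it
# asserts a truthy string and can never fail. Parenthesize the conditional:
# `== ("pyarrow" if HAS_PYARROW else "python")`.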
31-
assert StringDtype().storage == "python"
32+
assert StringDtype().storage == "pyarrow" if HAS_PYARROW else "python"
3233

3334
with pd.option_context("string_storage", string_storage):
3435
assert StringDtype().storage == string_storage

pandas/tests/copy_view/test_array.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def test_dataframe_array_ea_dtypes():
128128

129129

130130
def test_dataframe_array_string_dtype():
131-
df = DataFrame({"a": ["a", "b"]}, dtype="string")
131+
df = DataFrame({"a": ["a", "b"]}, dtype="string[python]")
132132
arr = np.asarray(df)
133133
assert np.shares_memory(arr, get_array(df, "a"))
134134
assert arr.flags.writeable is False

pandas/tests/copy_view/test_astype.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def test_astype_numpy_to_ea():
8282

8383

8484
@pytest.mark.parametrize(
85-
"dtype, new_dtype", [("object", "string"), ("string", "object")]
85+
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
8686
)
8787
def test_astype_string_and_object(dtype, new_dtype):
8888
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
@@ -95,7 +95,7 @@ def test_astype_string_and_object(dtype, new_dtype):
9595

9696

9797
@pytest.mark.parametrize(
98-
"dtype, new_dtype", [("object", "string"), ("string", "object")]
98+
"dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
9999
)
100100
def test_astype_string_and_object_update_original(dtype, new_dtype):
101101
df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
@@ -218,9 +218,7 @@ def test_convert_dtypes(using_infer_string):
218218
df_orig = df.copy()
219219
df2 = df.convert_dtypes()
220220

221-
if using_infer_string and HAS_PYARROW:
222-
# TODO the default nullable string dtype still uses python storage
223-
# this should be changed to pyarrow if installed
221+
if HAS_PYARROW:
224222
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
225223
else:
226224
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))

pandas/tests/dtypes/test_common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def test_period_dtype(self, dtype):
118118
"float": np.dtype(np.float64),
119119
"object": np.dtype(object),
120120
"category": com.pandas_dtype("category"),
121-
"string": pd.StringDtype(),
121+
"string": pd.StringDtype("python"),
122122
}
123123

124124

pandas/tests/frame/methods/test_convert_dtypes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def test_convert_dtypes_avoid_block_splitting(self):
199199
{
200200
"a": [1, 2, 3],
201201
"b": [4, 5, 6],
202-
"c": pd.Series(["a"] * 3, dtype="string[python]"),
202+
"c": pd.Series(["a"] * 3, dtype="string"),
203203
}
204204
)
205205
tm.assert_frame_equal(result, expected)
@@ -209,7 +209,7 @@ def test_convert_dtypes_from_arrow(self):
209209
# GH#56581
210210
df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
211211
result = df.convert_dtypes()
212-
expected = df.astype({"a": "string[python]"})
212+
expected = df.astype({"a": "string"})
213213
tm.assert_frame_equal(result, expected)
214214

215215
def test_convert_dtype_pyarrow_timezone_preserve(self):

pandas/tests/io/excel/test_readers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,10 @@ def test_dtype_backend(self, read_ext, dtype_backend, engine, tmp_excel):
657657
for col in df.columns
658658
}
659659
)
660+
661+
# pandas uses large_string by default, but pyarrow infers string
662+
expected["d"] = expected["d"].astype(pd.ArrowDtype(pa.string()))
663+
expected["h"] = expected["h"].astype(pd.ArrowDtype(pa.string()))
660664
# pyarrow by default infers timestamp resolution as us, not ns
661665
expected["i"] = ArrowExtensionArray(
662666
expected["i"].array._pa_array.cast(pa.timestamp(unit="us"))

0 commit comments

Comments
 (0)