From eea19629f2b42c7757ad02df968b6874bd4abbdc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 13 Aug 2024 00:28:50 +0200 Subject: [PATCH] String dtype: honor mode.string_storage option (and change default to None) (#59488) * String dtype: honor mode.string_storage option (and change default to None) * fix test + explicitly test default * use 'auto' instead of None --- pandas/core/arrays/string_.py | 12 ++++++++---- pandas/core/config_init.py | 7 +++---- pandas/tests/arrays/string_/test_string_arrow.py | 10 ++++------ pandas/tests/dtypes/test_common.py | 13 +++++++++---- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f2811703cbecf8..c881437ba25aff 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -136,12 +136,16 @@ def __init__( # infer defaults if storage is None: if na_value is not libmissing.NA: - if HAS_PYARROW: - storage = "pyarrow" - else: - storage = "python" + storage = get_option("mode.string_storage") + if storage == "auto": + if HAS_PYARROW: + storage = "pyarrow" + else: + storage = "python" else: storage = get_option("mode.string_storage") + if storage == "auto": + storage = "python" if storage == "pyarrow_numpy": # TODO raise a deprecation warning diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4cd7e50f0ec502..a1df455eebacf8 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -505,13 +505,12 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc = """ : string - The default storage for StringDtype. This option is ignored if - ``future.infer_string`` is set to True. + The default storage for StringDtype. """ def is_valid_string_storage(value: Any) -> None: - legal_values = ["python", "pyarrow"] + legal_values = ["auto", "python", "pyarrow"] if value not in legal_values: msg = "Value must be one of python|pyarrow" if value == "pyarrow_numpy": @@ -526,7 +525,7 @@ def is_valid_string_storage(value: Any) -> None: with cf.config_prefix("mode"): cf.register_option( "string_storage", - "python", + "auto", string_storage_doc, # validator=is_one_of_factory(["python", "pyarrow"]), validator=is_valid_string_storage, diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 6bab04e95de9eb..72d672ba8a7d92 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -27,11 +26,10 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage == "python" and HAS_PYARROW: - # string storage with na_value=NaN always uses pyarrow if available - # -> does not yet honor the option - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) +def test_config(string_storage, using_infer_string): + # with the default string_storage setting + # always "python" at the moment + assert StringDtype().storage == "python" with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index e0232bb292d6eb..ccd30caba5dee1 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -3,6 +3,7 @@ import numpy as np import pytest +from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td from pandas.core.dtypes.astype import astype_array @@ -802,13 +803,17 @@ def test_pandas_dtype_ea_not_instance(): def test_pandas_dtype_string_dtypes(string_storage): - # TODO(infer_string) remove skip if "python" is supported - pytest.importorskip("pyarrow") + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype("str") + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") - # TODO(infer_string) hardcoded to pyarrow until python is supported - assert result == pd.StringDtype("pyarrow", na_value=np.nan) + assert result == pd.StringDtype(string_storage, na_value=np.nan) with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage):