From 8be8806d152fce7ad99263717c130f6c50025290 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Aug 2024 17:55:37 +0200 Subject: [PATCH] String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) (#59376) * String dtype: restrict options.mode.string_storage to python|pyarrow (remove pyarrow_numpy) * add type annotation --- pandas/conftest.py | 2 -- pandas/core/config_init.py | 22 +++++++++++-- pandas/tests/arrays/string_/test_string.py | 32 +++++-------------- .../tests/arrays/string_/test_string_arrow.py | 10 +++--- pandas/tests/frame/methods/test_astype.py | 9 ++++++ .../frame/methods/test_convert_dtypes.py | 5 ++- pandas/tests/io/conftest.py | 16 ---------- 7 files changed, 45 insertions(+), 51 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index dfd4de82657d02..a04845c892c955 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1248,7 +1248,6 @@ def nullable_string_dtype(request): params=[ "python", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - pytest.param("pyarrow_numpy", marks=td.skip_if_no("pyarrow")), ] ) def string_storage(request): @@ -1257,7 +1256,6 @@ def string_storage(request): * 'python' * 'pyarrow' - * 'pyarrow_numpy' """ return request.param diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index a6625d99eaa71e..4cd7e50f0ec502 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -12,7 +12,10 @@ from __future__ import annotations import os -from typing import Callable +from typing import ( + Any, + Callable, +) import pandas._config.config as cf from pandas._config.config import ( @@ -506,12 +509,27 @@ def use_inf_as_na_cb(key) -> None: ``future.infer_string`` is set to True. """ + +def is_valid_string_storage(value: Any) -> None: + legal_values = ["python", "pyarrow"] + if value not in legal_values: + msg = "Value must be one of python|pyarrow" + if value == "pyarrow_numpy": + # TODO: we can remove extra message after 3.0 + msg += ( + ". 'pyarrow_numpy' was specified, but this option should be " + "enabled using pandas.options.future.infer_string instead" + ) + raise ValueError(msg) + + with cf.config_prefix("mode"): cf.register_option( "string_storage", "python", string_storage_doc, - validator=is_one_of_factory(["python", "pyarrow", "pyarrow_numpy"]), + # validator=is_one_of_factory(["python", "pyarrow"]), + validator=is_valid_string_storage, ) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b1c5f4338a4ede..6bf20b6fcd5f72 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -516,19 +516,12 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): +def test_arrow_roundtrip(dtype, string_storage, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -536,30 +529,21 @@ def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): assert table.field("a").type == "string" else: assert table.field("a").type == "large_string" - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is result["a"].dtype.na_value -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks( - dtype, string_storage2, request, using_infer_string -): +def test_arrow_load_from_zero_chunks(dtype, string_storage, using_infer_string): # GH-41040 pa = pytest.importorskip("pyarrow") - if using_infer_string and string_storage2 != "pyarrow_numpy": - request.applymarker( - pytest.mark.xfail( - reason="infer_string takes precedence over string storage" - ) - ) - data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -569,10 +553,10 @@ def test_arrow_load_from_zero_chunks( assert table.field("a").type == "large_string" # Instantiate the same table with no chunks at all table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema) - with pd.option_context("string_storage", string_storage2): + with pd.option_context("string_storage", string_storage): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.StringDtype) - expected = df.astype(f"string[{string_storage2}]") + expected = df.astype(f"string[{string_storage}]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index dceb93364d5057..d38b728aaf1205 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -27,16 +27,18 @@ def test_eq_all_na(): def test_config(string_storage, request, using_infer_string): - if using_infer_string and string_storage != "pyarrow_numpy": - request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) - if string_storage == "pyarrow_numpy": + if using_infer_string and string_storage == "python": + # python string storage with na_value=NaN is not yet implemented request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype(string_storage) + dtype = StringDtype( + string_storage, na_value=np.nan if using_infer_string else pd.NA + ) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b6510f384fabe5..ea9cc22d937580 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -912,3 +912,12 @@ def test_astype_to_string_not_modifying_input(string_storage, val): with option_context("mode.string_storage", string_storage): df.astype("string", copy=False) tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT]) +def test_astype_to_string_dtype_not_modifying_input(any_string_dtype, val): + # GH#51073 - variant of the above test with explicit dtype instances + df = DataFrame({"a": ["a", "b", val]}) + expected = df.copy() + df.astype(any_string_dtype) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 91fa81b5bee2ec..59779234b46d91 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -10,6 +10,8 @@ class TestConvertDtypes: + # TODO convert_dtypes should not use NaN variant of string dtype, but always NA + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) @@ -18,9 +20,6 @@ def test_convert_dtypes( ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here - if using_infer_string: - string_storage = "pyarrow_numpy" - df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index ab6cacc4cc860d..bdefadf3dbec05 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -224,19 +224,3 @@ def compression_format(request): @pytest.fixture(params=_compression_formats_params) def compression_ext(request): return request.param[0] - - -@pytest.fixture( - params=[ - "python", - pytest.param("pyarrow", marks=td.skip_if_no("pyarrow")), - ] -) -def string_storage(request): - """ - Parametrized fixture for pd.options.mode.string_storage. - - * 'python' - * 'pyarrow' - """ - return request.param