diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8209525721b98..89a03ddbef2a2 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -460,6 +460,7 @@ Other Deprecations - Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`) - Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`) - Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`) +- Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when giving a pandas input, call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`) - Deprecated dtype inference when setting a :class:`Index` into a :class:`DataFrame`, cast explicitly instead (:issue:`56102`) - Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`) - Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 69b2b0876fc80..672c16a85086c 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -10,6 +10,7 @@ ContextManager, cast, ) +import warnings import numpy as np @@ -285,11 +286,17 @@ def box_expected(expected, box_cls, transpose: bool = True): else: expected = pd.array(expected, copy=False) elif box_cls is Index: - expected = Index(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Index(expected) elif box_cls is Series: - expected = Series(expected) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected) elif box_cls is DataFrame: - expected = Series(expected).to_frame() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning) + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e741fa7b37f33..e1048a51089ac 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -726,6 +726,10 @@ def __init__( manager = _get_option("mode.data_manager", silent=True) + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + # GH47215 if isinstance(index, set): raise ValueError("index cannot be a set") @@ -912,6 +916,18 @@ def __init__( NDFrame.__init__(self, mgr) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + # ---------------------------------------------------------------------- def __dataframe__( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5dc4a85ba9792..e01c4e75e9f8a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -492,6 +492,8 @@ def __new__( if not copy and isinstance(data, (ABCSeries, Index)): refs = data._references + is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray)) + # range if isinstance(data, (range, RangeIndex)): result = RangeIndex(start=data, copy=copy, name=name) @@ -571,7 +573,19 @@ def __new__( klass = cls._dtype_to_subclass(arr.dtype) arr = klass._ensure_array(arr, arr.dtype, copy=False) - return klass._simple_new(arr, name, refs=refs) + result = klass._simple_new(arr, name, refs=refs) + if dtype is None and is_pandas_object and data_dtype == np.object_: + if result.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Index " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + return result # type: ignore[return-value] @classmethod def _ensure_array(cls, data, dtype, copy: bool): diff --git a/pandas/core/series.py b/pandas/core/series.py index e4dca97bc645d..d46068b6338c5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -424,6 +424,10 @@ def __init__( self.name = name return + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + if isinstance(data, (ExtensionArray, np.ndarray)): if copy is not False and using_copy_on_write(): if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): @@ -581,6 +585,17 @@ def __init__( self.name = name self._set_axis(0, index) + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtype != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The Series " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old behavior.", + FutureWarning, + stacklevel=find_stack_level(), + ) + def _init_dict( self, data, index: Index | None = None, dtype: DtypeObj | None = None ): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 75866c6f6013a..1b7d632c0fa80 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -689,19 +689,18 @@ def cat( result = cat_safe(all_cols, sep) out: Index | Series + if isinstance(self._orig.dtype, CategoricalDtype): + # We need to infer the new categories. + dtype = self._orig.dtype.categories.dtype + else: + dtype = self._orig.dtype if isinstance(self._orig, ABCIndex): # add dtype for case that result is all-NA - dtype = None if isna(result).all(): - dtype = object + dtype = object # type: ignore[assignment] out = Index(result, dtype=dtype, name=self._orig.name) else: # Series - if isinstance(self._orig.dtype, CategoricalDtype): - # We need to infer the new categories. - dtype = self._orig.dtype.categories.dtype # type: ignore[assignment] - else: - dtype = self._orig.dtype res_ser = Series( result, dtype=dtype, index=data.index, name=self._orig.name, copy=False ) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 7d5c485958039..1aa458a625028 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -314,7 +314,8 @@ def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, con def test_dataframe_from_series_infer_datetime(using_copy_on_write): ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object) - df = DataFrame(ser) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + df = DataFrame(ser) assert not np.shares_memory(get_array(ser), get_array(df, 0)) if using_copy_on_write: assert df._mgr._has_no_reference(0) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e1abd0344e356..4c4760501db4e 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2768,6 +2768,23 @@ def test_frame_string_inference_block_dim(self): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = Index([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(idx, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": idx}) + assert result.dtypes.iloc[0] == np.object_ + + ser = Series([Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = DataFrame(ser, columns=["a"]) + assert result.dtypes.iloc[0] != np.object_ + result = DataFrame({"a": ser}) + assert result.dtypes.iloc[0] == np.object_ + class TestDataFrameConstructorIndexInference: def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self): diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 60abbfc441e8e..fd5176a28565e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -5,6 +5,7 @@ from pandas import ( Index, MultiIndex, + Series, ) import pandas._testing as tm @@ -57,3 +58,16 @@ def test_index_string_inference(self): with pd.option_context("future.infer_string", True): ser = Index(["a", 1]) tm.assert_index_equal(ser, expected) + + def test_inference_on_pandas_objects(self): + # GH#56012 + idx = Index([pd.Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(idx) + assert result.dtype != np.object_ + + ser = Series([pd.Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(ser) + assert result.dtype != np.object_ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 185e34efdc177..666d92064c86c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -104,7 +104,8 @@ def test_constructor_copy(self, index, using_infer_string): ) def test_constructor_from_index_dtlike(self, cast_as_obj, index): if cast_as_obj: - result = Index(index.astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Index(index.astype(object)) else: result = Index(index) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 083a4c4b24adb..34465a7c12c18 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -259,9 +259,9 @@ def test_dt_accessor_limited_display_api(self): tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods))) # Period - ser = Series( - period_range("20130101", periods=5, freq="D", name="xxx").astype(object) - ) + idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + ser = Series(idx) results = get_dir(ser) tm.assert_almost_equal( results, sorted(set(ok_for_period + ok_for_period_methods)) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index 8f4931beae589..3913419038876 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -20,7 +20,7 @@ def test_between(self): tm.assert_series_equal(result, expected) def test_between_datetime_object_dtype(self): - ser = Series(bdate_range("1/1/2000", periods=20).astype(object)) + ser = Series(bdate_range("1/1/2000", periods=20), dtype=object) ser[::2] = np.nan result = ser[ser.between(ser[3], ser[17])] diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index b94723b7cbddf..875ffdd3fe851 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -82,13 +82,15 @@ def test_equals_matching_nas(): left = Series([np.datetime64("NaT")], dtype=object) right = Series([np.datetime64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.timedelta64("NaT")], dtype=object) right = Series([np.timedelta64("NaT")], dtype=object) assert left.equals(right) - assert Index(left).equals(Index(right)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + assert Index(left).equals(Index(right)) assert left.array.equals(right.array) left = Series([np.float64("NaN")], dtype=object) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0e6f1c284a988..5f591b4b22f1c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1316,7 +1316,8 @@ def test_constructor_periodindex(self): pi = period_range("20130101", periods=5, freq="D") s = Series(pi) assert s.dtype == "Period[D]" - expected = Series(pi.astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + expected = Series(pi.astype(object)) tm.assert_series_equal(s, expected) def test_constructor_dict(self): @@ -2137,6 +2138,20 @@ def test_series_string_inference_na_first(self): result = Series([pd.NA, "b"]) tm.assert_series_equal(result, expected) + def test_inference_on_pandas_objects(self): + # GH#56012 + ser = Series([Timestamp("2019-12-31")], dtype=object) + with tm.assert_produces_warning(None): + # This doesn't do inference + result = Series(ser) + assert result.dtype == np.object_ + + idx = Index([Timestamp("2019-12-31")], dtype=object) + + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + result = Series(idx) + assert result.dtype != np.object_ + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self): diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 284932491a65e..c1e7ad6e02779 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -98,14 +98,18 @@ def test_str_cat_categorical( with option_context("future.infer_string", infer_string): s = Index(["a", "a", "b", "a"], dtype=dtype_caller) - s = s if box == Index else Series(s, index=s) + s = s if box == Index else Series(s, index=s, dtype=s.dtype) t = Index(["b", "a", "b", "c"], dtype=dtype_target) - expected = Index(["ab", "aa", "bb", "ac"]) + expected = Index( + ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None + ) expected = ( expected if box == Index - else Series(expected, index=Index(s, dtype=dtype_caller)) + else Series( + expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype + ) ) # Series/Index with unaligned Index -> t.values @@ -123,12 +127,19 @@ def test_str_cat_categorical( # Series/Index with Series having different Index t = Series(t.values, index=t.values) - expected = Index(["aa", "aa", "bb", "bb", "aa"]) + expected = Index( + ["aa", "aa", "bb", "bb", "aa"], + dtype=object if dtype_caller == "object" else None, + ) dtype = object if dtype_caller == "object" else s.dtype.categories.dtype expected = ( expected if box == Index - else Series(expected, index=Index(expected.str[:1], dtype=dtype)) + else Series( + expected, + index=Index(expected.str[:1], dtype=dtype), + dtype=expected.dtype, + ) ) result = s.str.cat(t, sep=sep) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 45741e852fef7..99a504f4188c1 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -23,6 +23,7 @@ date_range, period_range, ) +import pandas._testing as tm from pandas.core.arrays import ( DatetimeArray, TimedeltaArray, @@ -206,7 +207,8 @@ def test_infer_freq_custom(base_delta_code_pair, constructor): ) def test_infer_freq_index(freq, expected): rng = period_range("1959Q2", "2009Q3", freq=freq) - rng = Index(rng.to_timestamp("D", how="e").astype(object)) + with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): + rng = Index(rng.to_timestamp("D", how="e").astype(object)) assert rng.inferred_freq == expected