diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 04edf165e3011..f291820bc5266 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -206,6 +206,7 @@ Removal of prior version deprecations/changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 - :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
 - :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
+- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
 - :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
 - :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
 - :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index d833dab5b820f..a15806683aab6 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -4,7 +4,6 @@
     TYPE_CHECKING,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -16,7 +15,6 @@
 )
 from pandas._libs.missing import NA
 from pandas.util._decorators import cache_readonly
-from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.cast import (
     ensure_dtype_can_hold_na,
@@ -24,19 +22,11 @@
 )
 from pandas.core.dtypes.common import (
     is_1d_only_ea_dtype,
-    is_scalar,
     needs_i8_conversion,
 )
 from pandas.core.dtypes.concat import concat_compat
-from pandas.core.dtypes.dtypes import (
-    ExtensionDtype,
-    SparseDtype,
-)
-from pandas.core.dtypes.missing import (
-    is_valid_na_for_dtype,
-    isna,
-    isna_all,
-)
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.missing import is_valid_na_for_dtype
 
 from pandas.core.construction import ensure_wrapped_if_datetimelike
 from pandas.core.internals.blocks import (
@@ -100,6 +90,7 @@ def concatenate_managers(
         if first_dtype in [np.float64, np.float32]:
             # TODO: support more dtypes here. This will be simpler once
             #  JoinUnit.is_na behavior is deprecated.
+            #  (update 2024-04-13 that deprecation has been enforced)
             if (
                 all(_is_homogeneous_mgr(mgr, first_dtype) for mgr, _ in mgrs_indexers)
                 and len(mgrs_indexers) > 1
@@ -351,41 +342,6 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool:
 
     @cache_readonly
     def is_na(self) -> bool:
-        blk = self.block
-        if blk.dtype.kind == "V":
-            return True
-
-        if not blk._can_hold_na:
-            return False
-
-        values = blk.values
-        if values.size == 0:
-            # GH#39122 this case will return False once deprecation is enforced
-            return True
-
-        if isinstance(values.dtype, SparseDtype):
-            return False
-
-        if values.ndim == 1:
-            # TODO(EA2D): no need for special case with 2D EAs
-            val = values[0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return isna_all(values)
-        else:
-            val = values[0][0]
-            if not is_scalar(val) or not isna(val):
-                # ideally isna_all would do this short-circuiting
-                return False
-            return all(isna_all(row) for row in values)
-
-    @cache_readonly
-    def is_na_after_size_and_isna_all_deprecation(self) -> bool:
-        """
-        Will self.is_na be True after values.size == 0 deprecation and isna_all
-        deprecation are enforced?
-        """
         blk = self.block
         if blk.dtype.kind == "V":
             return True
@@ -421,7 +377,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
     """
     Concatenate values from several join units along axis=1.
     """
-    empty_dtype, empty_dtype_future = _get_empty_dtype(join_units)
+    empty_dtype = _get_empty_dtype(join_units)
 
     has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
     upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
@@ -446,18 +402,6 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike
     else:
         concat_values = concat_compat(to_concat, axis=1)
 
-    if empty_dtype != empty_dtype_future:
-        if empty_dtype == concat_values.dtype:
-            # GH#39122, GH#40893
-            warnings.warn(
-                "The behavior of DataFrame concatenation with empty or all-NA "
-                "entries is deprecated. In a future version, this will no longer "
-                "exclude empty or all-NA columns when determining the result dtypes. "
-                "To retain the old behavior, exclude the relevant entries before "
-                "the concat operation.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
     return concat_values
 
 
@@ -484,7 +428,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool):
     raise NotImplementedError
 
 
-def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]:
+def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj:
     """
     Return dtype and N/A values to use when concatenating specified units.
 
@@ -496,38 +440,17 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj
     """
     if lib.dtypes_all_equal([ju.block.dtype for ju in join_units]):
         empty_dtype = join_units[0].block.dtype
-        return empty_dtype, empty_dtype
+        return empty_dtype
 
     has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units)
 
     dtypes = [unit.block.dtype for unit in join_units if not unit.is_na]
-    if not len(dtypes):
-        dtypes = [
-            unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
-        ]
 
     dtype = find_common_type(dtypes)
    if has_none_blocks:
         dtype = ensure_dtype_can_hold_na(dtype)
 
-    dtype_future = dtype
-    if len(dtypes) != len(join_units):
-        dtypes_future = [
-            unit.block.dtype
-            for unit in join_units
-            if not unit.is_na_after_size_and_isna_all_deprecation
-        ]
-        if not len(dtypes_future):
-            dtypes_future = [
-                unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V"
-            ]
-
-        if len(dtypes) != len(dtypes_future):
-            dtype_future = find_common_type(dtypes_future)
-            if has_none_blocks:
-                dtype_future = ensure_dtype_can_hold_na(dtype_future)
-
-    return dtype, dtype_future
+    return dtype
 
 
 def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool:
diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
index 81b5914fef402..6e18ccfc70e06 100644
--- a/pandas/tests/reshape/concat/test_append.py
+++ b/pandas/tests/reshape/concat/test_append.py
@@ -332,7 +332,7 @@ def test_append_empty_tz_frame_with_datetime64ns(self):
 
         # pd.NaT gets inferred as tz-naive, so append result is tz-naive
         result = df._append({"a": pd.NaT}, ignore_index=True)
-        expected = DataFrame({"a": [np.nan]}, dtype=object)
+        expected = DataFrame({"a": [pd.NaT]}, dtype=object)
         tm.assert_frame_equal(result, expected)
 
         # also test with typed value to append
@@ -359,12 +359,6 @@ def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
 
         result = df._append(other, ignore_index=True)
         expected = other.astype(object)
-        if isinstance(val, str) and dtype_str != "int64":
-            # TODO: expected used to be `other.astype(object)` which is a more
-            #  reasonable result. This was changed when tightening
-            #  assert_frame_equal's treatment of mismatched NAs to match the
-            #  existing behavior.
-            expected = DataFrame({"a": [np.nan]}, dtype=object)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
index 2f9fd1eb421d4..f86cc0c69d363 100644
--- a/pandas/tests/reshape/concat/test_concat.py
+++ b/pandas/tests/reshape/concat/test_concat.py
@@ -789,14 +789,13 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
     df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype)
     empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype)
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    warn = None
+    needs_update = False
     if df_dtype == "datetime64[ns]" or (
         df_dtype == "float64" and empty_dtype != "float64"
     ):
-        warn = FutureWarning
-    with tm.assert_produces_warning(warn, match=msg):
-        result = concat([empty, df])
+        needs_update = True
+
+    result = concat([empty, df])
     expected = df
     if df_dtype == "int64":
         # TODO what exact behaviour do we want for integer eventually?
@@ -804,6 +803,10 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype):
         expected = df.astype("float64")
     else:
         expected = df.astype("object")
+
+    if needs_update:
+        # GH#40893 changed the expected here to retain dependence on empty
+        expected = expected.astype(object)
     tm.assert_frame_equal(result, expected)
 
 
@@ -820,17 +823,19 @@ def test_concat_ignore_all_na_object_float(empty_dtype, df_dtype):
     else:
         df_dtype = "float64"
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    warn = None
+    needs_update = False
     if empty_dtype != df_dtype and empty_dtype is not None:
-        warn = FutureWarning
+        needs_update = True
     elif df_dtype == "datetime64[ns]":
-        warn = FutureWarning
+        needs_update = True
 
-    with tm.assert_produces_warning(warn, match=msg):
-        result = concat([empty, df], ignore_index=True)
+    result = concat([empty, df], ignore_index=True)
 
     expected = DataFrame({"foo": [np.nan, 1, 2], "bar": [np.nan, 1, 2]}, dtype=df_dtype)
+    if needs_update:
+        # GH#40893 changed the expected here to retain dependence on empty
+        expected = expected.astype(object)
+        expected.iloc[0] = np.nan
     tm.assert_frame_equal(result, expected)
 
 
@@ -841,10 +846,16 @@ def test_concat_ignore_empty_from_reindex():
 
     aligned = df2.reindex(columns=df1.columns)
 
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df1, aligned], ignore_index=True)
-    expected = df1 = DataFrame({"a": [1, 2], "b": [pd.Timestamp("2012-01-01"), pd.NaT]})
+    result = concat([df1, aligned], ignore_index=True)
+
+    expected = DataFrame(
+        {
+            "a": [1, 2],
+            "b": pd.array([pd.Timestamp("2012-01-01"), np.nan], dtype=object),
+        },
+        dtype=object,
+    )
+    expected["a"] = expected["a"].astype("int64")
     tm.assert_frame_equal(result, expected)
 
 
@@ -907,10 +918,10 @@ def test_concat_none_with_timezone_timestamp():
     # GH#52093
     df1 = DataFrame([{"A": None}])
     df2 = DataFrame([{"A": pd.Timestamp("1990-12-20 00:00:00+00:00")}])
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df1, df2], ignore_index=True)
-    expected = DataFrame({"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]})
+    result = concat([df1, df2], ignore_index=True)
+    expected = DataFrame(
+        {"A": [None, pd.Timestamp("1990-12-20 00:00:00+00:00")]}, dtype=object
+    )
     tm.assert_frame_equal(result, expected)
 
 
diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py
index d7791ec38a7ae..3e046b2df72d8 100644
--- a/pandas/tests/reshape/concat/test_datetimes.py
+++ b/pandas/tests/reshape/concat/test_datetimes.py
@@ -226,15 +226,6 @@ def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item):
         expected = expected.apply(lambda x: x.dt.tz_localize(tz2))
         if tz1 != tz2:
             expected = expected.astype(object)
-            if item is pd.NaT:
-                # GH#18463
-                # TODO: setting nan here is to keep the test passing as we
-                #  make assert_frame_equal stricter, but is nan really the
-                #  ideal behavior here?
-                if tz1 is not None:
-                    expected.iloc[-1, 0] = np.nan
-                else:
-                    expected.iloc[:-1, 0] = np.nan
 
         tm.assert_frame_equal(result, expected)
 
@@ -590,8 +581,9 @@ def test_concat_float_datetime64():
         result = concat([df_time.iloc[:0], df_float])
     tm.assert_frame_equal(result, expected)
 
-    expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
-    msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = concat([df_time, df_float.iloc[:0]])
+    expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype(
+        object
+    )
+
+    result = concat([df_time, df_float.iloc[:0]])
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 1a764cb505ead..7ab8ee24bd194 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -709,16 +709,14 @@ def test_join_append_timedeltas(self):
             {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
         )
         df = DataFrame(columns=list("dt"))
-        msg = "The behavior of DataFrame concatenation with empty or all-NA entries"
-        warn = FutureWarning
-        with tm.assert_produces_warning(warn, match=msg):
-            df = concat([df, d], ignore_index=True)
-            result = concat([df, d], ignore_index=True)
+        df = concat([df, d], ignore_index=True)
+        result = concat([df, d], ignore_index=True)
         expected = DataFrame(
             {
                "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
                "t": [timedelta(0, 22500), timedelta(0, 22500)],
-            }
+            },
+            dtype=object,
         )
         tm.assert_frame_equal(result, expected)
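
Illustration of the enforced change (a rough sketch, not part of the patch itself; the frames and the column name "a" below are invented for the example, and it assumes a pandas build with this patch applied):

    import numpy as np
    import pandas as pd

    floats = pd.DataFrame({"a": [1.0, 2.0]})                      # float64 column
    all_na = pd.DataFrame({"a": [np.nan, np.nan]}, dtype=object)  # all-NA, object dtype

    result = pd.concat([all_na, floats], ignore_index=True)

    # Previously the all-NA object entry was dropped (with a FutureWarning) when
    # determining the result dtype, so the column came back as float64. With the
    # deprecation enforced, its object dtype is passed to find_common_type along
    # with float64, so the result column is object.
    print(result.dtypes)  # a    object

To get the old result dtype, exclude or cast the all-NA entries before the concat operation, as the removed FutureWarning message suggested.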