From 14d59e1b95072d3ce190f0377ef4600c81f32d1d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 8 Jan 2022 23:07:17 -0800 Subject: [PATCH] TST: implement xfailed test for #18415 --- pandas/tests/frame/indexing/test_coercion.py | 183 +++++++++++++++++++ pandas/tests/series/indexing/test_setitem.py | 138 -------------- 2 files changed, 183 insertions(+), 138 deletions(-) create mode 100644 pandas/tests/frame/indexing/test_coercion.py diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py new file mode 100644 index 0000000000000..8b2bc60953e3e --- /dev/null +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -0,0 +1,183 @@ +""" +Tests for values coercion in setitem-like operations on DataFrame. + +For the most part, these should be multi-column DataFrames, otherwise +we would share the tests with Series. +""" +import numpy as np +import pytest + +import pandas as pd +from pandas import ( + DataFrame, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, +) +import pandas._testing as tm + + +class TestDataFrameSetitemCoercion: + @pytest.mark.xfail(reason="Unnecessary cast.") + @pytest.mark.parametrize("consolidate", [True, False]) + def test_loc_setitem_multiindex_columns(self, consolidate): + # GH#18415 Setting values in a single column preserves dtype, + # while setting them in multiple columns did unwanted cast. + + # Note that A here has 2 blocks, below we do the same thing + # with a consolidated frame. 
+ A = DataFrame(np.zeros((6, 5), dtype=np.float32)) + A = pd.concat([A, A], axis=1, keys=[1, 2]) + if consolidate: + A = A._consolidate() + + A.loc[2:3, (1, slice(2, 3))] = np.ones((2, 2), dtype=np.float32) + assert (A.dtypes == np.float32).all() + + A.loc[0:5, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32) + assert (A.dtypes == np.float32).all() + + A.loc[:, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32) + assert (A.dtypes == np.float32).all() + + # TODO: I think this isn't about MultiIndex and could be done with iloc? + + +def test_37477(): + # fixed by GH#45121 + orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) + expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]}) + + df = orig.copy() + df.at[1, "B"] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + df.loc[1, "B"] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + df.iat[1, 1] = 1.2 + tm.assert_frame_equal(df, expected) + + df = orig.copy() + df.iloc[1, 1] = 1.2 + tm.assert_frame_equal(df, expected) + + +def test_6942(indexer_al): + # check that the .at __setitem__ after setting "Live" actually sets the data + start = Timestamp("2014-04-01") + t1 = Timestamp("2014-04-23 12:42:38.883082") + t2 = Timestamp("2014-04-24 01:33:30.040039") + + dti = date_range(start, periods=1) + orig = DataFrame(index=dti, columns=["timenow", "Live"]) + + df = orig.copy() + indexer_al(df)[start, "timenow"] = t1 + + df["Live"] = True + + df.at[start, "timenow"] = t2 + assert df.iloc[0, 0] == t2 + + +def test_26395(indexer_al): + # .at case fixed by GH#45121 (best guess) + df = DataFrame(index=["A", "B", "C"]) + df["D"] = 0 + + indexer_al(df)["C", "D"] = 2 + expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + tm.assert_frame_equal(df, expected) + + indexer_al(df)["C", "D"] = 44.5 + expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) + tm.assert_frame_equal(df, expected) + + indexer_al(df)["C", "D"] = "hello" + expected = 
DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.xfail(reason="unwanted upcast") +def test_15231(): + df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) + df.loc[2] = Series({"a": 5, "b": 6}) + assert (df.dtypes == np.int64).all() + + df.loc[3] = Series({"a": 7}) + + # df["a"] doesn't have any NaNs, should not have been cast + exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"]) + tm.assert_series_equal(df.dtypes, exp_dtypes) + + +@pytest.mark.xfail(reason="Unnecessarily upcasts to float64") +def test_iloc_setitem_unnecesssary_float_upcasting(): + # GH#12255 + df = DataFrame( + { + 0: np.array([1, 3], dtype=np.float32), + 1: np.array([2, 4], dtype=np.float32), + 2: ["a", "b"], + } + ) + orig = df.copy() + + values = df[0].values.reshape(2, 1) + df.iloc[:, 0:1] = values + + tm.assert_frame_equal(df, orig) + + +@pytest.mark.xfail(reason="unwanted casting to dt64") +def test_12499(): + # TODO: OP in GH#12499 used np.datetime64("NaT") instead of pd.NaT, + # which has consequences for the expected df["two"] (though I think at + # the time it might not have because of a separate bug). See if it makes + # a difference which one we use here. 
+ ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC") + + data = [{"one": 0, "two": ts}] + orig = DataFrame(data) + df = orig.copy() + df.loc[1] = [np.nan, NaT] + + expected = DataFrame( + {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")} + ) + tm.assert_frame_equal(df, expected) + + data = [{"one": 0, "two": ts}] + df = orig.copy() + df.loc[1, :] = [np.nan, NaT] + tm.assert_frame_equal(df, expected) + + +@pytest.mark.xfail(reason="Too many columns cast to float64") +def test_20476(): + mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) + df = DataFrame(-1, index=range(3), columns=mi) + filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"]) + df["A"] = filler + + expected = DataFrame( + { + 0: [1, 1, 1], + 1: [2, 2, 2], + 2: [3.0, 3.0, 3.0], + 3: [-1, -1, -1], + 4: [-1, -1, -1], + 5: [-1, -1, -1], + } + ) + expected.columns = mi + exp_dtypes = Series( + [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3, + index=mi, + ) + tm.assert_series_equal(df.dtypes, exp_dtypes) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 94a5ca38afce4..734f3292d5e73 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1360,28 +1360,6 @@ def test_15413(): tm.assert_series_equal(ser, expected) -def test_37477(): - # fixed by GH#45121 - orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) - expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]}) - - df = orig.copy() - df.at[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - - df = orig.copy() - df.loc[1, "B"] = 1.2 - tm.assert_frame_equal(df, expected) - - df = orig.copy() - df.iat[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) - - df = orig.copy() - df.iloc[1, 1] = 1.2 - tm.assert_frame_equal(df, expected) - - def test_32878_int_itemsize(): # Fixed by GH#45121 arr = np.arange(5).astype("i4") @@ -1392,24 +1370,6 @@ def 
test_32878_int_itemsize(): tm.assert_series_equal(ser, expected) -def test_26395(indexer_al): - # .at case fixed by GH#45121 (best guess) - df = DataFrame(index=["A", "B", "C"]) - df["D"] = 0 - - indexer_al(df)["C", "D"] = 2 - expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) - tm.assert_frame_equal(df, expected) - - indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) - tm.assert_frame_equal(df, expected) - - indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) - tm.assert_frame_equal(df, expected) - - def test_37692(indexer_al): # GH#37692 ser = Series([1, 2, 3], index=["a", "b", "c"]) @@ -1442,24 +1402,6 @@ def test_setitem_bool_int_float_consistency(indexer_sli): indexer_sli(ser)[0] = np.int64(1) -def test_6942(indexer_al): - # check that the .at __setitem__ after setting "Live" actually sets the data - start = Timestamp("2014-04-01") - t1 = Timestamp("2014-04-23 12:42:38.883082") - t2 = Timestamp("2014-04-24 01:33:30.040039") - - dti = date_range(start, periods=1) - orig = DataFrame(index=dti, columns=["timenow", "Live"]) - - df = orig.copy() - indexer_al(df)[start, "timenow"] = t1 - - df["Live"] = True - - df.at[start, "timenow"] = t2 - assert df.iloc[0, 0] == t2 - - def test_setitem_positional_with_casting(): # GH#45070 case where in __setitem__ we get a KeyError, then when # we fallback we *also* get a ValueError if we try to set inplace. 
@@ -1479,19 +1421,6 @@ def test_setitem_positional_float_into_int_coerces(): tm.assert_series_equal(ser, expected) -@pytest.mark.xfail(reason="unwanted upcast") -def test_15231(): - df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - df.loc[2] = Series({"a": 5, "b": 6}) - assert (df.dtypes == np.int64).all() - - df.loc[3] = Series({"a": 7}) - - # df["a"] doesn't have any NaNs, should not have been cast - exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"]) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - @pytest.mark.xfail(reason="Fails to upcast") def test_32878_complex_itemsize(): # TODO: when fixed, put adjacent to test_32878_int_itemsize @@ -1507,73 +1436,6 @@ def test_32878_complex_itemsize(): tm.assert_series_equal(ser, expected) -@pytest.mark.xfail(reason="Unnecessarily upcasts to float64") -def test_iloc_setitem_unnecesssary_float_upcasting(): - # GH#12255 - df = DataFrame( - { - 0: np.array([1, 3], dtype=np.float32), - 1: np.array([2, 4], dtype=np.float32), - 2: ["a", "b"], - } - ) - orig = df.copy() - - values = df[0].values.reshape(2, 1) - df.iloc[:, 0:1] = values - - tm.assert_frame_equal(df, orig) - - -@pytest.mark.xfail(reason="unwanted casting to dt64") -def test_12499(): - # TODO: OP in GH#12499 used np.datetim64("NaT") instead of pd.NaT, - # which has consequences for the expected df["two"] (though i think at - # the time it might not have because of a separate bug). See if it makes - # a difference which one we use here. 
- ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC") - - data = [{"one": 0, "two": ts}] - orig = DataFrame(data) - df = orig.copy() - df.loc[1] = [np.nan, NaT] - - expected = DataFrame( - {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")} - ) - tm.assert_frame_equal(df, expected) - - data = [{"one": 0, "two": ts}] - df = orig.copy() - df.loc[1, :] = [np.nan, NaT] - tm.assert_frame_equal(df, expected) - - -@pytest.mark.xfail(reason="Too many columns cast to float64") -def test_20476(): - mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]]) - df = DataFrame(-1, index=range(3), columns=mi) - filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"]) - df["A"] = filler - - expected = DataFrame( - { - 0: [1, 1, 1], - 1: [2, 2, 2], - 2: [3.0, 3.0, 3.0], - 3: [-1, -1, -1], - 4: [-1, -1, -1], - 5: [-1, -1, -1], - } - ) - expected.columns = mi - exp_dtypes = Series( - [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3, - index=mi, - ) - tm.assert_series_equal(df.dtypes, exp_dtypes) - - def test_setitem_int_as_positional_fallback_deprecation(): # GH#42215 deprecated falling back to positional on __setitem__ with an # int not contained in the index