Skip to content

TST: implement xfailed test for #18415 #45290

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions pandas/tests/frame/indexing/test_coercion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""
Tests for values coercion in setitem-like operations on DataFrame.

For the most part, these should be multi-column DataFrames, otherwise
we would share the tests with Series.
"""
import numpy as np
import pytest

import pandas as pd
from pandas import (
DataFrame,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm


class TestDataFrameSetitemCoercion:
    """Dtype-preservation checks for setitem-like operations on DataFrame."""

    @pytest.mark.xfail(reason="Unnecessary cast.")
    @pytest.mark.parametrize("consolidate", [True, False])
    def test_loc_setitem_multiindex_columns(self, consolidate):
        """
        Setting float32 values into two float32 columns through ``.loc``
        must leave every column of the frame as float32.
        """
        # GH#18415 Setting values in a single column preserves dtype,
        #  while setting them in multiple columns did unwanted cast.

        # Concatenating the frame with itself gives a result with 2 blocks;
        #  the consolidate=True case repeats the same checks on a
        #  consolidated frame.
        base = DataFrame(np.zeros((6, 5), dtype=np.float32))
        A = pd.concat([base, base], axis=1, keys=[1, 2])
        if consolidate:
            A = A._consolidate()

        # Label-based column key: columns 2..3 under top-level key 1.
        col_key = (1, slice(2, 3))

        # (row indexer, number of rows it selects), exercised in this order.
        cases = [
            (slice(2, 3), 2),
            (slice(0, 5), 6),
            (slice(None), 6),
        ]
        for rows, nrows in cases:
            A.loc[rows, col_key] = np.ones((nrows, 2), dtype=np.float32)
            assert (A.dtypes == np.float32).all()

    # TODO: i think this isn't about MultiIndex and could be done with iloc?


def test_37477():
    """
    Setting a float into a single cell of an int column gives a frame
    equal to ``expected`` for each of ``.at``, ``.loc``, ``.iat``, ``.iloc``.
    """
    # fixed by GH#45121
    orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]})

    # (accessor name, key addressing row 1 of column "B")
    cases = [
        ("at", (1, "B")),
        ("loc", (1, "B")),
        ("iat", (1, 1)),
        ("iloc", (1, 1)),
    ]
    for accessor, key in cases:
        # Start from a fresh copy so each accessor is checked independently.
        df = orig.copy()
        getattr(df, accessor)[key] = 1.2
        tm.assert_frame_equal(df, expected)


def test_6942(indexer_al):
# check that the .at __setitem__ after setting "Live" actually sets the data
start = Timestamp("2014-04-01")
t1 = Timestamp("2014-04-23 12:42:38.883082")
t2 = Timestamp("2014-04-24 01:33:30.040039")

dti = date_range(start, periods=1)
orig = DataFrame(index=dti, columns=["timenow", "Live"])

df = orig.copy()
indexer_al(df)[start, "timenow"] = t1

df["Live"] = True

df.at[start, "timenow"] = t2
assert df.iloc[0, 0] == t2


def test_26395(indexer_al):
# .at case fixed by GH#45121 (best guess)
df = DataFrame(index=["A", "B", "C"])
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

indexer_al(df)["C", "D"] = 44.5
expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64)
tm.assert_frame_equal(df, expected)

indexer_al(df)["C", "D"] = "hello"
expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object)
tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(reason="unwanted upcast")
def test_15231():
    """
    Enlarging a frame with ``.loc`` and a Series row should only change
    the dtype of columns that end up holding a missing value.
    """
    df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])

    # A row that supplies both columns keeps everything int64.
    full_row = Series({"a": 5, "b": 6})
    df.loc[2] = full_row
    assert (df.dtypes == np.int64).all()

    # A row that supplies only "a".
    partial_row = Series({"a": 7})
    df.loc[3] = partial_row

    # df["a"] doesn't have any NaNs, should not have been cast
    result = df.dtypes
    exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"])
    tm.assert_series_equal(result, exp_dtypes)


@pytest.mark.xfail(reason="Unnecessarily upcasts to float64")
def test_iloc_setitem_unnecesssary_float_upcasting():
    """
    Writing a float32 column's own values back through ``.iloc`` should
    leave the frame equal to a copy taken beforehand.
    """
    # GH#12255
    first = np.array([1, 3], dtype=np.float32)
    second = np.array([2, 4], dtype=np.float32)
    df = DataFrame({0: first, 1: second, 2: ["a", "b"]})
    expected = df.copy()

    # Same data as column 0, reshaped to a (2, 1) array.
    column_block = df[0].values.reshape(2, 1)
    df.iloc[:, 0:1] = column_block

    tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(reason="unwanted casting to dt64")
def test_12499():
    """
    Adding a ``[np.nan, NaT]`` row with ``.loc`` should keep the tz-aware
    column as ``datetime64[ns, UTC]``.

    Both the ``df.loc[1]`` and the ``df.loc[1, :]`` spellings are checked
    against the same expected frame.
    """
    # TODO: OP in GH#12499 used np.datetime64("NaT") instead of pd.NaT,
    #  which has consequences for the expected df["two"] (though i think at
    #  the time it might not have because of a separate bug). See if it makes
    #  a difference which one we use here.
    ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC")

    orig = DataFrame([{"one": 0, "two": ts}])
    expected = DataFrame(
        {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")}
    )

    # Row-label-only indexer.
    df = orig.copy()
    df.loc[1] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)

    # Explicit (row, all-columns) indexer.
    df = orig.copy()
    df.loc[1, :] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(reason="Too many columns cast to float64")
def test_20476():
    """
    Assigning a frame to one top-level key of MultiIndex columns should
    only give float64 to the column that receives float values.
    """
    mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
    df = DataFrame(-1, index=range(3), columns=mi)
    filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"])
    df["A"] = filler

    expected = DataFrame(
        {
            0: [1, 1, 1],
            1: [2, 2, 2],
            2: [3.0, 3.0, 3.0],
            3: [-1, -1, -1],
            4: [-1, -1, -1],
            5: [-1, -1, -1],
        }
    )
    expected.columns = mi
    exp_dtypes = Series(
        [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3,
        index=mi,
    )
    # Check dtypes first so a failure points directly at the unwanted cast ...
    tm.assert_series_equal(df.dtypes, exp_dtypes)
    # ... then check the values too; `expected` was previously built but
    #  never compared against the result.
    tm.assert_frame_equal(df, expected)
138 changes: 0 additions & 138 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1360,28 +1360,6 @@ def test_15413():
tm.assert_series_equal(ser, expected)


def test_37477():
    """
    Setting a float into a single cell of an int column gives a frame
    equal to ``expected`` for each of ``.at``, ``.loc``, ``.iat``, ``.iloc``.
    """
    # fixed by GH#45121
    orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
    expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]})

    # (accessor name, key addressing row 1 of column "B")
    cases = [
        ("at", (1, "B")),
        ("loc", (1, "B")),
        ("iat", (1, 1)),
        ("iloc", (1, 1)),
    ]
    for accessor, key in cases:
        # Start from a fresh copy so each accessor is checked independently.
        df = orig.copy()
        getattr(df, accessor)[key] = 1.2
        tm.assert_frame_equal(df, expected)


def test_32878_int_itemsize():
# Fixed by GH#45121
arr = np.arange(5).astype("i4")
Expand All @@ -1392,24 +1370,6 @@ def test_32878_int_itemsize():
tm.assert_series_equal(ser, expected)


def test_26395(indexer_al):
# .at case fixed by GH#45121 (best guess)
df = DataFrame(index=["A", "B", "C"])
df["D"] = 0

indexer_al(df)["C", "D"] = 2
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
tm.assert_frame_equal(df, expected)

indexer_al(df)["C", "D"] = 44.5
expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64)
tm.assert_frame_equal(df, expected)

indexer_al(df)["C", "D"] = "hello"
expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object)
tm.assert_frame_equal(df, expected)


def test_37692(indexer_al):
# GH#37692
ser = Series([1, 2, 3], index=["a", "b", "c"])
Expand Down Expand Up @@ -1442,24 +1402,6 @@ def test_setitem_bool_int_float_consistency(indexer_sli):
indexer_sli(ser)[0] = np.int64(1)


def test_6942(indexer_al):
# check that the .at __setitem__ after setting "Live" actually sets the data
start = Timestamp("2014-04-01")
t1 = Timestamp("2014-04-23 12:42:38.883082")
t2 = Timestamp("2014-04-24 01:33:30.040039")

dti = date_range(start, periods=1)
orig = DataFrame(index=dti, columns=["timenow", "Live"])

df = orig.copy()
indexer_al(df)[start, "timenow"] = t1

df["Live"] = True

df.at[start, "timenow"] = t2
assert df.iloc[0, 0] == t2


def test_setitem_positional_with_casting():
# GH#45070 case where in __setitem__ we get a KeyError, then when
# we fallback we *also* get a ValueError if we try to set inplace.
Expand All @@ -1479,19 +1421,6 @@ def test_setitem_positional_float_into_int_coerces():
tm.assert_series_equal(ser, expected)


@pytest.mark.xfail(reason="unwanted upcast")
def test_15231():
    """
    Enlarging a frame with ``.loc`` and a Series row should only change
    the dtype of columns that end up holding a missing value.
    """
    df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])

    # A row that supplies both columns keeps everything int64.
    full_row = Series({"a": 5, "b": 6})
    df.loc[2] = full_row
    assert (df.dtypes == np.int64).all()

    # A row that supplies only "a".
    partial_row = Series({"a": 7})
    df.loc[3] = partial_row

    # df["a"] doesn't have any NaNs, should not have been cast
    result = df.dtypes
    exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"])
    tm.assert_series_equal(result, exp_dtypes)


@pytest.mark.xfail(reason="Fails to upcast")
def test_32878_complex_itemsize():
# TODO: when fixed, put adjacent to test_32878_int_itemsize
Expand All @@ -1507,73 +1436,6 @@ def test_32878_complex_itemsize():
tm.assert_series_equal(ser, expected)


@pytest.mark.xfail(reason="Unnecessarily upcasts to float64")
def test_iloc_setitem_unnecesssary_float_upcasting():
    """
    Writing a float32 column's own values back through ``.iloc`` should
    leave the frame equal to a copy taken beforehand.
    """
    # GH#12255
    first = np.array([1, 3], dtype=np.float32)
    second = np.array([2, 4], dtype=np.float32)
    df = DataFrame({0: first, 1: second, 2: ["a", "b"]})
    expected = df.copy()

    # Same data as column 0, reshaped to a (2, 1) array.
    column_block = df[0].values.reshape(2, 1)
    df.iloc[:, 0:1] = column_block

    tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(reason="unwanted casting to dt64")
def test_12499():
    """
    Adding a ``[np.nan, NaT]`` row with ``.loc`` should keep the tz-aware
    column as ``datetime64[ns, UTC]``.

    Both the ``df.loc[1]`` and the ``df.loc[1, :]`` spellings are checked
    against the same expected frame.
    """
    # TODO: OP in GH#12499 used np.datetime64("NaT") instead of pd.NaT,
    #  which has consequences for the expected df["two"] (though i think at
    #  the time it might not have because of a separate bug). See if it makes
    #  a difference which one we use here.
    ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC")

    orig = DataFrame([{"one": 0, "two": ts}])
    expected = DataFrame(
        {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")}
    )

    # Row-label-only indexer.
    df = orig.copy()
    df.loc[1] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)

    # Explicit (row, all-columns) indexer.
    df = orig.copy()
    df.loc[1, :] = [np.nan, NaT]
    tm.assert_frame_equal(df, expected)


@pytest.mark.xfail(reason="Too many columns cast to float64")
def test_20476():
    """
    Assigning a frame to one top-level key of MultiIndex columns should
    only give float64 to the column that receives float values.
    """
    mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
    df = DataFrame(-1, index=range(3), columns=mi)
    filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"])
    df["A"] = filler

    expected = DataFrame(
        {
            0: [1, 1, 1],
            1: [2, 2, 2],
            2: [3.0, 3.0, 3.0],
            3: [-1, -1, -1],
            4: [-1, -1, -1],
            5: [-1, -1, -1],
        }
    )
    expected.columns = mi
    exp_dtypes = Series(
        [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3,
        index=mi,
    )
    # Check dtypes first so a failure points directly at the unwanted cast ...
    tm.assert_series_equal(df.dtypes, exp_dtypes)
    # ... then check the values too; `expected` was previously built but
    #  never compared against the result.
    tm.assert_frame_equal(df, expected)


def test_setitem_int_as_positional_fallback_deprecation():
# GH#42215 deprecated falling back to positional on __setitem__ with an
# int not contained in the index
Expand Down