pandas-dev · jreback · Oct 23, 2020 · Oct 23, 2020 · Oct 23, 2020 · Oct 23, 2020
diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py
@@ -0,0 +1,383 @@
+import datetime as dt
+from datetime import datetime
+from itertools import combinations
+
+import dateutil
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, Series, Timestamp, concat, isna
+import pandas._testing as tm
+
+
+@pytest.fixture(params=[True, False])
+def sort(request):
+    """
+    Boolean sort keyword for concat and DataFrame.append.
+
+    Parametrized over ``True`` and ``False``, so each test that requests this
+    fixture runs once per value.
+    """
+    return request.param
+
+
+class TestAppend:
+    def test_append(self, sort, float_frame):
+        """
+        Exercise DataFrame.append on row-wise pieces of ``float_frame``.
+
+        Covers a plain round trip, appending when only one side has column
+        "A", and appending frames that differ by the extra "foo" column.
+        """
+        mixed_frame = float_frame.copy()
+        mixed_frame["foo"] = "bar"
+
+        # split the row labels into the first five and the remainder
+        begin_index = float_frame.index[:5]
+        end_index = float_frame.index[5:]
+
+        begin_frame = float_frame.reindex(begin_index)
+        end_frame = float_frame.reindex(end_index)
+
+        # appending the two pieces back together reproduces column "A"
+        appended = begin_frame.append(end_frame)
+        tm.assert_almost_equal(appended["A"], float_frame["A"])
+
+        # column "A" must still be present when only one operand has it,
+        # whichever operand is the caller
+        del end_frame["A"]
+        partial_appended = begin_frame.append(end_frame, sort=sort)
+        assert "A" in partial_appended
+
+        partial_appended = end_frame.append(begin_frame, sort=sort)
+        assert "A" in partial_appended
+
+        # mixed type handling
+        appended = mixed_frame[:5].append(mixed_frame[5:])
+        tm.assert_frame_equal(appended, mixed_frame)
+
+        # only one operand carries the "foo" column, in either order
+        mixed_appended = mixed_frame[:5].append(float_frame[5:], sort=sort)
+        mixed_appended2 = float_frame[:5].append(mixed_frame[5:], sort=sort)
+
+        # all equal except 'foo' column
+        tm.assert_frame_equal(
+            mixed_appended.reindex(columns=["A", "B", "C", "D"]),
+            mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
+        )
+
+    def test_append_empty(self, float_frame):
+        """Appending an empty frame, in either order, gives an equal new object."""
+        empty = DataFrame()
+
+        # non-empty.append(empty): same contents, but not the same object
+        appended = float_frame.append(empty)
+        tm.assert_frame_equal(float_frame, appended)
+        assert appended is not float_frame
+
+        # empty.append(non-empty): same expectations
+        appended = empty.append(float_frame)
+        tm.assert_frame_equal(float_frame, appended)
+        assert appended is not float_frame
+
+    def test_append_overlap_raises(self, float_frame):
+        """Self-append with ``verify_integrity=True`` raises ValueError."""
+        msg = "Indexes have overlapping values"
+        # appending a frame to itself duplicates every row label
+        with pytest.raises(ValueError, match=msg):
+            float_frame.append(float_frame, verify_integrity=True)
+
+    def test_append_new_columns(self):
+        """Appending a named Series with an unseen label adds a row and a column."""
+        # see gh-6129: new columns
+        df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
+        # "c" is not a column of df; the Series name "z" becomes the new row label
+        row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
+        expected = DataFrame(
+            {
+                "a": {"x": 1, "y": 2, "z": 5},
+                "b": {"x": 3, "y": 4, "z": 6},
+                "c": {"z": 7},
+            }
+        )
+        result = df.append(row)
+        tm.assert_frame_equal(result, expected)
+
+    def test_append_length0_frame(self, sort):
+        """Appending to a zero-row frame keeps all of the caller's columns."""
+        # df has columns but no rows; df3 has rows but lacks column "C"
+        df = DataFrame(columns=["A", "B", "C"])
+        df3 = DataFrame(index=[0, 1], columns=["A", "B"])
+        df5 = df.append(df3, sort=sort)
+
+        expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
+        tm.assert_frame_equal(df5, expected)
+
+    def test_append_records(self):
+        """
+        Append frames built from NumPy structured arrays.
+
+        The result must equal a frame built from the concatenated arrays.
+        """
+        # structured dtype with three fields: int32, float32, 10-byte string
+        arr1 = np.zeros((2,), dtype=("i4,f4,a10"))
+        arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
+
+        arr2 = np.zeros((3,), dtype=("i4,f4,a10"))
+        arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
+
+        df1 = DataFrame(arr1)
+        df2 = DataFrame(arr2)
+
+        result = df1.append(df2, ignore_index=True)
+        expected = DataFrame(np.concatenate((arr1, arr2)))
+        tm.assert_frame_equal(result, expected)
+
+    # NOTE: this test runs with the module-level ``sort`` fixture (True / False
+    # only); the default ``sort=None`` is not exercised here.
+    def test_append_sorts(self, sort):
+        """Column order of the appended result follows the ``sort`` keyword."""
+        # df1's columns are deliberately unsorted ("b" before "a")
+        df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
+        df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
+
+        # the append call itself must not emit any warning
+        with tm.assert_produces_warning(None):
+            result = df1.append(df2, sort=sort)
+
+        # expected column order when sort is True: "a", "b", "c"
+        expected = DataFrame(
+            {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
+            columns=["a", "b", "c"],
+        )
+        # with sort=False the caller's order comes first, then the new column
+        if sort is False:
+            expected = expected[["b", "a", "c"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_append_different_columns(self, sort):
+        """
+        Appending frames with partly different columns fills the gaps with NA.
+
+        The first five rows come from a frame without "strings" and the last
+        five from a frame without "bools", so those cells must be missing.
+        """
+        df = DataFrame(
+            {
+                "bools": np.random.randn(10) > 0,
+                "ints": np.random.randint(0, 10, 10),
+                "floats": np.random.randn(10),
+                "strings": ["foo", "bar"] * 5,
+            }
+        )
+
+        a = df[:5].loc[:, ["bools", "ints", "floats"]]
+        b = df[5:].loc[:, ["strings", "ints", "floats"]]
+
+        appended = a.append(b, sort=sort)
+        # rows 0-4 come from ``a`` and rows 5-9 from ``b``; check all five
+        # leading rows (a ``0:4`` slice would leave row 4 unverified)
+        assert isna(appended["strings"][:5]).all()
+        assert isna(appended["bools"][5:]).all()
+
+    def test_append_many(self, sort, float_frame):
+        """Append a list of frames in a single call."""
+        # consecutive row slices that together cover the whole frame
+        chunks = [
+            float_frame[:5],
+            float_frame[5:10],
+            float_frame[10:15],
+            float_frame[15:],
+        ]
+
+        result = chunks[0].append(chunks[1:])
+        tm.assert_frame_equal(result, float_frame)
+
+        # give only the last chunk an extra column; copy first so the column
+        # is added to a new object rather than to a slice of float_frame
+        chunks[-1] = chunks[-1].copy()
+        chunks[-1]["foo"] = "bar"
+        result = chunks[0].append(chunks[1:], sort=sort)
+        tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
+        # "foo" is filled only for rows from the last chunk, missing elsewhere
+        assert (result["foo"][15:] == "bar").all()
+        assert result["foo"][:15].isna().all()
+
+    def test_append_preserve_index_name(self):
+        """The index name survives appending to a zero-row frame."""
+        # #980
+        # both frames use column "A" as their index; df1 has no rows
+        df1 = DataFrame(columns=["A", "B", "C"])
+        df1 = df1.set_index(["A"])
+        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
+        df2 = df2.set_index(["A"])
+
+        result = df1.append(df2)
+        assert result.index.name == "A"
+
+    # Three-element indexes of different types, used as DataFrame columns and
+    # Series indexes by the parametrized tests below.
+    indexes_can_append = [
+        pd.RangeIndex(3),
+        Index([4, 5, 6]),
+        Index([4.5, 5.5, 6.5]),
+        Index(list("abc")),
+        pd.CategoricalIndex("A B C".split()),
+        pd.CategoricalIndex("D E F".split(), ordered=True),
+        pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
+        pd.DatetimeIndex(
+            [
+                dt.datetime(2013, 1, 3, 0, 0),
+                dt.datetime(2013, 1, 3, 6, 10),
+                dt.datetime(2013, 1, 3, 7, 12),
+            ]
+        ),
+    ]
+
+    # Combining this with any index from ``indexes_can_append`` is expected to
+    # raise; see test_append_different_columns_types_raises.
+    indexes_cannot_append_with_other = [
+        pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()])
+    ]
+
+    # every index above, for tests where both operands share one index type
+    all_indexes = indexes_can_append + indexes_cannot_append_with_other
+
+    @pytest.mark.parametrize("index", all_indexes, ids=lambda x: type(x).__name__)
+    def test_append_same_columns_type(self, index):
+        """
+        Append a Series whose index is a subset or superset of the columns.
+
+        The result's columns must equal the wider of the two indexes.
+        """
+        # GH18359
+
+        # df wider than ser
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
+        ser_index = index[:2]
+        ser = Series([7, 8], index=ser_index, name=2)
+        result = df.append(ser)
+        # the appended row has no value for the third column, hence the NaN
+        expected = DataFrame(
+            [[1.0, 2.0, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # ser wider than df
+        ser_index = index
+        index = index[:2]
+        df = DataFrame([[1, 2], [4, 5]], columns=index)
+        ser = Series([7, 8, 9], index=ser_index, name=2)
+        result = df.append(ser)
+        # the existing rows have no value for the new third column
+        expected = DataFrame(
+            [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
+            index=[0, 1, 2],
+            columns=ser_index,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "df_columns, series_index",
+        combinations(indexes_can_append, r=2),
+        ids=lambda x: type(x).__name__,
+    )
+    def test_append_different_columns_types(self, df_columns, series_index):
+        """
+        Append a Series whose index differs from the frame's columns.
+
+        Parametrized over every unordered pair from ``indexes_can_append``.
+        """
+        # GH18359
+        # See also test 'test_append_different_columns_types_raises' below
+        # for errors raised when appending
+
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
+        ser = Series([7, 8, 9], index=series_index, name=2)
+
+        result = df.append(ser)
+        # expected columns: the frame's labels, then the Series labels that
+        # are not already among them
+        idx_diff = ser.index.difference(df_columns)
+        combined_columns = Index(df_columns.tolist()).append(idx_diff)
+        # the six-column layout assumes the two indexes share no labels
+        expected = DataFrame(
+            [
+                [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
+                [4, 5, 6, np.nan, np.nan, np.nan],
+                [np.nan, np.nan, np.nan, 7, 8, 9],
+            ],
+            index=[0, 1, 2],
+            columns=combined_columns,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "index_can_append", indexes_can_append, ids=lambda x: type(x).__name__
+    )
+    @pytest.mark.parametrize(
+        "index_cannot_append_with_other",
+        indexes_cannot_append_with_other,
+        ids=lambda x: type(x).__name__,
+    )
+    def test_append_different_columns_types_raises(
+        self, index_can_append, index_cannot_append_with_other
+    ):
+        """Mixing a MultiIndex with another index type raises TypeError."""
+        # GH18359
+        # Dataframe.append will raise if MultiIndex appends
+        # or is appended to a different index type
+        #
+        # See also test 'test_append_different_columns_types' above for
+        # appending without raising.
+
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_can_append)
+        ser = Series([7, 8, 9], index=index_cannot_append_with_other, name=2)
+        # NOTE(review): the pattern ends with a trailing "|", i.e. an empty
+        # alternative that matches any string, so ``match=msg`` presumably
+        # accepts every TypeError message. Confirm the real messages before
+        # tightening the pattern.
+        msg = (
+            r"Expected tuple, got (int|long|float|str|"
+            r"pandas._libs.interval.Interval)|"
+            r"object of type '(int|float|Timestamp|"
+            r"pandas._libs.interval.Interval)' has no len\(\)|"
+        )
+        with pytest.raises(TypeError, match=msg):
+            df.append(ser)
+
+        # same check with the roles swapped: MultiIndex columns, plain Series index
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index_cannot_append_with_other)
+        ser = Series([7, 8, 9], index=index_can_append, name=2)
+
+        with pytest.raises(TypeError, match=msg):
+            df.append(ser)
+
+    def test_append_dtype_coerce(self, sort):
+        """Appending frames of datetimes with a column missing on one side."""
+
+        # GH 4993
+        # appending with datetime will incorrectly convert datetime64
+
+        # df1 has only "start_time"; df2 has both "start_time" and "end_time"
+        df1 = DataFrame(
+            index=[1, 2],
+            data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
+            columns=["start_time"],
+        )
+        df2 = DataFrame(
+            index=[4, 5],
+            data=[
+                [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
+                [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
+            ],
+            columns=["start_time", "end_time"],
+        )
+
+        # "end_time" is NaT for the two rows contributed by df1
+        expected = concat(
+            [
+                Series(
+                    [
+                        pd.NaT,
+                        pd.NaT,
+                        dt.datetime(2013, 1, 3, 6, 10),
+                        dt.datetime(2013, 1, 4, 7, 10),
+                    ],
+                    name="end_time",
+                ),
+                Series(
+                    [
+                        dt.datetime(2013, 1, 1, 0, 0),
+                        dt.datetime(2013, 1, 2, 0, 0),
+                        dt.datetime(2013, 1, 3, 0, 0),
+                        dt.datetime(2013, 1, 4, 0, 0),
+                    ],
+                    name="start_time",
+                ),
+            ],
+            axis=1,
+            sort=sort,
+        )
+        result = df1.append(df2, ignore_index=True, sort=sort)
+        # put the expected columns in the order each sort setting should give:
+        # alphabetical when sorting, otherwise the caller's column first
+        if sort:
+            expected = expected[["end_time", "start_time"]]
+        else:
+            expected = expected[["start_time", "end_time"]]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_append_missing_column_proper_upcast(self, sort):
+        """Columns that gain missing rows are upcast: int64 -> f8, bool -> object."""
+        # the two frames share no columns, so each column gets missing rows
+        df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
+        df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
+
+        appended = df1.append(df2, ignore_index=True, sort=sort)
+        assert appended["A"].dtype == "f8"
+        assert appended["B"].dtype == "O"
+
+    def test_append_empty_frame_to_series_with_dateutil_tz(self):
+        """Append a Series holding a dateutil-tz Timestamp to a zero-row frame."""
+        # GH 23682
+        date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
+        s = Series({"date": date, "a": 1.0, "b": 2.0})
+        # the frame has no rows and shares no column labels with the Series
+        df = DataFrame(columns=["c", "d"])
+        result_a = df.append(s, ignore_index=True)
+        expected = DataFrame(
+            [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
+        )
+        # These columns get cast to object after append
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+        tm.assert_frame_equal(result_a, expected)
+
+        # appending the same Series a second time gives two identical rows
+        expected = DataFrame(
+            [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
+        )
+        expected["c"] = expected["c"].astype(object)
+        expected["d"] = expected["d"].astype(object)
+
+        result_b = result_a.append(s, ignore_index=True)
+        tm.assert_frame_equal(result_b, expected)
+
+        # column order is different
+        # (here both Series are appended in one call, as a list)
+        expected = expected[["c", "d", "date", "a", "b"]]
+        result = df.append([s, s], ignore_index=True)
+        tm.assert_frame_equal(result, expected)
+
+    def test_append_empty_tz_frame_with_datetime64ns(self):
+        """Append NaT to a zero-row tz-aware frame; the result is tz-naive."""
+        # https://github.com/pandas-dev/pandas/issues/35460
+        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
+
+        # pd.NaT gets inferred as tz-naive, so append result is tz-naive
+        result = df.append({"a": pd.NaT}, ignore_index=True)
+        expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
+        tm.assert_frame_equal(result, expected)
+
+        # also test with typed value to append
+        # (a Series explicitly typed as tz-naive datetime64[ns])
+        df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
+        result = df.append(
+            Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True
+        )
+        expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
+        tm.assert_frame_equal(result, expected)