diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9cd3a4e91eaf8..b6c80a51cec16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -201,6 +201,7 @@ if TYPE_CHECKING: import datetime + from pandas._libs.internals import BlockValuesRefs from pandas._typing import ( AggFuncType, AnyAll, @@ -3923,11 +3924,11 @@ def isetitem(self, loc, value) -> None: ) for i, idx in enumerate(loc): - arraylike = self._sanitize_column(value.iloc[:, i]) + arraylike, _ = self._sanitize_column(value.iloc[:, i]) self._iset_item_mgr(idx, arraylike, inplace=False) return - arraylike = self._sanitize_column(value) + arraylike, _ = self._sanitize_column(value) self._iset_item_mgr(loc, arraylike, inplace=False) def __setitem__(self, key, value): @@ -4098,7 +4099,7 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: return # now align rows - arraylike = _reindex_for_setitem(value, self.index) + arraylike, _ = _reindex_for_setitem(value, self.index) self._set_item_mgr(key, arraylike) return @@ -4111,20 +4112,26 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: self[key] = value[value.columns[0]] def _iset_item_mgr( - self, loc: int | slice | np.ndarray, value, inplace: bool = False + self, + loc: int | slice | np.ndarray, + value, + inplace: bool = False, + refs: BlockValuesRefs | None = None, ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc - self._mgr.iset(loc, value, inplace=inplace) + self._mgr.iset(loc, value, inplace=inplace, refs=refs) self._clear_item_cache() - def _set_item_mgr(self, key, value: ArrayLike) -> None: + def _set_item_mgr( + self, key, value: ArrayLike, refs: BlockValuesRefs | None = None + ) -> None: try: loc = self._info_axis.get_loc(key) except KeyError: # This item wasn't present, just insert at end - self._mgr.insert(len(self._info_axis), key, value) + self._mgr.insert(len(self._info_axis), key, value, refs) else: - self._iset_item_mgr(loc, value) + self._iset_item_mgr(loc, value, refs=refs) # check if we are modifying a copy # try to set first as we want an invalid @@ -4154,7 +4161,7 @@ def _set_item(self, key, value) -> None: Series/TimeSeries will be conformed to the DataFrames index to ensure homogeneity. """ - value = self._sanitize_column(value) + value, refs = self._sanitize_column(value, using_cow=using_copy_on_write()) if ( key in self.columns @@ -4166,8 +4173,9 @@ def _set_item(self, key, value) -> None: existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)).T + refs = None - self._set_item_mgr(key, value) + self._set_item_mgr(key, value, refs) def _set_value( self, index: IndexLabel, col, value: Scalar, takeable: bool = False @@ -4795,7 +4803,7 @@ def insert( elif isinstance(value, DataFrame): value = value.iloc[:, 0] - value = self._sanitize_column(value) + value, _ = self._sanitize_column(value) self._mgr.insert(loc, column, value) def assign(self, **kwargs) -> DataFrame: @@ -4866,10 +4874,13 @@ def assign(self, **kwargs) -> DataFrame: data[k] = com.apply_if_callable(v, data) return data - def _sanitize_column(self, value) -> ArrayLike: + def _sanitize_column( + self, value, using_cow: bool = False + ) -> tuple[ArrayLike, BlockValuesRefs | None]: """ Ensures new columns (which go into the BlockManager as new blocks) are - always copied and converted into an array. + always copied (or a reference is being tracked to them under CoW) + and converted into an array. Parameters ---------- @@ -4877,18 +4888,20 @@ def _sanitize_column(self, value) -> ArrayLike: Returns ------- - numpy.ndarray or ExtensionArray + tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs """ self._ensure_valid_index(value) # Using a DataFrame would mean coercing values to one dtype assert not isinstance(value, DataFrame) if is_dict_like(value): - return _reindex_for_setitem(Series(value), self.index) + if not isinstance(value, Series): + value = Series(value) + return _reindex_for_setitem(value, self.index, using_cow=using_cow) if is_list_like(value): com.require_length_match(value, self.index) - return sanitize_array(value, self.index, copy=True, allow_2d=True) + return sanitize_array(value, self.index, copy=True, allow_2d=True), None @property def _series(self): @@ -11899,11 +11912,15 @@ def _from_nested_dict(data) -> collections.defaultdict: return new_data -def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: +def _reindex_for_setitem( + value: DataFrame | Series, index: Index, using_cow: bool = False +) -> tuple[ArrayLike, BlockValuesRefs | None]: # reindex if necessary if value.index.equals(index) or not len(index): - return value._values.copy() + if using_cow and isinstance(value, Series): + return value._values, value._references + return value._values.copy(), None # GH#4107 try: @@ -11917,4 +11934,4 @@ def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: raise TypeError( "incompatible index of inserted column with frame index" ) from err - return reindexed_value + return reindexed_value, None diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index c1364db88bfdd..0d2058e66cfab 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -798,7 +798,11 @@ def column_arrays(self) -> list[ArrayLike]: return [np.asarray(arr) for arr in self.arrays] def iset( - self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs=None, ) -> None: """ Set new column(s). @@ -874,7 +878,7 @@ def column_setitem( # update existing ArrayManager in-place self.arrays[loc] = new_mgr.arrays[0] - def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3c7b238b6c390..397f9d5b1bbe6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1129,7 +1129,11 @@ def column_arrays(self) -> list[np.ndarray]: return result # type: ignore[return-value] def iset( - self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + self, + loc: int | slice | np.ndarray, + value: ArrayLike, + inplace: bool = False, + refs: BlockValuesRefs | None = None, ): """ Set new item in-place. Does not consolidate. Adds new Block if not @@ -1170,6 +1174,7 @@ def iset( inplace=inplace, blkno=blkno, blk=blk, + refs=refs, ) # error: Incompatible types in assignment (expression has type @@ -1215,7 +1220,7 @@ def value_getitem(placement): continue else: # Defer setting the new values to enable consolidation - self._iset_split_block(blkno_l, blk_locs) + self._iset_split_block(blkno_l, blk_locs, refs=refs) if len(removed_blknos): # Remove blocks & update blknos accordingly @@ -1245,6 +1250,7 @@ def value_getitem(placement): new_block_2d( values=value, placement=BlockPlacement(slice(mgr_loc, mgr_loc + 1)), + refs=refs, ) for mgr_loc in unfit_idxr ) @@ -1260,6 +1266,7 @@ def value_getitem(placement): new_block_2d( values=value_getitem(unfit_val_items), placement=BlockPlacement(unfit_idxr), + refs=refs, ) ) @@ -1276,6 +1283,7 @@ def _iset_split_block( blkno_l: int, blk_locs: np.ndarray | list[int], value: ArrayLike | None = None, + refs: BlockValuesRefs | None = None, ) -> None: """Removes columns from a block by splitting the block. @@ -1288,6 +1296,7 @@ def _iset_split_block( blkno_l: The block number to operate on, relevant for updating the manager blk_locs: The locations of our block that should be deleted. value: The value to set as a replacement. + refs: The reference tracking object of the value to set. """ blk = self.blocks[blkno_l] @@ -1297,7 +1306,7 @@ def _iset_split_block( nbs_tup = tuple(blk.delete(blk_locs)) if value is not None: locs = blk.mgr_locs.as_array[blk_locs] - first_nb = new_block_2d(value, BlockPlacement(locs)) + first_nb = new_block_2d(value, BlockPlacement(locs), refs=refs) else: first_nb = nbs_tup[0] nbs_tup = tuple(nbs_tup[1:]) @@ -1319,7 +1328,13 @@ def _iset_split_block( self._blknos[nb.mgr_locs.indexer] = i + nr_blocks def _iset_single( - self, loc: int, value: ArrayLike, inplace: bool, blkno: int, blk: Block + self, + loc: int, + value: ArrayLike, + inplace: bool, + blkno: int, + blk: Block, + refs: BlockValuesRefs | None = None, ) -> None: """ Fastpath for iset when we are only setting a single position and @@ -1339,7 +1354,7 @@ def _iset_single( blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return - nb = new_block_2d(value, placement=blk._mgr_locs) + nb = new_block_2d(value, placement=blk._mgr_locs, refs=refs) old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks @@ -1377,7 +1392,7 @@ def column_setitem( new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) - def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: + def insert(self, loc: int, item: Hashable, value: ArrayLike, refs=None) -> None: """ Insert item at selected position. @@ -1386,6 +1401,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: loc : int item : hashable value : np.ndarray or ExtensionArray + refs : The reference tracking object of the value to set. """ # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1401,7 +1417,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: bp = BlockPlacement(slice(loc, loc + 1)) # TODO(CoW) do we always "own" the passed `value`? - block = new_block_2d(values=value, placement=bp) + block = new_block_2d(values=value, placement=bp, refs=refs) if not len(self.blocks): # Fastpath diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index acbc64628f166..8bb5de708a741 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -977,28 +977,24 @@ def test_column_as_series_no_item_cache( # TODO add tests for other indexing methods on the Series -def test_dataframe_add_column_from_series(backend): +def test_dataframe_add_column_from_series(backend, using_copy_on_write): # Case: adding a new column to a DataFrame from an existing column/series - # -> always already takes a copy on assignment - # (no change in behaviour here) - # TODO can we achieve the same behaviour with Copy-on-Write? + # -> delays copy under CoW _, DataFrame, Series = backend df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) s = Series([10, 11, 12]) df["new"] = s - assert not np.shares_memory(get_array(df, "new"), s.values) + if using_copy_on_write: + assert np.shares_memory(get_array(df, "new"), get_array(s)) + else: + assert not np.shares_memory(get_array(df, "new"), get_array(s)) # editing series -> doesn't modify column in frame s[0] = 0 expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) tm.assert_frame_equal(df, expected) - # editing column in frame -> doesn't modify series - df.loc[2, "new"] = 100 - expected_s = Series([0, 11, 12]) - tm.assert_series_equal(s, expected_s) - @pytest.mark.parametrize("val", [100, "a"]) @pytest.mark.parametrize( diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index 9e0d350dde0de..2a99a00e249fa 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -3,10 +3,12 @@ from pandas import ( DataFrame, Index, + MultiIndex, RangeIndex, Series, ) import pandas._testing as tm +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Copy/view behaviour for the values that are set in a DataFrame @@ -20,7 +22,7 @@ def test_set_column_with_array(): df["c"] = arr # the array data is copied - assert not np.shares_memory(df["c"].values, arr) + assert not np.shares_memory(get_array(df, "c"), arr) # and thus modifying the array does not modify the DataFrame arr[0] = 0 tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c")) @@ -28,18 +30,17 @@ def test_set_column_with_array(): def test_set_column_with_series(using_copy_on_write): # Case: setting a series as a new column (df[col] = s) copies that data + # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) ser = Series([1, 2, 3]) df["c"] = ser if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, ser.values) - assert not np.shares_memory(df["c"].values, ser.values) + assert np.shares_memory(get_array(df, "c"), get_array(ser)) else: # the series data is copied - assert not np.shares_memory(df["c"].values, ser.values) + assert not np.shares_memory(get_array(df, "c"), get_array(ser)) # and modifying the series does not modify the DataFrame ser.iloc[0] = 0 @@ -55,7 +56,7 @@ def test_set_column_with_index(using_copy_on_write): df["c"] = idx # the index data is copied - assert not np.shares_memory(df["c"].values, idx.values) + assert not np.shares_memory(get_array(df, "c"), idx.values) # and thus modifying the index does not modify the DataFrame idx.values[0] = 0 @@ -66,26 +67,82 @@ def test_set_column_with_index(using_copy_on_write): df["d"] = idx - assert not np.shares_memory(df["d"].values, arr) + assert not np.shares_memory(get_array(df, "d"), arr) arr[0] = 0 tm.assert_series_equal(df["d"], Series([1, 2, 3], name="d")) def test_set_columns_with_dataframe(using_copy_on_write): # Case: setting a DataFrame as new columns copies that data + # (with delayed copy with CoW) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) df[["c", "d"]] = df2 if using_copy_on_write: - # TODO(CoW) with CoW we can delay the copy - # assert np.shares_memory(df["c"].values, df2["c"].values) - assert not np.shares_memory(df["c"].values, df2["c"].values) + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) else: # the data is copied - assert not np.shares_memory(df["c"].values, df2["c"].values) + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) # and modifying the set DataFrame does not modify the original DataFrame df2.iloc[0, 0] = 0 tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c")) + + +def test_setitem_series_no_copy(using_copy_on_write): + # Case: setting a Series as column into a DataFrame can delay copying that data + df = DataFrame({"a": [1, 2, 3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + # adding a new column + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_single_block(using_copy_on_write): + # Overwriting an existing column that is a single block + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + df["a"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "a")) + + df.iloc[0, 0] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_no_copy_split_block(using_copy_on_write): + # Overwriting an existing column that is part of a larger block + df = DataFrame({"a": [1, 2, 3], "b": 1}) + rhs = Series([4, 5, 6]) + rhs_orig = rhs.copy() + + df["b"] = rhs + if using_copy_on_write: + assert np.shares_memory(get_array(rhs), get_array(df, "b")) + + df.iloc[0, 1] = 100 + tm.assert_series_equal(rhs, rhs_orig) + + +def test_setitem_series_column_midx_broadcasting(using_copy_on_write): + # Setting a Series to multiple columns will repeat the data + # (currently copying the data eagerly) + df = DataFrame( + [[1, 2, 3], [3, 4, 5]], + columns=MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]]), + ) + rhs = Series([10, 11]) + df["a"] = rhs + assert not np.shares_memory(get_array(rhs), df._get_column_array(0)) + if using_copy_on_write: + assert df._mgr._has_no_reference(0)