From 9183b31c81306b1ed043cb877644a1509e58b061 Mon Sep 17 00:00:00 2001 From: Abhijit Deo <72816663+abhi-glitchhg@users.noreply.github.com> Date: Fri, 20 Oct 2023 22:16:23 +0530 Subject: [PATCH 01/12] [Documentation] Added another example in `df.clip` documentation. (#55589) * Update generic.py * Update generic.py * Update generic.py Fix for issue #55509 (updating intersection only) fix typo - black Added test --- pandas/core/frame.py | 8 ++++---- pandas/core/generic.py | 10 ++++++++++ pandas/tests/frame/methods/test_update.py | 9 +++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70a5ac69011d1..620a8ed6a7b5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8857,11 +8857,11 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - other = other.reindex(self.index) + indexes_intersection = self.index.intersection(other.index) for col in self.columns.intersection(other.columns): - this = self[col]._values - that = other[col]._values + this = self[col].loc[indexes_intersection]._values + that = other[col].loc[indexes_intersection]._values if filter_func is not None: mask = ~filter_func(this) | isna(that) @@ -8881,7 +8881,7 @@ def update( if mask.all(): continue - self.loc[:, col] = expressions.where(mask, this, that) + self.loc[indexes_intersection, col] = expressions.where(mask, this, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1ae4c3cdfc458..c525003cabd10 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8785,6 +8785,16 @@ def clip( 3 -1 6 4 5 -4 + Clips using specific lower and upper thresholds per column: + + >>> df.clip([-2, -1], [4,5]) + col_0 col_1 + 0 4 -1 + 1 -2 -1 + 2 0 5 + 3 -1 5 + 4 4 -1 + Clips using specific lower and upper thresholds per column element: >>> t = pd.Series([2, -4, -1, 6, 3]) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 5738a25f26fcb..3c6529c4b0b16 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -177,3 +177,12 @@ def test_update_dt_column_with_NaT_create_column(self): {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]} ) tm.assert_frame_equal(df, expected) + + def test_update_preserve_column_dtype_bool(self): + # GH#55509 + df = DataFrame({"A": [True, True]}, index=[1, 2]) + other = DataFrame({"A": [False]}, index=[1]) + expected = DataFrame({"A": [False, True]}, index=[1, 2]) + df.update(other) + + tm.assert_frame_equal(df, expected) From af983c802384109a73a33b580814e0be7cf7f56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20A=2E=20Barbosa?= Date: Sun, 22 Oct 2023 11:09:51 -0300 Subject: [PATCH 02/12] BUG: ensure dataFrame.update reads only changing rows (#55509) --- doc/source/whatsnew/v2.1.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..9697b08f966a7 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -27,6 +27,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`) +- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`) - Fixed bug in :meth:`Series.all` and :meth:`Series.any` not treating missing values correctly for ``dtype="string[pyarrow_numpy]"`` (:issue:`55367`) - Fixed bug in :meth:`Series.floordiv` for :class:`ArrowDtype` (:issue:`55561`) From 82ecad25d3454fe8e1a980902ba15a712fed5cd1 Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Wed, 1 Nov 2023 13:12:07 -0300 Subject: [PATCH 03/12] fix test not guarding FutureWarning properly --- pandas/tests/frame/methods/test_update.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 3c6529c4b0b16..9f41bac6c3b35 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -144,10 +144,10 @@ def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - if using_copy_on_write: - df.update({"c": Series(["foo"], index=[0])}) - else: - with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): + if using_copy_on_write: + df.update({"c": Series(["foo"], index=[0])}) + else: df["c"].update(Series(["foo"], index=[0])) expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) From 07046c01072e44228c40a6b01003443b97a8b5a1 Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Wed, 1 Nov 2023 14:18:58 -0300 Subject: [PATCH 04/12] PERF: Avoid chained index - without _values --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 620a8ed6a7b5f..1bae8cda49269 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8860,8 +8860,8 @@ def update( indexes_intersection = self.index.intersection(other.index) for col in self.columns.intersection(other.columns): - this = self[col].loc[indexes_intersection]._values - that = other[col].loc[indexes_intersection]._values + this = self.loc[indexes_intersection, col] + that = other.loc[indexes_intersection, col] if filter_func is not None: mask = ~filter_func(this) | isna(that) From e8ee23330a020c873fb684d42757ed4fe192d1d0 Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Sun, 5 Nov 2023 10:27:37 -0300 Subject: [PATCH 05/12] PERF: include _values --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1bae8cda49269..6bb292881ad96 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8860,8 +8860,8 @@ def update( indexes_intersection = self.index.intersection(other.index) for col in self.columns.intersection(other.columns): - this = self.loc[indexes_intersection, col] - that = other.loc[indexes_intersection, col] + this = self.loc[indexes_intersection, col]._values + that = other.loc[indexes_intersection, col]._values if filter_func is not None: mask = ~filter_func(this) | isna(that) From 85a107e3ee2f976882c42df49dc54ef23dc36277 Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Sun, 5 Nov 2023 19:27:04 -0300 Subject: [PATCH 06/12] TST: test that update preserve dtype (#55509) --- pandas/tests/frame/methods/test_update.py | 30 +++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 9f41bac6c3b35..0985b2fd1dcbf 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -178,11 +178,31 @@ def test_update_dt_column_with_NaT_create_column(self): ) tm.assert_frame_equal(df, expected) - def test_update_preserve_column_dtype_bool(self): + @pytest.mark.parametrize( + "value_df, value_other, dtype", + [ + (True, False, bool), + (1, 2, int), + (np.uint64(1), np.uint(2), np.dtype("uint64")), + (1.0, 2.0, float), + (1.0 + 1j, 2.0 + 2j, complex), + ("a", "b", pd.StringDtype()), + ( + pd.to_timedelta("1 ms"), + pd.to_timedelta("2 ms"), + np.dtype("timedelta64[ns]"), + ), + ( + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-02T00:00:00"), + np.dtype("datetime64[ns]"), + ), + ], + ) + def test_update_preserve_dtype(self, value_df, value_other, dtype): # GH#55509 - df = DataFrame({"A": [True, True]}, index=[1, 2]) - other = DataFrame({"A": [False]}, index=[1]) - expected = DataFrame({"A": [False, True]}, index=[1, 2]) + df = DataFrame({"a": [value_df] * 2}, index=[1, 2]) + other = DataFrame({"a": [value_other]}, index=[1]) + expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2]) df.update(other) - tm.assert_frame_equal(df, expected) From b2acbc4e41576b880f829eb4fd7fb95dcbac1cf3 Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Sun, 5 Nov 2023 20:28:45 -0300 Subject: [PATCH 07/12] TST: update raises on duplicate arg index (#55509) --- pandas/tests/frame/methods/test_update.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 0985b2fd1dcbf..cde9f9786ad9e 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -206,3 +206,10 @@ def test_update_preserve_dtype(self, value_df, value_other, dtype): expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2]) df.update(other) tm.assert_frame_equal(df, expected) + + def test_update_raises_on_duplicate_argument_index(self): + # GH#55509 + df = DataFrame({"a": [True, True]}, index=[1, 2]) + other = DataFrame({"a": [False, False]}, index=[1, 1]) + with pytest.raises(ValueError, match="duplicate argument index"): + df.update(other) From c7958e0051e16c9b0c4497b4d9d2ee3e7388bc3b Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Mon, 6 Nov 2023 10:46:54 -0300 Subject: [PATCH 08/12] TST: msg on update with duplicate indexes (#55509) --- pandas/tests/frame/methods/test_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index cde9f9786ad9e..57d2f6e6c2c57 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -211,5 +211,5 @@ def test_update_raises_on_duplicate_argument_index(self): # GH#55509 df = DataFrame({"a": [True, True]}, index=[1, 2]) other = DataFrame({"a": [False, False]}, index=[1, 1]) - with pytest.raises(ValueError, match="duplicate argument index"): + with pytest.raises(ValueError, match="duplicate index"): df.update(other) From 2635c0598d38436df22dccb01e2c3b51cfb0fcac Mon Sep 17 00:00:00 2001 From: Marco Barbosa Date: Mon, 6 Nov 2023 10:52:48 -0300 Subject: [PATCH 09/12] BUG: Raise dupl. + no common idx on update #55509 --- pandas/core/frame.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e372bd5034e0d..35e3de38f2105 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8859,10 +8859,22 @@ def update( other = DataFrame(other) indexes_intersection = self.index.intersection(other.index) + if not len(indexes_intersection): + raise ValueError( + "Can't update dataframe when other has no element in common." + ) for col in self.columns.intersection(other.columns): - this = self.loc[indexes_intersection, col]._values - that = other.loc[indexes_intersection, col]._values + this = self.loc[indexes_intersection, col] + that = other.loc[indexes_intersection, col] + + if this.index.has_duplicates or that.index.has_duplicates: + raise ValueError( + "Update not allowed with duplicate indexes on dataframe or other." + ) + + this = this._values + that = that._values if filter_func is not None: mask = ~filter_func(this) | isna(that) From bb41261f30b08dd9e5eff0b0b3a5acfa0060f42e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Wed, 8 Nov 2023 21:45:25 -0300 Subject: [PATCH 10/12] TST: update with duplicate frame index #55509 --- pandas/tests/frame/methods/test_update.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 57d2f6e6c2c57..b10c1b76670eb 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -209,7 +209,15 @@ def test_update_preserve_dtype(self, value_df, value_other, dtype): def test_update_raises_on_duplicate_argument_index(self): # GH#55509 - df = DataFrame({"a": [True, True]}, index=[1, 2]) - other = DataFrame({"a": [False, False]}, index=[1, 1]) + df = DataFrame({"a": [1, 1]}, index=[1, 2]) + other = DataFrame({"a": [2, 3]}, index=[1, 1]) with pytest.raises(ValueError, match="duplicate index"): df.update(other) + + def test_update_on_duplicate_frame_unique_argument_index(self): + # GH#55509 + df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2]) + other = DataFrame({"a": [2, 3]}, index=[1,2]) + expected = DataFrame({"a": [2,2,3]}, index=[1, 1, 2]) + df.update(other) + tm.assert_frame_equal(df, expected) From a1786ba9a210b0a88b5d2857381e895142424fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Wed, 8 Nov 2023 21:51:01 -0300 Subject: [PATCH 11/12] BUG: update with duplicate frame index #55509 --- pandas/core/frame.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 35e3de38f2105..4442c7dbfc09c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8858,23 +8858,27 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - indexes_intersection = self.index.intersection(other.index) + indexes_intersection = other.index.intersection(self.index) # order is important if not len(indexes_intersection): raise ValueError( - "Can't update dataframe when other has no element in common." + "Can't update dataframe when other has no index in common with this dataframe." + ) + + if other.index.is_unique: + indexes_this = indexes_intersection + if self.index.is_unique: + indexes_that = indexes_intersection + else: + full_indexes_this = self.index.take(self.index.get_indexer_for(indexes_intersection)) + indexes_that = indexes_intersection.reindex(full_indexes_this)[0] + else: + raise ValueError( + "Update not allowed with duplicate indexes on other." ) for col in self.columns.intersection(other.columns): - this = self.loc[indexes_intersection, col] - that = other.loc[indexes_intersection, col] - - if this.index.has_duplicates or that.index.has_duplicates: - raise ValueError( - "Update not allowed with duplicate indexes on dataframe or other." - ) - - this = this._values - that = that._values + this = self.loc[indexes_this, col]._values + that = other.loc[indexes_that, col]._values if filter_func is not None: mask = ~filter_func(this) | isna(that) From 79af49462424c1c46530a3cee8eac936b63eb33b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Wed, 8 Nov 2023 22:48:16 -0300 Subject: [PATCH 12/12] pre-commit --- pandas/core/frame.py | 21 ++++++++++++--------- pandas/tests/frame/methods/test_update.py | 4 ++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4442c7dbfc09c..d7a28539f79b0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8858,23 +8858,26 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - indexes_intersection = other.index.intersection(self.index) # order is important + indexes_intersection = other.index.intersection( + self.index + ) # order is important if not len(indexes_intersection): raise ValueError( - "Can't update dataframe when other has no index in common with this dataframe." + "Can't update dataframe when other has no index in common with " + "this dataframe." ) - + if other.index.is_unique: indexes_this = indexes_intersection if self.index.is_unique: indexes_that = indexes_intersection - else: - full_indexes_this = self.index.take(self.index.get_indexer_for(indexes_intersection)) + else: + full_indexes_this = self.index.take( + self.index.get_indexer_for(indexes_intersection) + ) indexes_that = indexes_intersection.reindex(full_indexes_this)[0] - else: - raise ValueError( - "Update not allowed with duplicate indexes on other." - ) + else: + raise ValueError("Update not allowed with duplicate indexes on other.") for col in self.columns.intersection(other.columns): this = self.loc[indexes_this, col]._values diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index b10c1b76670eb..ed86ce6ca450d 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -217,7 +217,7 @@ def test_update_raises_on_duplicate_argument_index(self): def test_update_on_duplicate_frame_unique_argument_index(self): # GH#55509 df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2]) - other = DataFrame({"a": [2, 3]}, index=[1,2]) - expected = DataFrame({"a": [2,2,3]}, index=[1, 1, 2]) + other = DataFrame({"a": [2, 3]}, index=[1, 2]) + expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2]) df.update(other) tm.assert_frame_equal(df, expected)