From 7eee352575354bfdc7195dcfc54f778bfefa39d9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 1 Feb 2021 17:48:56 +0100 Subject: [PATCH 1/3] [ArrayManager] BUG: fix indexing with non-aligned boolean dataframe --- .github/workflows/ci.yml | 1 + pandas/core/frame.py | 4 ++++ pandas/core/generic.py | 15 +++++++++++++++ pandas/tests/frame/methods/test_fillna.py | 11 +++++++++++ 4 files changed, 31 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b551e7ded0178..46b8e99f95fde 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,3 +157,4 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 82e984d36b6a1..a7b08f65a4976 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -630,6 +630,10 @@ def _as_manager(self, typ: str) -> DataFrame: # fastpath of passing a manager doesn't check the option/manager class return DataFrame(new_mgr) + @property + def _has_array_manager(self): + return isinstance(self._mgr, ArrayManager) + # ---------------------------------------------------------------------- @property diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 96b35f1aaab9c..cfdb9e1632dea 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8817,6 +8817,11 @@ def _where( if axis is not None: axis = self._get_axis_number(axis) + # Needed for DataFrames with ArrayManager, see below for details + all_bool_columns = False + if isinstance(cond, ABCDataFrame) and cond._has_array_manager: + all_bool_columns = all(is_bool_dtype(dt) for dt in cond.dtypes) + # align the cond to same shape as myself cond = com.apply_if_callable(cond, self) if isinstance(cond, NDFrame): @@ -8832,6 +8837,16 @@ def _where( fill_value = bool(inplace) cond = cond.fillna(fill_value) + # With ArrayManager, `fillna` does not automatically change object dtype + # back to bools (if the alignment made it object by introducing NaNs). + # So in this case we cast back to bool manually *if* the original columns + # before aligning were bool + # TODO this workaround can be removed once we have nullable boolean dtype + # as default + if isinstance(cond, ABCDataFrame) and cond._has_array_manager: + if not all(is_bool_dtype(dt) for dt in cond.dtypes) and all_bool_columns: + cond = cond.astype(bool) + msg = "Boolean array expected for the condition, not {dtype}" if not cond.empty: diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index 58016be82c405..035f31a29c599 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -544,3 +544,14 @@ def test_fillna_nonconsolidated_frame(): df_nonconsol = df.pivot("i1", "i2") result = df_nonconsol.fillna(0) assert result.isna().sum().sum() == 0 + + +def test_fillna_infer_bool_dtype(using_array_manager): + # With ArrayManager, fillna doesn't change/infer dtypes + df = DataFrame([[True, False], [np.nan, True], [False, None]], dtype=object) + result = df.fillna(True) + if using_array_manager: + expected = DataFrame([[True, False], [True, True], [False, True]], dtype=object) + else: + expected = DataFrame([[True, False], [True, True], [False, True]], dtype=bool) + tm.assert_frame_equal(result, expected) From 2476591c5a593e9c12c4c9bdadf97dca8b68e346 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 3 Feb 2021 14:57:10 +0100 Subject: [PATCH 2/3] additional fixes for test_where.py --- .github/workflows/ci.yml | 1 + pandas/core/generic.py | 17 ++++++++++++++++- pandas/tests/frame/indexing/test_where.py | 9 ++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46b8e99f95fde..9cb5a3e17f527 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -158,3 +158,4 @@ jobs: source activate pandas-dev pytest pandas/tests/frame/methods --array-manager pytest pandas/tests/frame/indexing/test_indexing.py::TestDataFrameIndexing::test_setitem_boolean --array-manager + pytest pandas/tests/frame/indexing/test_where.py --array-manager diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cfdb9e1632dea..c82424898641d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8817,6 +8817,8 @@ def _where( if axis is not None: axis = self._get_axis_number(axis) + cond_orig = cond + # Needed for DataFrames with ArrayManager, see below for details all_bool_columns = False if isinstance(cond, ABCDataFrame) and cond._has_array_manager: @@ -8833,9 +8835,22 @@ def _where( raise ValueError("Array conditional must be same shape as self") cond = self._constructor(cond, **self._construct_axes_dict()) + # Needed for DataFrames with ArrayManager, see below for details + if ( + isinstance(cond, ABCDataFrame) + and cond._has_array_manager + and isinstance(cond_orig, ABCSeries) + ): + all_bool_columns = is_bool_dtype(cond_orig.dtype) + # make sure we are boolean fill_value = bool(inplace) - cond = cond.fillna(fill_value) + try: + cond = cond.fillna(fill_value) + except TypeError: + # With ArrayManager, fillna can raise an error if `cond` is not + # of boolean dtype + raise ValueError("Boolean array expected for the condition") # With ArrayManager, `fillna` does not automatically change object dtype # back to bools (if the alignment made it object by introducing NaNs). diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 2f098426efaf9..82934a81fbb42 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -159,7 +161,7 @@ def test_where_set(self, where_frame, float_string_frame): def _check_set(df, cond, check_dtypes=True): dfi = df.copy() - econd = cond.reindex_like(df).fillna(True) + econd = cond.reindex_like(df).fillna(True).astype(bool) expected = dfi.mask(~econd) return_value = dfi.where(cond, np.nan, inplace=True) @@ -499,6 +501,9 @@ def test_where_axis(self): assert return_value is None tm.assert_frame_equal(result, expected) + # TODO(ArrayManager) rewrite test to be valid + @td.skip_array_manager_invalid_test + def test_where_axis_multiple_dtypes(self): # Multiple dtypes (=> multiple Blocks) df = pd.concat( [ @@ -643,6 +648,8 @@ def test_df_where_with_category(self, kwargs): tm.assert_series_equal(result, expected) + # TODO(ArrayManager) rewrite test to be valid + @td.skip_array_manager_invalid_test def test_where_categorical_filtering(self): # GH#22609 Verify filtering operations on DataFrames with categorical Series df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) From 024d5b83a79ad2bd5a14df34c72d5032f577f78d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Feb 2021 17:04:10 +0100 Subject: [PATCH 3/3] fix mixed numeric case --- pandas/core/internals/array_manager.py | 2 +- pandas/tests/frame/indexing/test_where.py | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0f677ff3180be..40ce563f86162 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -454,7 +454,7 @@ def is_mixed_type(self) -> bool: @property def is_numeric_mixed_type(self) -> bool: - return False + return all(is_numeric_dtype(t) for t in self.get_dtypes()) @property def any_extension_types(self) -> bool: diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 82934a81fbb42..e4b161f9240ff 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -501,8 +499,6 @@ def test_where_axis(self): assert return_value is None tm.assert_frame_equal(result, expected) - # TODO(ArrayManager) rewrite test to be valid - @td.skip_array_manager_invalid_test def test_where_axis_multiple_dtypes(self): # Multiple dtypes (=> multiple Blocks) df = pd.concat( @@ -648,8 +644,6 @@ def test_df_where_with_category(self, kwargs): tm.assert_series_equal(result, expected) - # TODO(ArrayManager) rewrite test to be valid - @td.skip_array_manager_invalid_test def test_where_categorical_filtering(self): # GH#22609 Verify filtering operations on DataFrames with categorical Series df = DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])