Skip to content

ENH: Index[bool] #45061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Feb 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
7ef2dba
ENH/WIP: Index[bool]
jbrockmendel Dec 24, 2021
f912217
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 28, 2021
f15b927
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 28, 2021
5142a2d
failing tests
jbrockmendel Dec 28, 2021
a30a485
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 29, 2021
0db360a
mypy fixup
jbrockmendel Dec 29, 2021
84faf3e
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 29, 2021
5a1b1f0
Merge branch 'master' into enh-bool-index
jbrockmendel Dec 30, 2021
b67bab1
xfail
jbrockmendel Dec 30, 2021
89a1cab
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 1, 2022
3b2f6e3
fix Boolean GroupBy tests
jbrockmendel Jan 2, 2022
1d28d25
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 7, 2022
8cf7dfe
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 7, 2022
298757c
document value_counts behavior, skips and xfails
jbrockmendel Jan 7, 2022
33ca864
Merge branch 'master' into enh-bool-index
jbrockmendel Jan 12, 2022
16d5202
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 14, 2022
f2c67a9
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 15, 2022
7f2edc3
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 26, 2022
ad59ca1
post-merge cleanup
jbrockmendel Jan 26, 2022
dc347b8
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 26, 2022
44a49af
fix should_compare
jbrockmendel Jan 26, 2022
a2b4de5
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 29, 2022
9b57f13
whatsnew
jbrockmendel Jan 29, 2022
c05a968
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 29, 2022
26105c0
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 30, 2022
04ede77
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 30, 2022
941986c
patch IntervalIndex._can_hold_na
jbrockmendel Jan 30, 2022
5ba5a47
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 31, 2022
3dd23bb
remove duplicate line
jbrockmendel Jan 31, 2022
ce19666
Merge branch 'main' into enh-bool-index
jbrockmendel Jan 31, 2022
7924be4
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
589a72a
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
5b46f11
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 1, 2022
8c91b1c
fix _check_dtype for BoolEngine
jbrockmendel Feb 1, 2022
0b73eba
Merge branch 'main' into enh-bool-index
jbrockmendel Feb 2, 2022
c46d3a3
remove no-longer-needed
jbrockmendel Feb 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Other enhancements
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
- Implemented a ``bool``-dtype :class:`Index`. Passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
-

.. ---------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/index.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class ObjectEngine(IndexEngine): ...
class DatetimeEngine(Int64Engine): ...
class TimedeltaEngine(DatetimeEngine): ...
class PeriodEngine(Int64Engine): ...
class BoolEngine(UInt8Engine): ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,13 @@ cdef class BaseMultiIndexCodesEngine:
include "index_class_helper.pxi"


# Engine subclass of UInt8Engine; Index._engine returns it when the target
# values have bool dtype.
cdef class BoolEngine(UInt8Engine):
# Validate a lookup key before it is used against the uint8-backed engine.
# Raises KeyError(val) when util.is_bool_object(val) is false.
# NOTE(review): presumably this is what keeps integer keys such as 0/1 from
# matching False/True -- confirm against util.is_bool_object.
cdef _check_type(self, object val):
if not util.is_bool_object(val):
raise KeyError(val)
# The key is a bool object here; hand it back cast to uint8_t.
return <uint8_t>val


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
Expand Down
5 changes: 3 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,8 @@ def _create_mi_with_dt64tz_level():
"num_uint8": tm.makeNumericIndex(100, dtype="uint8"),
"num_float64": tm.makeNumericIndex(100, dtype="float64"),
"num_float32": tm.makeNumericIndex(100, dtype="float32"),
"bool": tm.makeBoolIndex(10),
"bool-object": tm.makeBoolIndex(10).astype(object),
"bool-dtype": Index(np.random.randn(10) < 0),
"categorical": tm.makeCategoricalIndex(100),
"interval": tm.makeIntervalIndex(100),
"empty": Index([]),
Expand Down Expand Up @@ -630,7 +631,7 @@ def index_flat_unique(request):
key
for key in indices_dict
if not (
key in ["int", "uint", "range", "empty", "repeats"]
key in ["int", "uint", "range", "empty", "repeats", "bool-dtype"]
or key.startswith("num_")
)
and not isinstance(indices_dict[key], MultiIndex)
Expand Down
16 changes: 11 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,6 @@ def _reconstruct_data(
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)

# we only support object dtypes bool Index
if isinstance(original, ABCIndex):
values = values.astype(object, copy=False)
elif dtype is not None:
if is_datetime64_dtype(dtype):
dtype = np.dtype("datetime64[ns]")
Expand Down Expand Up @@ -830,7 +827,10 @@ def value_counts(
-------
Series
"""
from pandas.core.series import Series
from pandas import (
Index,
Series,
)

name = getattr(values, "name", None)

Expand Down Expand Up @@ -868,7 +868,13 @@ def value_counts(
else:
keys, counts = value_counts_arraylike(values, dropna)

result = Series(counts, index=keys, name=name)
# For backwards compatibility, we let Index do its normal type
# inference, _except_ if it infers from object to bool.
idx = Index._with_infer(keys)
if idx.dtype == bool and keys.dtype == object:
idx = idx.astype(object)

result = Series(counts, index=idx, name=name)

if sort:
result = result.sort_values(ascending=ascending)
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,10 @@ def __new__(
if data.dtype.kind in ["i", "u", "f"]:
# maybe coerce to a sub-class
arr = data
elif data.dtype.kind == "b":
# No special subclass, and Index._ensure_array won't do this
# for us.
arr = np.asarray(data)
else:
arr = com.asarray_tuplesafe(data, dtype=_dtype_obj)

Expand Down Expand Up @@ -702,7 +706,7 @@ def _with_infer(cls, *args, **kwargs):
# "Union[ExtensionArray, ndarray[Any, Any]]"; expected
# "ndarray[Any, Any]"
values = lib.maybe_convert_objects(result._values) # type: ignore[arg-type]
if values.dtype.kind in ["i", "u", "f"]:
if values.dtype.kind in ["i", "u", "f", "b"]:
return Index(values, name=result.name)

return result
Expand Down Expand Up @@ -872,9 +876,12 @@ def _engine(
):
return libindex.ExtensionEngine(target_values)

target_values = cast(np.ndarray, target_values)
# to avoid a reference cycle, bind `target_values` to a local variable, so
# `self` is not passed into the lambda.
target_values = cast(np.ndarray, target_values)
if target_values.dtype == bool:
return libindex.BoolEngine(target_values)

# error: Argument 1 to "ExtensionEngine" has incompatible type
# "ndarray[Any, Any]"; expected "ExtensionArray"
return self._engine_type(target_values) # type:ignore[arg-type]
Expand Down Expand Up @@ -2680,7 +2687,6 @@ def _is_all_dates(self) -> bool:
"""
Whether or not the index values only consist of dates.
"""

if needs_i8_conversion(self.dtype):
return True
elif self.dtype != _dtype_obj:
Expand Down Expand Up @@ -7302,7 +7308,7 @@ def _maybe_cast_data_without_dtype(
FutureWarning,
stacklevel=3,
)
if result.dtype.kind in ["b", "c"]:
if result.dtype.kind in ["c"]:
return subarr
result = ensure_wrapped_if_datetimelike(result)
return result
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,8 @@ def to_datetime(
result = convert_listlike(arg, format)
else:
result = convert_listlike(np.array([arg]), format)[0]
if isinstance(arg, bool) and isinstance(result, np.bool_):
result = bool(result) # TODO: avoid this kludge.

# error: Incompatible return value type (got "Union[Timestamp, NaTType,
# Series, Index]", expected "Union[DatetimeIndex, Series, float, str,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def _hash_ndarray(

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif isinstance(dtype, bool):
elif dtype == bool:
vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view("i8").astype("u8", copy=False)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def test_value_counts_with_nan(dropna, index_or_series):
obj = klass(values)
res = obj.value_counts(dropna=dropna)
if dropna is True:
expected = Series([1], index=[True])
expected = Series([1], index=Index([True], dtype=obj.dtype))
else:
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
tm.assert_series_equal(res, expected)
5 changes: 5 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def test_ensure_copied_data(self, index):
# RangeIndex cannot be initialized from data
# MultiIndex and CategoricalIndex are tested separately
return
elif index.dtype == object and index.inferred_type == "boolean":
init_kwargs["dtype"] = index.dtype

index_type = type(index)
result = index_type(index.values, copy=True, **init_kwargs)
Expand Down Expand Up @@ -522,6 +524,9 @@ def test_fillna(self, index):
# GH 11343
if len(index) == 0:
return
elif index.dtype == bool:
# can't hold NAs
return
elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
return
elif isinstance(index, MultiIndex):
Expand Down
19 changes: 14 additions & 5 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,13 +621,22 @@ def test_get_loc_implicit_cast(self, level, dtypes):
idx = MultiIndex.from_product(levels)
assert idx.get_loc(tuple(key)) == 3

def test_get_loc_cast_bool(self):
# GH 19086 : int is casted to bool, but not vice-versa
levels = [[False, True], np.arange(2, dtype="int64")]
@pytest.mark.parametrize("dtype", [bool, object])
def test_get_loc_cast_bool(self, dtype):
# GH 19086 : int is cast to bool, but not vice-versa (for object dtype)
# With bool dtype, we don't cast in either direction.
levels = [Index([False, True], dtype=dtype), np.arange(2, dtype="int64")]
idx = MultiIndex.from_product(levels)

assert idx.get_loc((0, 1)) == 1
assert idx.get_loc((1, 0)) == 2
if dtype is bool:
with pytest.raises(KeyError, match=r"^\(0, 1\)$"):
assert idx.get_loc((0, 1)) == 1
with pytest.raises(KeyError, match=r"^\(1, 0\)$"):
assert idx.get_loc((1, 0)) == 2
else:
# We use python object comparisons, which treat 0 == False and 1 == True
assert idx.get_loc((0, 1)) == 1
assert idx.get_loc((1, 0)) == 2

with pytest.raises(KeyError, match=r"^\(False, True\)$"):
idx.get_loc((False, True))
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/indexes/test_any_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def test_mutability(index):
def test_map_identity_mapping(index):
# GH#12766
result = index.map(lambda x: x)
if index.dtype == object and result.dtype == bool:
assert (index == result).all()
# TODO: could work that into the 'exact="equiv"'?
return # FIXME: doesn't belong in this file anymore!
tm.assert_index_equal(result, index, exact="equiv")


Expand Down
27 changes: 18 additions & 9 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,15 +321,21 @@ def test_view_with_args(self, index):
"unicode",
"string",
pytest.param("categorical", marks=pytest.mark.xfail(reason="gh-25464")),
"bool",
"bool-object",
"bool-dtype",
"empty",
],
indirect=True,
)
def test_view_with_args_object_array_raises(self, index):
msg = "Cannot change data-type for object array"
with pytest.raises(TypeError, match=msg):
index.view("i8")
if index.dtype == bool:
msg = "When changing to a larger dtype"
with pytest.raises(ValueError, match=msg):
index.view("i8")
else:
msg = "Cannot change data-type for object array"
with pytest.raises(TypeError, match=msg):
index.view("i8")

@pytest.mark.parametrize("index", ["int", "range"], indirect=True)
def test_astype(self, index):
Expand Down Expand Up @@ -397,9 +403,9 @@ def test_is_(self):

def test_asof_numeric_vs_bool_raises(self):
left = Index([1, 2, 3])
right = Index([True, False])
right = Index([True, False], dtype=object)

msg = "Cannot compare dtypes int64 and object"
msg = "Cannot compare dtypes int64 and bool"
with pytest.raises(TypeError, match=msg):
left.asof(right[0])
# TODO: should right.asof(left[0]) also raise?
Expand Down Expand Up @@ -591,7 +597,8 @@ def test_append_empty_preserve_name(self, name, expected):
"index, expected",
[
("string", False),
("bool", False),
("bool-object", False),
("bool-dtype", False),
("categorical", False),
("int", True),
("datetime", False),
Expand All @@ -606,7 +613,8 @@ def test_is_numeric(self, index, expected):
"index, expected",
[
("string", True),
("bool", True),
("bool-object", True),
("bool-dtype", False),
("categorical", False),
("int", False),
("datetime", False),
Expand All @@ -621,7 +629,8 @@ def test_is_object(self, index, expected):
"index, expected",
[
("string", False),
("bool", False),
("bool-object", False),
("bool-dtype", False),
("categorical", False),
("int", False),
("datetime", True),
Expand Down
7 changes: 6 additions & 1 deletion pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,9 @@ def test_constructor_non_hashable_name(self, index_flat):

def test_constructor_unwraps_index(self, index_flat):
a = index_flat
b = type(a)(a)
# Passing dtype is necessary for Index([True, False], dtype=object)
# case.
b = type(a)(a, dtype=a.dtype)
tm.assert_equal(a._data, b._data)

def test_to_flat_index(self, index_flat):
Expand Down Expand Up @@ -426,6 +428,9 @@ def test_hasnans_isnans(self, index_flat):
return
elif isinstance(index, NumericIndex) and is_integer_dtype(index.dtype):
return
elif index.dtype == bool:
# values[1] = np.nan below casts to True!
return

values[1] = np.nan

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_index_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_constructor_dtypes_to_object(self, cast_index, vals):
index = Index(vals)

assert type(index) is Index
assert index.dtype == object
assert index.dtype == bool

def test_constructor_categorical_to_object(self):
# GH#32167 Categorical data and dtype=object should return object-dtype
Expand Down
14 changes: 9 additions & 5 deletions pandas/tests/indexes/test_numpy_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,18 @@ def test_numpy_ufuncs_basic(index, func):
with tm.external_error_raised((TypeError, AttributeError)):
with np.errstate(all="ignore"):
func(index)
elif isinstance(index, NumericIndex) or (
not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric
elif (
isinstance(index, NumericIndex)
or (not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric)
or index.dtype == bool
):
# coerces to float (e.g. np.sin)
with np.errstate(all="ignore"):
result = func(index)
exp = Index(func(index.values), name=index.name)

tm.assert_index_equal(result, exp)
if type(index) is not Index:
if type(index) is not Index or index.dtype == bool:
# i.e. NumericIndex or bool-dtype Index
assert isinstance(result, Float64Index)
else:
Expand Down Expand Up @@ -117,8 +119,10 @@ def test_numpy_ufuncs_other(index, func):
with tm.external_error_raised(TypeError):
func(index)

elif isinstance(index, NumericIndex) or (
not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric
elif (
isinstance(index, NumericIndex)
or (not isinstance(index.dtype, np.dtype) and index.dtype._is_numeric)
or index.dtype == bool
):
# Results in bool array
result = func(index)
Expand Down
19 changes: 14 additions & 5 deletions pandas/tests/indexes/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest

from pandas.core.dtypes.cast import find_common_type
from pandas.core.dtypes.common import is_dtype_equal

from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -55,14 +54,20 @@ def test_union_different_types(index_flat, index_flat2, request):

if (
not idx1.is_unique
and not idx2.is_unique
and not idx2.is_monotonic_decreasing
and idx1.dtype.kind == "i"
and is_dtype_equal(idx2.dtype, "boolean")
and idx2.dtype.kind == "b"
) or (
not idx2.is_unique
and not idx1.is_unique
and not idx1.is_monotonic_decreasing
and idx2.dtype.kind == "i"
and is_dtype_equal(idx1.dtype, "boolean")
and idx1.dtype.kind == "b"
):
mark = pytest.mark.xfail(reason="GH#44000 True==1", raises=ValueError)
mark = pytest.mark.xfail(
reason="GH#44000 True==1", raises=ValueError, strict=False
)
request.node.add_marker(mark)

common_dtype = find_common_type([idx1.dtype, idx2.dtype])
Expand Down Expand Up @@ -231,7 +236,11 @@ def test_union_base(self, index):
def test_difference_base(self, sort, index):
first = index[2:]
second = index[:4]
if isinstance(index, CategoricalIndex) or index.is_boolean():
if index.is_boolean():
# I think (TODO: be sure) there are assumptions baked in about
# the index fixture that don't hold here?
answer = set(first).difference(set(second))
elif isinstance(index, CategoricalIndex):
answer = []
else:
answer = index[4:]
Expand Down
Loading