Skip to content

Commit 89bc0aa

Browse files
jorisvandenbossche and jreback
authored and committed
REF: Implement BaseMaskedArray class for integer/boolean ExtensionArrays (#30789)
1 parent a73ce98 commit 89bc0aa

File tree

3 files changed

+215
-303
lines changed

3 files changed

+215
-303
lines changed

pandas/core/arrays/boolean.py

+6-183
Original file line number | Diff line number | Diff line change
@@ -15,24 +15,19 @@
1515
is_extension_array_dtype,
1616
is_float,
1717
is_float_dtype,
18-
is_integer,
1918
is_integer_dtype,
2019
is_list_like,
2120
is_numeric_dtype,
22-
is_object_dtype,
2321
is_scalar,
24-
is_string_dtype,
2522
pandas_dtype,
2623
)
2724
from pandas.core.dtypes.dtypes import register_extension_dtype
2825
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
2926
from pandas.core.dtypes.missing import isna, notna
3027

3128
from pandas.core import nanops, ops
32-
from pandas.core.algorithms import take
33-
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
34-
import pandas.core.common as com
35-
from pandas.core.indexers import check_bool_array_indexer
29+
30+
from .masked import BaseMaskedArray
3631

3732
if TYPE_CHECKING:
3833
from pandas._typing import Scalar
@@ -199,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False):
199194
return values, mask
200195

201196

202-
class BooleanArray(ExtensionArray, ExtensionOpsMixin):
197+
class BooleanArray(BaseMaskedArray):
203198
"""
204199
Array of boolean (True/False) data with missing values.
205200
@@ -253,6 +248,9 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
253248
Length: 3, dtype: boolean
254249
"""
255250

251+
# The value used to fill '_data' to avoid upcasting
252+
_internal_fill_value = False
253+
256254
def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
257255
if not (isinstance(values, np.ndarray) and values.dtype == np.bool_):
258256
raise TypeError(
@@ -297,127 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
297295
def _from_factorized(cls, values, original: "BooleanArray"):
298296
return cls._from_sequence(values, dtype=original.dtype)
299297

300-
def _formatter(self, boxed=False):
301-
return str
302-
303-
@property
304-
def _hasna(self) -> bool:
305-
# Note: this is expensive right now! The hope is that we can
306-
# make this faster by having an optional mask, but not have to change
307-
# source code using it..
308-
return self._mask.any()
309-
310-
def __getitem__(self, item):
311-
if is_integer(item):
312-
if self._mask[item]:
313-
return self.dtype.na_value
314-
return self._data[item]
315-
316-
elif com.is_bool_indexer(item):
317-
item = check_bool_array_indexer(self, item)
318-
319-
return type(self)(self._data[item], self._mask[item])
320-
321-
def to_numpy(
322-
self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default,
323-
):
324-
"""
325-
Convert to a NumPy Array.
326-
327-
By default converts to an object-dtype NumPy array. Specify the `dtype` and
328-
`na_value` keywords to customize the conversion.
329-
330-
Parameters
331-
----------
332-
dtype : dtype, default object
333-
The numpy dtype to convert to.
334-
copy : bool, default False
335-
Whether to ensure that the returned value is not a view on
336-
the array. Note that ``copy=False`` does not *ensure* that
337-
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
338-
a copy is made, even if not strictly necessary. This is typically
339-
only possible when no missing values are present and `dtype`
340-
is a boolean dtype.
341-
na_value : scalar, optional
342-
Scalar missing value indicator to use in numpy array. Defaults
343-
to the native missing value indicator of this array (pd.NA).
344-
345-
Returns
346-
-------
347-
numpy.ndarray
348-
349-
Examples
350-
--------
351-
An object-dtype is the default result
352-
353-
>>> a = pd.array([True, False], dtype="boolean")
354-
>>> a.to_numpy()
355-
array([True, False], dtype=object)
356-
357-
When no missing values are present, a boolean dtype can be used.
358-
359-
>>> a.to_numpy(dtype="bool")
360-
array([ True, False])
361-
362-
However, requesting a bool dtype will raise a ValueError if
363-
missing values are present and the default missing value :attr:`NA`
364-
is used.
365-
366-
>>> a = pd.array([True, False, pd.NA], dtype="boolean")
367-
>>> a
368-
<BooleanArray>
369-
[True, False, NA]
370-
Length: 3, dtype: boolean
371-
372-
>>> a.to_numpy(dtype="bool")
373-
Traceback (most recent call last):
374-
...
375-
ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.
376-
377-
Specify a valid `na_value` instead
378-
379-
>>> a.to_numpy(dtype="bool", na_value=False)
380-
array([ True, False, False])
381-
"""
382-
if na_value is lib.no_default:
383-
na_value = libmissing.NA
384-
if dtype is None:
385-
dtype = object
386-
if self._hasna:
387-
if (
388-
not (is_object_dtype(dtype) or is_string_dtype(dtype))
389-
and na_value is libmissing.NA
390-
):
391-
raise ValueError(
392-
f"cannot convert to '{dtype}'-dtype NumPy array "
393-
"with missing values. Specify an appropriate 'na_value' "
394-
"for this dtype."
395-
)
396-
# don't pass copy to astype -> always need a copy since we are mutating
397-
data = self._data.astype(dtype)
398-
data[self._mask] = na_value
399-
else:
400-
data = self._data.astype(dtype, copy=copy)
401-
return data
402-
403-
__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
404-
405-
def __array__(self, dtype=None):
406-
"""
407-
the array interface, return my values
408-
We return an object array here to preserve our scalar values
409-
"""
410-
# by default (no dtype specified), return an object array
411-
return self.to_numpy(dtype=dtype)
412-
413-
def __arrow_array__(self, type=None):
414-
"""
415-
Convert myself into a pyarrow Array.
416-
"""
417-
import pyarrow as pa
418-
419-
return pa.array(self._data, mask=self._mask, type=type)
420-
421298
_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)
422299

423300
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
@@ -465,40 +342,6 @@ def reconstruct(x):
465342
else:
466343
return reconstruct(result)
467344

468-
def __iter__(self):
469-
for i in range(len(self)):
470-
if self._mask[i]:
471-
yield self.dtype.na_value
472-
else:
473-
yield self._data[i]
474-
475-
def take(self, indexer, allow_fill=False, fill_value=None):
476-
# we always fill with False internally
477-
# to avoid upcasting
478-
data_fill_value = False if isna(fill_value) else fill_value
479-
result = take(
480-
self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
481-
)
482-
483-
mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
484-
485-
# if we are filling
486-
# we only fill where the indexer is null
487-
# not existing missing values
488-
# TODO(jreback) what if we have a non-na float as a fill value?
489-
if allow_fill and notna(fill_value):
490-
fill_mask = np.asarray(indexer) == -1
491-
result[fill_mask] = fill_value
492-
mask = mask ^ fill_mask
493-
494-
return type(self)(result, mask, copy=False)
495-
496-
def copy(self):
497-
data, mask = self._data, self._mask
498-
data = data.copy()
499-
mask = mask.copy()
500-
return type(self)(data, mask, copy=False)
501-
502345
def __setitem__(self, key, value):
503346
_is_scalar = is_scalar(value)
504347
if _is_scalar:
@@ -512,26 +355,6 @@ def __setitem__(self, key, value):
512355
self._data[key] = value
513356
self._mask[key] = mask
514357

515-
def __len__(self):
516-
return len(self._data)
517-
518-
@property
519-
def nbytes(self):
520-
return self._data.nbytes + self._mask.nbytes
521-
522-
def isna(self):
523-
return self._mask
524-
525-
@property
526-
def _na_value(self):
527-
return self._dtype.na_value
528-
529-
@classmethod
530-
def _concat_same_type(cls, to_concat):
531-
data = np.concatenate([x._data for x in to_concat])
532-
mask = np.concatenate([x._mask for x in to_concat])
533-
return cls(data, mask)
534-
535358
def astype(self, dtype, copy=True):
536359
"""
537360
Cast to a NumPy array or ExtensionArray with 'dtype'.

0 commit comments

Comments
 (0)