Skip to content

Commit

Permalink
ENH/PERF: use mask in factorize for nullable dtypes (#33064)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored May 9, 2020
1 parent f21bc99 commit 9ed015f
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 50 deletions.
17 changes: 14 additions & 3 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,16 @@ class Factorize:
params = [
[True, False],
[True, False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
[
"int",
"uint",
"float",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
"Int64",
"boolean",
],
]
param_names = ["unique", "sort", "dtype"]

Expand All @@ -49,13 +58,15 @@ def setup(self, unique, sort, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data
self.data = data

def time_factorize(self, unique, sort, dtype):
self.idx.factorize(sort=sort)
pd.factorize(self.data, sort=sort)


class Duplicated:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,7 @@ Performance improvements
sparse values from ``scipy.sparse`` matrices using the
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).
- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`).


Expand Down
35 changes: 28 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ cdef class {{name}}HashTable(HashTable):
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
object mask=None, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -388,6 +388,10 @@ cdef class {{name}}HashTable(HashTable):
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
return_inverse : boolean, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
Expand All @@ -406,12 +410,17 @@ cdef class {{name}}HashTable(HashTable):
{{dtype}}_t val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value
bint use_na_value, use_mask
uint8_t[:] mask_values

if return_inverse:
labels = np.empty(n, dtype=np.int64)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None

if use_mask:
mask_values = mask.view("uint8")

if use_na_value:
# We need this na_value2 because we want to allow users
Expand All @@ -427,7 +436,11 @@ cdef class {{name}}HashTable(HashTable):
for i in range(n):
val = values[i]

if ignore_na and (
if ignore_na and use_mask:
if mask_values[i]:
labels[i] = na_sentinel
continue
elif ignore_na and (
{{if not name.lower().startswith(("uint", "int"))}}
val != val or
{{endif}}
Expand Down Expand Up @@ -491,7 +504,7 @@ cdef class {{name}}HashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -509,6 +522,10 @@ cdef class {{name}}HashTable(HashTable):
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".

Returns
-------
Expand All @@ -519,7 +536,7 @@ cdef class {{name}}HashTable(HashTable):
"""
uniques_vector = {{name}}Vector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
na_value=na_value, ignore_na=True, mask=mask,
return_inverse=True)

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Expand Down Expand Up @@ -852,7 +869,7 @@ cdef class StringHashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -870,6 +887,8 @@ cdef class StringHashTable(HashTable):
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable.

Returns
-------
Expand Down Expand Up @@ -1091,7 +1110,7 @@ cdef class PyObjectHashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

Expand All @@ -1109,6 +1128,8 @@ cdef class PyObjectHashTable(HashTable):
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable.

Returns
-------
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:


def _factorize_array(
values, na_sentinel: int = -1, size_hint=None, na_value=None
values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Factorize an array-like to codes and uniques.
Expand All @@ -479,6 +479,10 @@ def _factorize_array(
parameter when you know that you don't have any values pandas would
consider missing in the array (NaN for float data, iNaT for
datetimes, etc.).
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
Returns
-------
Expand All @@ -488,7 +492,9 @@ def _factorize_array(
hash_klass, values = _get_data_algo(values)

table = hash_klass(size_hint or len(values))
uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
uniques, codes = table.factorize(
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
)

codes = ensure_platform_int(codes)
return codes, uniques
Expand Down
29 changes: 6 additions & 23 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from pandas.compat import set_function_name
from pandas.compat.numpy import function as nv

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
Expand All @@ -30,14 +29,14 @@
from pandas.core.array_algos import masked_reductions
from pandas.core.indexers import check_array_indexer

from .masked import BaseMaskedArray
from .masked import BaseMaskedArray, BaseMaskedDtype

if TYPE_CHECKING:
import pyarrow # noqa: F401


@register_extension_dtype
class BooleanDtype(ExtensionDtype):
class BooleanDtype(BaseMaskedDtype):
"""
Extension dtype for boolean data.
Expand All @@ -64,17 +63,6 @@ class BooleanDtype(ExtensionDtype):

name = "boolean"

@property
def na_value(self) -> libmissing.NAType:
"""
BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
.. warning::
`na_value` may change in a future release.
"""
return libmissing.NA

@property
def type(self) -> Type[np.bool_]:
return np.bool_
Expand All @@ -83,6 +71,10 @@ def type(self) -> Type[np.bool_]:
def kind(self) -> str:
return "b"

@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")

@classmethod
def construct_array_type(cls) -> Type["BooleanArray"]:
"""
Expand Down Expand Up @@ -304,15 +296,6 @@ def map_string(s):
scalars = [map_string(x) for x in strings]
return cls._from_sequence(scalars, dtype, copy)

def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
data = self._data.astype("int8")
data[self._mask] = -1
return data, -1

@classmethod
def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
return cls._from_sequence(values, dtype=original.dtype)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
Expand Down
15 changes: 2 additions & 13 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from pandas.compat.numpy import function as nv
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.common import (
is_bool_dtype,
Expand All @@ -34,13 +33,13 @@
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric

from .masked import BaseMaskedArray
from .masked import BaseMaskedArray, BaseMaskedDtype

if TYPE_CHECKING:
import pyarrow # noqa: F401


class _IntegerDtype(ExtensionDtype):
class _IntegerDtype(BaseMaskedDtype):
"""
An ExtensionDtype to hold a single size & kind of integer dtype.
Expand All @@ -53,7 +52,6 @@ class _IntegerDtype(ExtensionDtype):
name: str
base = None
type: Type
na_value = libmissing.NA

def __repr__(self) -> str:
sign = "U" if self.is_unsigned_integer else ""
Expand Down Expand Up @@ -372,10 +370,6 @@ def _from_sequence_of_strings(
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype, copy)

@classmethod
def _from_factorized(cls, values, original) -> "IntegerArray":
return integer_array(values, dtype=original.dtype)

_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
Expand Down Expand Up @@ -485,11 +479,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
data = self.to_numpy(dtype=dtype, **kwargs)
return astype_nansafe(data, dtype, copy=False)

def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
# TODO: https://github.com/pandas-dev/pandas/issues/30037
# use masked algorithms, rather than object-dtype / np.nan.
return self.to_numpy(na_value=np.nan), np.nan

def _values_for_argsort(self) -> np.ndarray:
"""
Return values for sorting.
Expand Down
35 changes: 33 additions & 2 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
from typing import TYPE_CHECKING, Optional, Type, TypeVar
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
from pandas.core.dtypes.missing import isna, notna

from pandas.core.algorithms import take
from pandas.core.algorithms import _factorize_array, take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.indexers import check_array_indexer

Expand All @@ -19,6 +22,18 @@
BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")


class BaseMaskedDtype(ExtensionDtype):
    """
    Base class for dtypes for BaseMaskedArray subclasses.
    """

    # Scalar used to represent a missing value for every masked dtype.
    na_value = libmissing.NA

    @property
    def numpy_dtype(self) -> np.dtype:
        """
        Return the numpy dtype associated with this extension dtype.

        This base implementation is abstract: subclasses must override it.

        Raises
        ------
        AbstractMethodError
            Always, when not overridden by a subclass.
        """
        # AbstractMethodError must be instantiated with the offending object;
        # raising the bare class would fail with a TypeError instead.
        raise AbstractMethodError(self)


class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin):
"""
Base class for masked arrays (which use _data and _mask to store the data).
Expand Down Expand Up @@ -48,6 +63,10 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
self._data = values
self._mask = mask

@property
def dtype(self) -> BaseMaskedDtype:
    """
    Return the dtype of this array.

    This base implementation only raises AbstractMethodError, so it has to
    be overridden to be usable.
    """
    raise AbstractMethodError(self)

def __getitem__(self, item):
if is_integer(item):
if self._mask[item]:
Expand Down Expand Up @@ -228,6 +247,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
mask = mask.copy()
return type(self)(data, mask, copy=False)

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
    # Missing entries are flagged through the mask, so the raw data buffer is
    # handed to the hashtable as-is.
    codes, distinct = _factorize_array(
        self._data, na_sentinel=na_sentinel, mask=self._mask
    )

    # the hashtables don't handle all different types of bits, so cast the
    # uniques back to this array's own numpy dtype
    distinct = distinct.astype(self.dtype.numpy_dtype, copy=False)
    # an all-False mask: none of the uniques is marked as missing
    all_valid = np.zeros(len(distinct), dtype=bool)
    return codes, type(self)(distinct, all_valid)

def value_counts(self, dropna: bool = True) -> "Series":
"""
Returns a Series containing counts of each unique value.
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):

tm.assert_numpy_array_equal(codes_1, codes_2)
self.assert_extension_array_equal(uniques_1, uniques_2)
assert len(uniques_1) == len(pd.unique(uniques_1))
assert uniques_1.dtype == data_for_grouping.dtype

def test_factorize_empty(self, data):
codes, uniques = pd.factorize(data[:0])
Expand Down

0 comments on commit 9ed015f

Please sign in to comment.