ENH/PERF: use mask in factorize for nullable dtypes #33064

Merged
Changes from 8 commits
21 changes: 19 additions & 2 deletions asv_bench/benchmarks/algorithms.py
@@ -34,7 +34,16 @@ class Factorize:
params = [
[True, False],
[True, False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
[
"int",
"uint",
"float",
"string",
"datetime64[ns]",
"datetime64[ns, tz]",
"Int64",
"boolean",
],
]
param_names = ["unique", "sort", "dtype"]

@@ -49,13 +58,21 @@ def setup(self, unique, sort, dtype):
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
"Int64": pd.array(np.arange(N), dtype="Int64"),
"boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data
if dtype in ("Int64", "boolean") and sort:
# sort is not a keyword on EAs
raise NotImplementedError

def time_factorize(self, unique, sort, dtype):
self.idx.factorize(sort=sort)
if sort:
Contributor: isn't this redundant? since sort is a parameter?

Member (author): ExtensionArrays don't support the sort keyword; the other values are Index objects, which do have that keyword. So the tests for sort=True are skipped above in case of idx being an EA.

Contributor: this is very confusing then. I would separate the EAs out to a separate asv.

Member (author):
> this is very confusing then. I would separate the EAs out to a separate asv.

Agreed this is confusing. But I switched to using the factorize function in the hope of making this clearer, and to keep a single benchmark (the Index method simply calls pd.factorize on itself, so this should benchmark the exact same thing). And that way, we can actually remove the skip for sort for EAs.

Member (author): @jreback updated. Much better now, I think (and fine for a single benchmark class/function).

self.idx.factorize(sort=sort)
else:
self.idx.factorize()


class Duplicated:
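The review discussion above notes that the `pd.factorize` function accepts `sort=True` for nullable extension arrays, while the EA `.factorize` method has no such keyword. A minimal sketch of that behavior (not part of the PR; values are illustrative):

```python
import pandas as pd

# pd.factorize works on nullable extension arrays, including with sort=True;
# NA is encoded with the default sentinel -1 and excluded from uniques.
arr = pd.array([2, 1, 2, pd.NA, 1], dtype="Int64")
codes, uniques = pd.factorize(arr, sort=True)
```

With `sort=True`, `uniques` is the sorted `Int64` array `[1, 2]` and `codes` maps each element accordingly, with `-1` for the missing slot.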
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -276,6 +276,7 @@ Performance improvements
:meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`,
:issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`).
- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`).
- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`).


.. ---------------------------------------------------------------------------
35 changes: 28 additions & 7 deletions pandas/_libs/hashtable_class_helper.pxi.in
@@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable):
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
object na_value=None, bint ignore_na=False,
bint return_inverse=False):
object mask=None, bint return_inverse=False):
"""
Calculate unique values and labels (no sorting!)

@@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable):
Whether NA-values should be ignored for calculating the uniques. If
True, the labels corresponding to missing values will be set to
na_sentinel.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".
return_inverse : boolean, default False
Whether the mapping of the original array values to their location
in the vector of uniques should be returned.
@@ -409,12 +413,17 @@
{{dtype}}_t val, na_value2
khiter_t k
{{name}}VectorData *ud
bint use_na_value
bint use_na_value, use_mask
uint8_t[:] mask_values

if return_inverse:
labels = np.empty(n, dtype=np.int64)
ud = uniques.data
use_na_value = na_value is not None
use_mask = mask is not None

if use_mask:
mask_values = mask.view("uint8")

if use_na_value:
# We need this na_value2 because we want to allow users
@@ -430,7 +439,11 @@
for i in range(n):
val = values[i]

if ignore_na and (
if ignore_na and use_mask:
Member: if we're worried about perf for existing cases, could we take this check outside of the loop?

Member (author): We need to check the mask for each value inside the loop, so not sure what can be moved outside?

Member: I was referring to the use_mask check; it would basically become a separate loop or even method.

Member (author): I don't think duplicating the full loop is worth it (the loop itself is 40 lines below here), given the minor performance impact I showed in the timings.

if mask_values[i]:
labels[i] = na_sentinel
continue
elif ignore_na and (
{{if not name.lower().startswith(("uint", "int"))}}
val != val or
{{endif}}
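The masked branch above can be sketched in pure Python (an illustration of the logic only, not the actual Cython code; names are simplified):

```python
import numpy as np

# Sketch of the mask-aware labeling loop: mask[i] == True marks a missing
# value and replaces the `val != val` / na_value checks.
values = np.array([1, 2, 1, 0], dtype="int64")
mask = np.array([False, False, False, True])  # last slot is missing
na_sentinel = -1

labels = []
uniques = []
seen = {}
for val, is_na in zip(values, mask):
    if is_na:
        # missing value -> sentinel label; never added to uniques
        labels.append(na_sentinel)
        continue
    if val not in seen:
        seen[val] = len(uniques)
        uniques.append(int(val))
    labels.append(seen[val])
```

This also shows why the use_mask check cannot simply be hoisted out of the loop: the mask must be consulted per element, although the branch itself is cheap.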
@@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -512,6 +525,10 @@
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".

Returns
-------
@@ -522,7 +539,7 @@
"""
uniques_vector = {{name}}Vector()
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
na_value=na_value, ignore_na=True,
na_value=na_value, ignore_na=True, mask=mask,
return_inverse=True)

def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -873,6 +890,8 @@
that is not a string is considered missing. If na_value is
not None, then _additionally_ any value "val" satisfying
val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for StringHashTable.

Returns
-------
@@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable):
return_inverse=return_inverse)

def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
object na_value=None):
object na_value=None, object mask=None):
"""
Calculate unique values and labels (no sorting!)

@@ -1112,6 +1131,8 @@
any value "val" satisfying val != val is considered missing.
If na_value is not None, then _additionally_, any value "val"
satisfying val == na_value is considered missing.
mask : ndarray[bool], optional
Not yet implemented for PyObjectHashTable.

Returns
-------
10 changes: 8 additions & 2 deletions pandas/core/algorithms.py
@@ -455,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:


def _factorize_array(
values, na_sentinel: int = -1, size_hint=None, na_value=None
values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Factorize an array-like to codes and uniques.
@@ -473,6 +473,10 @@ def _factorize_array(
parameter when you know that you don't have any values pandas would
consider missing in the array (NaN for float data, iNaT for
datetimes, etc.).
mask : ndarray[bool], optional
If not None, the mask is used as indicator for missing values
(True = missing, False = valid) instead of `na_value` or
condition "val != val".

Returns
-------
@@ -482,7 +486,9 @@
hash_klass, values = _get_data_algo(values)

table = hash_klass(size_hint or len(values))
uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
uniques, codes = table.factorize(
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
)

codes = ensure_platform_int(codes)
return codes, uniques
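One visible effect of threading `mask` through `_factorize_array` is that nullable dtypes no longer round-trip through float data with np.nan for factorization. A hedged sketch of the public behavior (values are illustrative):

```python
import pandas as pd

# Factorizing a nullable integer array: with the mask-based path, the
# integer data buffer is hashed directly and uniques keep the Int64 dtype,
# instead of the earlier conversion via to_numpy(na_value=np.nan).
arr = pd.array([1, pd.NA, 1], dtype="Int64")
codes, uniques = pd.factorize(arr)
```

Here `codes` is `[0, -1, 0]` (NA gets the na_sentinel) and `uniques` is the `Int64` array `[1]`.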
13 changes: 4 additions & 9 deletions pandas/core/arrays/boolean.py
@@ -83,6 +83,10 @@ def type(self) -> Type[np.bool_]:
def kind(self) -> str:
return "b"

@property
def numpy_dtype(self) -> np.dtype:
return np.dtype("bool")

@classmethod
def construct_array_type(cls) -> Type["BooleanArray"]:
"""
@@ -314,15 +318,6 @@ def map_string(s):
scalars = [map_string(x) for x in strings]
return cls._from_sequence(scalars, dtype, copy)

def _values_for_factorize(self) -> Tuple[np.ndarray, int]:
data = self._data.astype("int8")
data[self._mask] = -1
return data, -1

@classmethod
def _from_factorized(cls, values, original: "BooleanArray") -> "BooleanArray":
return cls._from_sequence(values, dtype=original.dtype)

_HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
9 changes: 0 additions & 9 deletions pandas/core/arrays/integer.py
@@ -366,10 +366,6 @@ def _from_sequence_of_strings(
scalars = to_numeric(strings, errors="raise")
return cls._from_sequence(scalars, dtype, copy)

@classmethod
def _from_factorized(cls, values, original) -> "IntegerArray":
return integer_array(values, dtype=original.dtype)

_HANDLED_TYPES = (np.ndarray, numbers.Number)

def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
@@ -479,11 +475,6 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
data = self.to_numpy(dtype=dtype, **kwargs)
return astype_nansafe(data, dtype, copy=False)

def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
# TODO: https://github.com/pandas-dev/pandas/issues/30037
# use masked algorithms, rather than object-dtype / np.nan.
return self.to_numpy(na_value=np.nan), np.nan

def _values_for_argsort(self) -> np.ndarray:
"""
Return values for sorting.
17 changes: 15 additions & 2 deletions pandas/core/arrays/masked.py
@@ -1,14 +1,15 @@
from typing import TYPE_CHECKING, Optional, Type, TypeVar
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas.util._decorators import doc

from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype
from pandas.core.dtypes.missing import isna, notna

from pandas.core.algorithms import take
from pandas.core.algorithms import _factorize_array, take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
from pandas.core.indexers import check_array_indexer

@@ -217,6 +218,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
mask = mask.copy()
return type(self)(data, mask, copy=False)

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
arr = self._data
mask = self._mask

codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
Contributor: is there a reason you want to call a private routine like this directly? shouldn't factorize just handle this directly? (isn't that the point of _values_for_factorize?)

Member (author): This is the way we also do it in the base EA factorize method. The reason we are using this, and not pd.factorize directly, is that the public factorize does not support the additional na_value and mask keywords.

The point of _values_for_factorize is indeed to avoid EA authors having to call this private _factorize_array method themselves (and to make it easier to implement EA.factorize), but here I explicitly do not use the general _values_for_factorize path, to be able to customize/optimize the IntegerArray/BooleanArray.factorize() method specifically for those dtypes.

Contributor:
> The reason we are using this, and not pd.factorize directly, is that the public factorize does not support the additional na_value and mask keywords.

this is really polluting the interface, I would much rather just add the keywords. It seems we are special-casing EA to no end. This needs to stop.

Member (author): Expanding the public interface of factorize is out of scope for this PR, IMO. The implementation here is exactly how we have already been doing it for two years (we already use _factorize_array in our other EAs). If you want to propose changing this, please open an issue to discuss.


# the hashtables don't handle all different types of bits
uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) # type: ignore
Member: I would prefer not to type: ignore this; I find the current hierarchy of things and the implicit requirements a little wonky.

Member (author): I am happy to do a PR that tries to rework the class hierarchy, but can that be done separately? It is really unrelated to the rest of this PR.

Member: I think it is related - this PR introduces an implicit requirement in factorize that self.dtype has a numpy_dtype attribute. I think mypy is being smart to flag that; again, I would prefer not to ignore the advice.

Member (author): OK, I added a minimal BaseMaskedDtype class to ensure the typing is correct. Can you take a look?

Is there no other way to indicate the types to mypy than adding yet another abstract property, as I did? (The base ExtensionArray class also has an abstract dtype property, typed as ExtensionDtype, so I am overriding an abstract property with a new abstract property just to have a different return type annotation.)

Contributor: I think mypy is right about this one. It only knows (or knew, prior to your commit) that BaseMaskedArray.dtype is ExtensionDtype, which doesn't necessarily have a .numpy_dtype attribute.

Member: Also, wondering: what is the impact of adding such a Generic to the actual class inheritance?

The Generic base class uses a metaclass that defines __getitem__; types are erased at runtime.

> Using generic classes (parameterized or not) to access attributes will result in type check failure. Outside the class definition body, a class attribute cannot be assigned, and can only be looked up by accessing it through a class instance that does not have an instance attribute with the same name

I believe the above restriction applies to 3.6, as Generic was changed in 3.7 to not use a custom metaclass; see https://docs.python.org/3/library/typing.html#user-defined-generic-types

In the Python docs, as far as I can see, Generic is only used to create user-defined type variables that can then be used in annotations, but not for subclassing actual classes; see https://www.python.org/dev/peps/pep-0484/#instantiating-generic-classes-and-type-erasure

Member (author) (@jorisvandenbossche, May 1, 2020): @simonjayhawkins since I don't fully grasp the Generic yet, I updated it with a simpler abstract dtype property (see the last added commit), which also seems to fix it. Are you OK with this, for now?

Member:
> Are you OK with this, for now?

I'll look in a short while.

I know there is significant resistance to Generic, but it would allow, say, ExtensionArray[PeriodDtype] to be equivalent to PeriodArray, and AnyArrayLike could then be parameterised as AnyArrayLike[PeriodDtype]. I suspect at some point we will want to refine types, but that's a discussion for another day.

Member (author): I am certainly open to going that way if it gives better typing in general for EAs and solves a bunch of things. But maybe that can go into a separate PR (at least I am not doing a # type: ignore anymore ;-))

Member:
> Are you OK with this, for now?

LGTM

> But maybe that can go into a separate PR

No problem. Generic (and Protocol) are the two typing additions that could actually alter runtime behaviour, so it is right that we scrutinize this. The only problem I've encountered is the repr of type(<instance>) in an assert statement in py3.6, where the instance is an instance of a Generic class. xref #31574

> at least I am not doing a # type: ignore anymore ;-)

👍

uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
return codes, uniques

def value_counts(self, dropna: bool = True) -> "Series":
"""
Returns a Series containing counts of each unique value.
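The new masked factorize path above can be exercised through the public method. A small sketch (behavior as described in the diff, shown here for the boolean dtype; values are illustrative):

```python
import pandas as pd

# BooleanArray.factorize hands self._data and self._mask to
# _factorize_array; uniques come back as a BooleanArray wrapping an
# all-False mask, i.e. with no missing values.
arr = pd.array([True, False, True, pd.NA], dtype="boolean")
codes, uniques = arr.factorize()
```

`codes` is `[0, 1, 0, -1]` and `uniques` is the `boolean` array `[True, False]`.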
2 changes: 2 additions & 0 deletions pandas/tests/extension/base/methods.py
@@ -133,6 +133,8 @@ def test_factorize_equivalence(self, data_for_grouping, na_sentinel):

tm.assert_numpy_array_equal(codes_1, codes_2)
self.assert_extension_array_equal(uniques_1, uniques_2)
assert len(uniques_1) == len(pd.unique(uniques_1))
assert uniques_1.dtype == data_for_grouping.dtype

def test_factorize_empty(self, data):
codes, uniques = pd.factorize(data[:0])
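The two assertions added to test_factorize_equivalence check invariants any EA factorize should satisfy; a standalone sketch of the same checks outside the test suite:

```python
import pandas as pd

# uniques must contain no duplicates and must preserve the input dtype.
data = pd.array([1, 1, 2, pd.NA], dtype="Int64")
codes, uniques = pd.factorize(data)

assert len(uniques) == len(pd.unique(uniques))
assert uniques.dtype == data.dtype
```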