pandas-dev · jorisvandenbossche · May 9, 2020 · Mar 27, 2020 · Mar 27, 2020 · Apr 2, 2020
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable):
     def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                 object na_value=None, bint ignore_na=False,
-                bint return_inverse=False):
+                object mask=None, bint return_inverse=False):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable):
             Whether NA-values should be ignored for calculating the uniques. If
             True, the labels corresponding to missing values will be set to
             na_sentinel.
+        mask : ndarray[bool], optional
+            If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid) instead of `na_value` or
+            condition "val != val".
         return_inverse : boolean, default False
             Whether the mapping of the original array values to their location
             in the vector of uniques should be returned.
@@ -409,12 +413,17 @@ cdef class {{name}}HashTable(HashTable):
             {{dtype}}_t val, na_value2
             khiter_t k
             {{name}}VectorData *ud
-            bint use_na_value
+            bint use_na_value, use_mask
+            uint8_t[:] mask_values
 
         if return_inverse:
             labels = np.empty(n, dtype=np.int64)
         ud = uniques.data
         use_na_value = na_value is not None
+        use_mask = mask is not None
+
+        if use_mask:
+            mask_values = mask.view("uint8")
 
         if use_na_value:
             # We need this na_value2 because we want to allow users
@@ -430,7 +439,11 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]
 
-                if ignore_na and (
+                if ignore_na and use_mask:
+                    if mask_values[i]:
+                        labels[i] = na_sentinel
+                        continue
+                elif ignore_na and (
                 {{if not name.lower().startswith(("uint", "int"))}}
                 val != val or
                 {{endif}}
@@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -512,6 +525,10 @@ cdef class {{name}}HashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            If not None, the mask is used as indicator for missing values
+            (True = missing, False = valid) instead of `na_value` or
+            condition "val != val".
 
         Returns
         -------
@@ -522,7 +539,7 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques_vector = {{name}}Vector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=True, mask=mask,
                             return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -873,6 +890,8 @@ cdef class StringHashTable(HashTable):
             that is not a string is considered missing. If na_value is
             not None, then _additionally_ any value "val" satisfying
             val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            Not yet implemented for StringHashTable.
 
         Returns
         -------
@@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None):
+                  object na_value=None, object mask=None):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1112,6 +1131,8 @@ cdef class PyObjectHashTable(HashTable):
             any value "val" satisfying val != val is considered missing.
             If na_value is not None, then _additionally_, any value "val"
             satisfying val == na_value is considered missing.
+        mask : ndarray[bool], optional
+            Not yet implemented for PyObjectHashTable.
 
         Returns
         -------

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -455,7 +455,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
 
 
 def _factorize_array(
-    values, na_sentinel: int = -1, size_hint=None, na_value=None
+    values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Factorize an array-like to codes and uniques.
@@ -473,6 +473,10 @@ def _factorize_array(
         parameter when you know that you don't have any values pandas would
         consider missing in the array (NaN for float data, iNaT for
         datetimes, etc.).
+    mask : ndarray[bool], optional
+        If not None, the mask is used as indicator for missing values
+        (True = missing, False = valid) instead of `na_value` or
+        condition "val != val".
 
     Returns
     -------
@@ -482,7 +486,9 @@ def _factorize_array(
     hash_klass, values = _get_data_algo(values)
 
     table = hash_klass(size_hint or len(values))
-    uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
+    uniques, codes = table.factorize(
+        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
+    )
 
     codes = ensure_platform_int(codes)
     return codes, uniques

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -27,6 +27,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
+from pandas.core.algorithms import _factorize_array
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
 from pandas.core.ops import invalid_comparison
@@ -481,7 +482,16 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike:
     def _values_for_factorize(self) -> Tuple[np.ndarray, float]:
         # TODO: https://github.com/pandas-dev/pandas/issues/30037
         # use masked algorithms, rather than object-dtype / np.nan.
-        return self.to_numpy(na_value=np.nan), np.nan
+        return self.to_numpy(dtype=float, na_value=np.nan), np.nan
+
+    def factorize2(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]:
+        # Masked factorize: hash the raw data, using the mask to flag missing values.
+        codes, uniques = _factorize_array(
+            self._data, na_sentinel=na_sentinel, mask=self._mask
+        )
+        # The hashtable may widen the dtype of uniques; cast back to the stored dtype.
+        uniques = uniques.astype(self._data.dtype, copy=False)
+        return codes, IntegerArray(uniques, np.zeros(len(uniques), dtype=bool))
 
     def _values_for_argsort(self) -> np.ndarray:
         """