-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH/PERF: use mask in factorize for nullable dtypes #33064
Changes from 6 commits
246b787
af1cdea
d030ff1
be5a21d
b0a88b9
2e94842
bcde548
c97d357
175a8f6
a6bc6fc
889b1b9
0370045
24b528f
6ed5239
1f7e994
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable): | |
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, bint ignore_na=False, | ||
bint return_inverse=False): | ||
object mask=None, bint return_inverse=False): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable): | |
Whether NA-values should be ignored for calculating the uniques. If | ||
True, the labels corresponding to missing values will be set to | ||
na_sentinel. | ||
mask : ndarray[bool], optional | ||
If not None, the mask is used as indicator for missing values | ||
(True = missing, False = valid) instead of `na_value` or | ||
condition "val != val". | ||
return_inverse : boolean, default False | ||
Whether the mapping of the original array values to their location | ||
in the vector of uniques should be returned. | ||
|
@@ -409,12 +413,17 @@ cdef class {{name}}HashTable(HashTable): | |
{{dtype}}_t val, na_value2 | ||
khiter_t k | ||
{{name}}VectorData *ud | ||
bint use_na_value | ||
bint use_na_value, use_mask | ||
uint8_t[:] mask_values | ||
|
||
if return_inverse: | ||
labels = np.empty(n, dtype=np.int64) | ||
ud = uniques.data | ||
use_na_value = na_value is not None | ||
use_mask = mask is not None | ||
|
||
if use_mask: | ||
mask_values = mask.view("uint8") | ||
|
||
if use_na_value: | ||
# We need this na_value2 because we want to allow users | ||
|
@@ -430,7 +439,11 @@ cdef class {{name}}HashTable(HashTable): | |
for i in range(n): | ||
val = values[i] | ||
|
||
if ignore_na and ( | ||
if ignore_na and use_mask: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we're worried about perf for existing cases, could take this check outside of the loop? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to check the mask for each value inside the loop, so not sure what can be moved outside? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was referring to the use_mask check; it would basically become a separate loop or even method There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think duplicating the full loop is worth it (the loop itself is 40 lines below here), given the minor performance impact I showed in the timings. |
||
if mask_values[i]: | ||
labels[i] = na_sentinel | ||
continue | ||
elif ignore_na and ( | ||
{{if not name.lower().startswith(("uint", "int"))}} | ||
val != val or | ||
{{endif}} | ||
|
@@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -512,6 +525,10 @@ cdef class {{name}}HashTable(HashTable): | |
any value "val" satisfying val != val is considered missing. | ||
If na_value is not None, then _additionally_, any value "val" | ||
satisfying val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
If not None, the mask is used as indicator for missing values | ||
(True = missing, False = valid) instead of `na_value` or | ||
condition "val != val". | ||
|
||
Returns | ||
------- | ||
|
@@ -522,7 +539,7 @@ cdef class {{name}}HashTable(HashTable): | |
""" | ||
uniques_vector = {{name}}Vector() | ||
return self._unique(values, uniques_vector, na_sentinel=na_sentinel, | ||
na_value=na_value, ignore_na=True, | ||
na_value=na_value, ignore_na=True, mask=mask, | ||
return_inverse=True) | ||
|
||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||
|
@@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -873,6 +890,8 @@ cdef class StringHashTable(HashTable): | |
that is not a string is considered missing. If na_value is | ||
not None, then _additionally_ any value "val" satisfying | ||
val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
Not yet implementd for StringHashTable. | ||
|
||
Returns | ||
------- | ||
|
@@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -1112,6 +1131,8 @@ cdef class PyObjectHashTable(HashTable): | |
any value "val" satisfying val != val is considered missing. | ||
If na_value is not None, then _additionally_, any value "val" | ||
satisfying val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
Not yet implemented for PyObjectHashTable. | ||
|
||
Returns | ||
------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
from typing import TYPE_CHECKING, Optional, Type, TypeVar | ||
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib, missing as libmissing | ||
from pandas._typing import Scalar | ||
from pandas.util._decorators import doc | ||
|
||
from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype | ||
from pandas.core.dtypes.missing import isna, notna | ||
|
||
from pandas.core.algorithms import take | ||
from pandas.core.algorithms import _factorize_array, take | ||
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin | ||
from pandas.core.indexers import check_array_indexer | ||
|
||
|
@@ -217,6 +218,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: | |
mask = mask.copy() | ||
return type(self)(data, mask, copy=False) | ||
|
||
@doc(ExtensionArray.factorize) | ||
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: | ||
arr = self._data | ||
mask = self._mask | ||
|
||
codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason you want to call a private routine like this directly? shouldn't factorize just handle this directly? (isn't that the point of _values_for_factorize). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the way we also do it in the base EA The point of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is really polluting the interface, I would much rather just add they keywords. It seems we are special casing EA to no end. This needs to stop.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Expanding the public interface of |
||
|
||
# the hashtables don't handle all different types of bits | ||
uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I get a "pandas/core/arrays/masked.py:229: error: "ExtensionDtype" has no attribute "numpy_dtype"" mypy failure cc @simonjayhawkins @WillAyd how can I solve / silence this? The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm yea so I guess complaining because as far as this class is defined, the return type of I guess it comes back to the class design; if we have something else that inherits from BaseMaskedArray it could fail at runtime without if it isn't constructed to return a dtype from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the subclasses IntegerArray and BooleanArray have a correctly typed In principle I could add a
in the BaseMaskedArray class to solve this, I suppose? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right; I think it's going to be tough to make this work with mypy if we implicitly enforce that subclasses make What does the comment directly preceding it refer to? Perhaps there is a way to do this without breaking the currently implied subclass requirements? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you override the type signature of IntegerArray.dtype to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to just disable mypy on this line?
The hashtable is only implemented for int64. So if you have an int32 array, the unique values coming out of I could do this differently by eg building up a mapping of EADtypes -> numpy dtypes and looking it up from there instead of using the attribute, but that would just be introducing more complex workarounds to just to satisfy mypy. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that's probably the cleanest solution architecturally There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that's probably the cleanest solution architecturally. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think you'll need to change anything on the array side. The dtypes will inherit from MaskedExtensionDtype, so mypy should know that Or we add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
No, since mypy thinks So either we would indeed need to add |
||
uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) | ||
return codes, uniques | ||
|
||
def value_counts(self, dropna: bool = True) -> "Series": | ||
""" | ||
Returns a Series containing counts of each unique value. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isn't this redudant? since sort is a parameter?
self.idx.factorize(sort=sort)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ExtensionArrays don't support the
sort
keyword, the other values are Index objects, which have that keyword.So the tests for
sort=True
are skipped above in case ofidx
being an EAThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is very confusing then. I would separate the EAs out to a separate asv.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree this is confusing. But I switched to use the factorize function in the hope to make this clearer, and to keep a single benchmark (the index method is just simply calling pd.factorize on itself, so this should benchmark the exact same thing).
And that way, we can actually remove the skip for
sort
for EAs.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback updated. Much better now I think (and fine for a single benchmark class/function)