-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH/PERF: use mask in factorize for nullable dtypes #33064
Changes from 8 commits
246b787
af1cdea
d030ff1
be5a21d
b0a88b9
2e94842
bcde548
c97d357
175a8f6
a6bc6fc
889b1b9
0370045
24b528f
6ed5239
1f7e994
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -368,7 +368,7 @@ cdef class {{name}}HashTable(HashTable): | |
def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||
Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, | ||
object na_value=None, bint ignore_na=False, | ||
bint return_inverse=False): | ||
object mask=None, bint return_inverse=False): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -391,6 +391,10 @@ cdef class {{name}}HashTable(HashTable): | |
Whether NA-values should be ignored for calculating the uniques. If | ||
True, the labels corresponding to missing values will be set to | ||
na_sentinel. | ||
mask : ndarray[bool], optional | ||
If not None, the mask is used as indicator for missing values | ||
(True = missing, False = valid) instead of `na_value` or | ||
condition "val != val". | ||
return_inverse : boolean, default False | ||
Whether the mapping of the original array values to their location | ||
in the vector of uniques should be returned. | ||
|
@@ -409,12 +413,17 @@ cdef class {{name}}HashTable(HashTable): | |
{{dtype}}_t val, na_value2 | ||
khiter_t k | ||
{{name}}VectorData *ud | ||
bint use_na_value | ||
bint use_na_value, use_mask | ||
uint8_t[:] mask_values | ||
|
||
if return_inverse: | ||
labels = np.empty(n, dtype=np.int64) | ||
ud = uniques.data | ||
use_na_value = na_value is not None | ||
use_mask = mask is not None | ||
|
||
if use_mask: | ||
mask_values = mask.view("uint8") | ||
|
||
if use_na_value: | ||
# We need this na_value2 because we want to allow users | ||
|
@@ -430,7 +439,11 @@ cdef class {{name}}HashTable(HashTable): | |
for i in range(n): | ||
val = values[i] | ||
|
||
if ignore_na and ( | ||
if ignore_na and use_mask: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we're worried about perf for existing cases, could take this check outside of the loop? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to check the mask for each value inside the loop, so not sure what can be moved outside? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was referring to the use_mask check; it would basically become a separate loop or even method There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think duplicating the full loop is worth it (the loop itself is 40 lines below here), given the minor performance impact I showed in the timings. |
||
if mask_values[i]: | ||
labels[i] = na_sentinel | ||
continue | ||
elif ignore_na and ( | ||
{{if not name.lower().startswith(("uint", "int"))}} | ||
val != val or | ||
{{endif}} | ||
|
@@ -494,7 +507,7 @@ cdef class {{name}}HashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -512,6 +525,10 @@ cdef class {{name}}HashTable(HashTable): | |
any value "val" satisfying val != val is considered missing. | ||
If na_value is not None, then _additionally_, any value "val" | ||
satisfying val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
If not None, the mask is used as indicator for missing values | ||
(True = missing, False = valid) instead of `na_value` or | ||
condition "val != val". | ||
|
||
Returns | ||
------- | ||
|
@@ -522,7 +539,7 @@ cdef class {{name}}HashTable(HashTable): | |
""" | ||
uniques_vector = {{name}}Vector() | ||
return self._unique(values, uniques_vector, na_sentinel=na_sentinel, | ||
na_value=na_value, ignore_na=True, | ||
na_value=na_value, ignore_na=True, mask=mask, | ||
return_inverse=True) | ||
|
||
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, | ||
|
@@ -855,7 +872,7 @@ cdef class StringHashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -873,6 +890,8 @@ cdef class StringHashTable(HashTable): | |
that is not a string is considered missing. If na_value is | ||
not None, then _additionally_ any value "val" satisfying | ||
val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
Not yet implemented for StringHashTable. | ||
|
||
Returns | ||
------- | ||
|
@@ -1094,7 +1113,7 @@ cdef class PyObjectHashTable(HashTable): | |
return_inverse=return_inverse) | ||
|
||
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, | ||
object na_value=None): | ||
object na_value=None, object mask=None): | ||
""" | ||
Calculate unique values and labels (no sorting!) | ||
|
||
|
@@ -1112,6 +1131,8 @@ cdef class PyObjectHashTable(HashTable): | |
any value "val" satisfying val != val is considered missing. | ||
If na_value is not None, then _additionally_, any value "val" | ||
satisfying val == na_value is considered missing. | ||
mask : ndarray[bool], optional | ||
Not yet implemented for PyObjectHashTable. | ||
|
||
Returns | ||
------- | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
from typing import TYPE_CHECKING, Optional, Type, TypeVar | ||
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import lib, missing as libmissing | ||
from pandas._typing import Scalar | ||
from pandas.util._decorators import doc | ||
|
||
from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype | ||
from pandas.core.dtypes.missing import isna, notna | ||
|
||
from pandas.core.algorithms import take | ||
from pandas.core.algorithms import _factorize_array, take | ||
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin | ||
from pandas.core.indexers import check_array_indexer | ||
|
||
|
@@ -217,6 +218,18 @@ def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: | |
mask = mask.copy() | ||
return type(self)(data, mask, copy=False) | ||
|
||
@doc(ExtensionArray.factorize) | ||
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: | ||
arr = self._data | ||
mask = self._mask | ||
|
||
codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason you want to call a private routine like this directly? shouldn't factorize just handle this directly? (isn't that the point of _values_for_factorize). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the way we also do it in the base EA The point of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is really polluting the interface, I would much rather just add they keywords. It seems we are special casing EA to no end. This needs to stop.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Expanding the public interface of |
||
|
||
# the hashtables don't handle all different types of bits | ||
uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) # type: ignore | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer not to type: ignore this; I find the current hierarchy of things and the implicit requirements a little wonky There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am happy to do a PR that tries to rework the class hierarchy, but can that be done separately? Because it is really unrelated to the rest of this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is related - this PR introduces an implicit requirement in factorize that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, I added a minimal BaseMaskedDtype class to ensure typing is correct. Can you take a look? There is no other way to indicate the types to mypy than adding yet another abstract property as I did? (the base ExtensionArray class also has an abstract There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think mypy is right about this one. It only knows (or knew prior to your commit) that BaseMasedArray.dtype is ExtensionDtype, which doesn't necessarily have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The Generic base class uses a metaclass that defines getitem. types are erased at runtime. Using generic classes (parameterized or not) to access attributes will result in type check failure. Outside the class definition body, a class attribute cannot be assigned, and can only be looked up by accessing it through a class instance that does not have an instance attribute with the same name I believe the above restrictions applies to 3.6 as Generic was changed in 3.7 to not use a custom metaclass. see https://docs.python.org/3/library/typing.html#user-defined-generic-types
see https://www.python.org/dev/peps/pep-0484/#instantiating-generic-classes-and-type-erasure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @simonjayhawkins since I don't fully grasp the generic, yet, I updated it with a simpler abstract dtype property (see the last added commit), which also seems to fix it. Or you OK with this, for now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'll look in a short while. I know there is significant resistance to Generic, but it would allow say ExtensionArray[PeriodDtype] to be equivalent to PeriodArray and then AnyArrayLike could then be parameterised to say AnyArrayLike[PeriodDtype]. I suspect at some point we will want to refine types. but that's a discussion for another day. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am certainly open to going that way if gives better typing in general for EAs and solves a bunch of things. But maybe that can then go into a separate PR (at least I am not doing a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
LGTM
no problem. Generic (and Protocol) are the two typing additions that actually could potentially alter runtime behaviour, so it is right that we scrutinize this. The only problem i've encountered is the repr of type(<instance>) in an assert statement in py3.6, where the instance is an instance of a Generic Class. xref #31574
👍 |
||
uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) | ||
return codes, uniques | ||
|
||
def value_counts(self, dropna: bool = True) -> "Series": | ||
""" | ||
Returns a Series containing counts of each unique value. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
isn't this redudant? since sort is a parameter?
self.idx.factorize(sort=sort)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ExtensionArrays don't support the
sort
keyword. So the tests for
sort=True
are skipped above in case ofidx
being an EAThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is very confusing then. I would separate the EAs out to a separate asv.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree this is confusing. But I switched to use the factorize function in the hope to make this clearer, and to keep a single benchmark (the index method is just simply calling pd.factorize on itself, so this should benchmark the exact same thing).
And that way, we can actually remove the skip for
sort
for EAs.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jreback updated. Much better now I think (and fine for a single benchmark class/function)