From 09183e868a7985879a4a79e549e7ea839719e36e Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 2 Oct 2021 18:37:51 -0400 Subject: [PATCH 01/23] CLN: removing x and replacing with starget --- pandas/_libs/index.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f4d59962c111e..0c57e5417b193 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -313,7 +313,7 @@ cdef class IndexEngine: missing : np.ndarray[np.intp] """ cdef: - ndarray values, x + ndarray values ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} @@ -366,7 +366,9 @@ cdef class IndexEngine: # GH#35392 if need_nan_check: # Do this check only once - stargets_has_nan = any(util.is_nan(val) for x in stargets) + stargets_has_nan = any( + util.is_nan(starget) for starget in stargets + ) need_nan_check = False if stargets_has_nan: From 62d533511bd23dcb7f83e4ce727c0469cf92cb1d Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 3 Oct 2021 17:19:06 -0400 Subject: [PATCH 02/23] TST: adding NaT non unique tests --- pandas/tests/indexes/object/test_indexing.py | 37 ++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 039483cc948df..17d622b844b36 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -2,6 +2,7 @@ import pytest from pandas._libs.missing import is_matching_na +from pandas._libs.tslibs.nattype import NaT import pandas as pd from pandas import Index @@ -96,6 +97,42 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture): tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) + # NaTs + if is_matching_na(nulls_fixture, NaT): + # NaT vs dt64nat + index = Index( + np.array( + ["2021-10-02", nulls_fixture, np.datetime64("NaT"), nulls_fixture], + dtype=object, + ) + ) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # dt64nat vs td64nat + index = Index( + np.array( + [ + "2021-10-02", + np.datetime64("NaT"), + np.timedelta64("NaT"), + np.datetime64("NaT"), + ], + dtype=object, + ) + ) + # pass as index to prevent target from being casted to DatetimeIndex + indexer, missing = index.get_indexer_non_unique( + Index([np.datetime64("NaT")], dtype=object) + ) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + class TestSliceLocs: @pytest.mark.parametrize( From 8271d58db39d87392ede8437cad2411194f1ef2c Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 3 Oct 2021 21:56:06 -0400 Subject: [PATCH 03/23] BUG: check np.datetime64('NaT') and np.timedelta64('NaT') in get_indexer_non_unique --- pandas/_libs/index.pyx | 58 ++++++++++++++++++++++++++++++--- pandas/_libs/tslibs/nattype.pxd | 2 ++ pandas/_libs/tslibs/nattype.pyx | 16 +++++++++ 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index fc05f8f443814..fd32dbd38108d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -23,7 +23,11 @@ cnp.import_array() from pandas._libs cimport util from pandas._libs.hashtable cimport HashTable -from pandas._libs.tslibs.nattype cimport c_NaT as NaT +from pandas._libs.tslibs.nattype cimport ( + c_NaT as NaT, + is_dt64nat, + is_td64nat, +) from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport _Timedelta from pandas._libs.tslibs.timestamps cimport _Timestamp @@ -319,10 +323,13 @@ cdef class IndexEngine: ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} - object val + object val, dt64nat, td64nat Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True + d_has_dt64nat = False, stargets_has_dt64nat = False, + need_dt64nat_check = True, d_has_td64nat = False, + stargets_has_td64nat = False, need_td64nat_check = True values = self.values stargets = set(targets) @@ -380,12 +387,53 @@ cdef class IndexEngine: d_has_nan = True d[np.nan].append(i) + elif is_dt64nat(val): + if need_dt64nat_check: + # Do this check only once + stargets_has_dt64nat = any( + is_dt64nat(starget) for starget in stargets + ) + need_dt64nat_check = False + + if stargets_has_dt64nat: + if not d_has_dt64nat: + dt64nat = np.datetime64("NaT") + d[dt64nat] = [] + d_has_dt64nat = True + d[dt64nat].append(i) + + elif is_td64nat(val): + if need_td64nat_check: + # Do this check only once + stargets_has_td64nat = any( + is_td64nat(starget) for starget in stargets + ) + need_td64nat_check = False + + if stargets_has_td64nat: + if not d_has_td64nat: + td64nat = np.timedelta64("NaT") + d[td64nat] = [] + d_has_td64nat = True + d[td64nat].append(i) + for i in range(n_t): val = targets[i] - # found - if val in d or (d_has_nan and util.is_nan(val)): - key = val if not util.is_nan(val) else np.nan + if ( + val in d + or d_has_nan and util.is_nan(val) + or d_has_dt64nat and is_dt64nat(val) + or d_has_td64nat and is_td64nat(val) + ): + key = val + if d_has_nan and util.is_nan(key): + key = np.nan + elif d_has_dt64nat and is_dt64nat(key): + key = dt64nat + elif d_has_td64nat and is_td64nat(key): + key = td64nat + for j in d[key]: # realloc if needed diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index d38f4518f9bf0..35319bd88053a 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -16,4 +16,6 @@ cdef _NaT c_NaT cdef bint checknull_with_nat(object val) +cdef bint is_dt64nat(object val) +cdef bint is_td64nat(object val) cpdef bint is_null_datetimelike(object val, bint inat_is_null=*) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 521927cd910ec..b7cceb693549b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1133,6 +1133,22 @@ cdef inline bint checknull_with_nat(object val): """ return val is None or util.is_nan(val) or val is c_NaT +cdef inline bint is_dt64nat(object val): + """ + Utility to check if val is np.datetime64("NaT"). + """ + if util.is_datetime64_object(val): + return get_datetime64_value(val) == NPY_NAT + return False + +cdef inline bint is_td64nat(object val): + """ + Utility to check if val is np.timedelta64("NaT"). + """ + if util.is_timedelta64_object(val): + return get_timedelta64_value(val) == NPY_NAT + return False + cpdef bint is_null_datetimelike(object val, bint inat_is_null=True): """ From da2dc442cddf5185e37307344f180b3884156089 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 3 Oct 2021 22:31:39 -0400 Subject: [PATCH 04/23] DOC: adding whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 381b0f39ff849..3f83af0ad38b7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -438,6 +438,7 @@ Indexing - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) - Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`) +- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) - Missing From 801d8da61000be7dd92382ccde821802c4ea834d Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 7 Oct 2021 10:54:00 -0400 Subject: [PATCH 05/23] CLN: putting parens around each condition --- pandas/_libs/index.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index fd32dbd38108d..276fcea67b1d2 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -422,9 +422,9 @@ cdef class IndexEngine: # found if ( val in d - or d_has_nan and util.is_nan(val) - or d_has_dt64nat and is_dt64nat(val) - or d_has_td64nat and is_td64nat(val) + or (d_has_nan and util.is_nan(val)) + or (d_has_dt64nat and is_dt64nat(val)) + or (d_has_td64nat and is_td64nat(val)) ): key = val if d_has_nan and util.is_nan(key): From 00e0f68d4284fa9ce9b23bca079973ab8ce520cf Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 7 Oct 2021 11:32:40 -0400 Subject: [PATCH 06/23] CLN: refactor with is_dt64nat and istd64nat --- pandas/_libs/missing.pyx | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index cbe79d11fbfc9..7153626932c28 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -20,7 +20,9 @@ from pandas._libs cimport util from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, + is_dt64nat, is_null_datetimelike, + is_td64nat, ) from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_value, @@ -77,18 +79,10 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False and util.is_complex_object(right) and util.is_nan(right) ) - elif util.is_datetime64_object(left): - return ( - get_datetime64_value(left) == NPY_NAT - and util.is_datetime64_object(right) - and get_datetime64_value(right) == NPY_NAT - ) - elif util.is_timedelta64_object(left): - return ( - get_timedelta64_value(left) == NPY_NAT - and util.is_timedelta64_object(right) - and get_timedelta64_value(right) == NPY_NAT - ) + elif is_dt64nat(left): + return is_dt64nat(right) + elif is_td64nat(left): + return is_td64nat(right) elif is_decimal_na(left): return is_decimal_na(right) return False @@ -345,20 +339,16 @@ def isneginf_scalar(val: object) -> bool: cdef inline bint is_null_datetime64(v): # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') - if checknull_with_nat(v): + if checknull_with_nat(v) or is_dt64nat(v): return True - elif util.is_datetime64_object(v): - return get_datetime64_value(v) == NPY_NAT return False cdef inline bint is_null_timedelta64(v): # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') - if checknull_with_nat(v): + if checknull_with_nat(v) or is_td64nat(v): return True - elif util.is_timedelta64_object(v): - return get_timedelta64_value(v) == NPY_NAT return False From da54caa7579391b504a5432ff6b377081ffc1869 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 7 Oct 2021 21:35:30 -0400 Subject: [PATCH 07/23] TST: separate tests + use np nat fixtures --- pandas/_testing/__init__.py | 1 + pandas/conftest.py | 13 +++++++ pandas/tests/indexes/object/test_indexing.py | 38 +++++++++++--------- 3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index c54185e324646..94d0072d989dc 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -157,6 +157,7 @@ ) NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")] +NP_NAT_OBJECTS = [np.datetime64("NaT"), np.timedelta64("NaT")] EMPTY_STRING_PATTERN = re.compile("^$") diff --git a/pandas/conftest.py b/pandas/conftest.py index 44b805c632723..0a94e5185b168 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -332,6 +332,19 @@ def unique_nulls_fixture(request): # Generate cartesian product of unique_nulls_fixture: unique_nulls_fixture2 = unique_nulls_fixture + +@pytest.fixture(params=tm.NP_NAT_OBJECTS, ids=lambda x: type(x).__name__) +def np_nat_fixture(request): + """ + Fixture for each NaT type in numpy. + """ + return request.param + + +# Generate cartesian product of unique_nulls_fixture: +np_nat_fixture2 = np_nat_fixture + + # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 17d622b844b36..d40dcaee40259 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -2,7 +2,6 @@ import pytest from pandas._libs.missing import is_matching_na -from pandas._libs.tslibs.nattype import NaT import pandas as pd from pandas import Index @@ -97,38 +96,43 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture): tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - # NaTs - if is_matching_na(nulls_fixture, NaT): - # NaT vs dt64nat + @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") + def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): + expected_missing = np.array([], dtype=np.intp) + # matching-but-not-identical nats + if is_matching_na(np_nat_fixture, np_nat_fixture2): + # ensure nats are different objects index = Index( np.array( - ["2021-10-02", nulls_fixture, np.datetime64("NaT"), nulls_fixture], - dtype=object, - ) + ["2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy()], + ), + dtype=object, ) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + # pass as index to prevent target from being casted to DatetimeIndex + indexer, missing = index.get_indexer_non_unique( + Index([np_nat_fixture], dtype=object) + ) + expected_indexer = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - - # dt64nat vs td64nat + # dt64nat vs td64nat + else: index = Index( np.array( [ "2021-10-02", - np.datetime64("NaT"), - np.timedelta64("NaT"), - np.datetime64("NaT"), + np_nat_fixture, + np_nat_fixture2, + np_nat_fixture, + np_nat_fixture2, ], dtype=object, ) ) # pass as index to prevent target from being casted to DatetimeIndex indexer, missing = index.get_indexer_non_unique( - Index([np.datetime64("NaT")], dtype=object) + Index([np_nat_fixture], dtype=object) ) - expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) From 566e096ec247e70eebc008cee57743109326cb35 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 7 Oct 2021 21:37:59 -0400 Subject: [PATCH 08/23] CLN: short circuit np nat check for object dtype --- pandas/_libs/index.pyx | 67 +++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 276fcea67b1d2..c13c0d5279ad9 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -329,7 +329,8 @@ cdef class IndexEngine: bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True d_has_dt64nat = False, stargets_has_dt64nat = False, need_dt64nat_check = True, d_has_td64nat = False, - stargets_has_td64nat = False, need_td64nat_check = True + stargets_has_td64nat = False, need_td64nat_check = True, + need_np_nat_check = False values = self.values stargets = set(targets) @@ -364,6 +365,11 @@ cdef class IndexEngine: if stargets: # otherwise, map by iterating through all items in the index + # determine if we need to check for numpy nats + # ie. np.datetime64("NaT") np.timedelta64("NaT") + if values.dtype == object: + need_np_nat_check = True + for i in range(n): val = values[i] if val in stargets: @@ -387,35 +393,36 @@ cdef class IndexEngine: d_has_nan = True d[np.nan].append(i) - elif is_dt64nat(val): - if need_dt64nat_check: - # Do this check only once - stargets_has_dt64nat = any( - is_dt64nat(starget) for starget in stargets - ) - need_dt64nat_check = False - - if stargets_has_dt64nat: - if not d_has_dt64nat: - dt64nat = np.datetime64("NaT") - d[dt64nat] = [] - d_has_dt64nat = True - d[dt64nat].append(i) - - elif is_td64nat(val): - if need_td64nat_check: - # Do this check only once - stargets_has_td64nat = any( - is_td64nat(starget) for starget in stargets - ) - need_td64nat_check = False - - if stargets_has_td64nat: - if not d_has_td64nat: - td64nat = np.timedelta64("NaT") - d[td64nat] = [] - d_has_td64nat = True - d[td64nat].append(i) + elif need_np_nat_check: + if is_dt64nat(val): + if need_dt64nat_check: + # Do this check only once + stargets_has_dt64nat = any( + is_dt64nat(starget) for starget in stargets + ) + need_dt64nat_check = False + + if stargets_has_dt64nat: + if not d_has_dt64nat: + dt64nat = np.datetime64("NaT") + d[dt64nat] = [] + d_has_dt64nat = True + d[dt64nat].append(i) + + elif is_td64nat(val): + if need_td64nat_check: + # Do this check only once + stargets_has_td64nat = any( + is_td64nat(starget) for starget in stargets + ) + need_td64nat_check = False + + if stargets_has_td64nat: + if not d_has_td64nat: + td64nat = np.timedelta64("NaT") + d[td64nat] = [] + d_has_td64nat = True + d[td64nat].append(i) for i in range(n_t): val = targets[i] From d20027ddc0ec8a75a259999258b507713367ae0e Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 7 Oct 2021 21:55:17 -0400 Subject: [PATCH 09/23] CLN: reverting starget change for another PR --- pandas/_libs/index.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c13c0d5279ad9..c959ac01f7fd7 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -381,9 +381,7 @@ cdef class IndexEngine: # GH#35392 if need_nan_check: # Do this check only once - stargets_has_nan = any( - util.is_nan(starget) for starget in stargets - ) + stargets_has_nan = any(util.is_nan(val) for x in stargets) need_nan_check = False if stargets_has_nan: From 299a45a625ccf858e0876ab9dab5d6a858d9ab35 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 9 Oct 2021 21:17:30 -0400 Subject: [PATCH 10/23] CLN: fixing docstring --- pandas/_libs/tslibs/nattype.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index b7cceb693549b..23094bdb90483 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1135,7 +1135,7 @@ cdef inline bint checknull_with_nat(object val): cdef inline bint is_dt64nat(object val): """ - Utility to check if val is np.datetime64("NaT"). + Is this a np.datetime64 object np.datetime64("NaT"). """ if util.is_datetime64_object(val): return get_datetime64_value(val) == NPY_NAT @@ -1143,7 +1143,7 @@ cdef inline bint is_dt64nat(object val): cdef inline bint is_td64nat(object val): """ - Utility to check if val is np.timedelta64("NaT"). + Is this a np.timedelta64 object np.timedelta64("NaT"). """ if util.is_timedelta64_object(val): return get_timedelta64_value(val) == NPY_NAT From 8878edfcacd3f55e3ebba273f4fdd0bf0c134c55 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 9 Oct 2021 21:20:56 -0400 Subject: [PATCH 11/23] CLN: fixing np_nat_fixture2 comment --- pandas/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 0a94e5185b168..75711b19dfcfd 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -341,7 +341,7 @@ def np_nat_fixture(request): return request.param -# Generate cartesian product of unique_nulls_fixture: +# Generate cartesian product of np_nat_fixture: np_nat_fixture2 = np_nat_fixture From c4acbda29dd8174dc41f125d1f7612288f2d415f Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 9 Oct 2021 21:58:18 -0400 Subject: [PATCH 12/23] CLN: forgot to undo this line for stargets --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c959ac01f7fd7..1a061ac903b2e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -319,7 +319,7 @@ cdef class IndexEngine: missing : np.ndarray[np.intp] """ cdef: - ndarray values + ndarray values, x ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} From 5c94e8b879227bdb6c1be9a970314f2883f160d6 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 10 Oct 2021 21:24:33 -0400 Subject: [PATCH 13/23] DOC: TODO for np nats --- pandas/tests/indexes/object/test_indexing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index d40dcaee40259..abcf7d455549e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -96,6 +96,9 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture): tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) + # TODO: distinguish between different date/time units + # for datetime64("NaT") and timedelta64("NaT"): + # ie. np.datetime64("NaT") vs np.datetime64("NaT", "ns"), np.datetime64("NaT", "ms") @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_missing = np.array([], dtype=np.intp) From af33f5fd1b2500351fa4a22147119b8cf4fe5d08 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 10 Oct 2021 21:48:26 -0400 Subject: [PATCH 14/23] DOC: adding comments --- pandas/_libs/index.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1a061ac903b2e..d30e9d5514f1d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -402,6 +402,7 @@ cdef class IndexEngine: if stargets_has_dt64nat: if not d_has_dt64nat: + # store to ensure future access to `d` uses same key dt64nat = np.datetime64("NaT") d[dt64nat] = [] d_has_dt64nat = True @@ -417,6 +418,7 @@ cdef class IndexEngine: if stargets_has_td64nat: if not d_has_td64nat: + # store to ensure future access to `d` uses same key td64nat = np.timedelta64("NaT") d[td64nat] = [] d_has_td64nat = True @@ -425,6 +427,9 @@ cdef class IndexEngine: for i in range(n_t): val = targets[i] # found + # cannot search for nan/nat target using `in`, + # need to lookup key using d_has_... + # and confirm na type via util function if ( val in d or (d_has_nan and util.is_nan(val)) From 1490968cbdbde885bb4c82955566e45e778a2bf8 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Thu, 14 Oct 2021 16:13:22 -0400 Subject: [PATCH 15/23] TST: ensure numpy doesn't downcast nats --- pandas/tests/indexes/object/test_indexing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index abcf7d455549e..e999d822e350e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -108,6 +108,7 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): index = Index( np.array( ["2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy()], + dtype=object, ), dtype=object, ) @@ -130,7 +131,8 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): np_nat_fixture2, ], dtype=object, - ) + ), + dtype=object, ) # pass as index to prevent target from being casted to DatetimeIndex indexer, missing = index.get_indexer_non_unique( From 77216f752f279007845769695edc7896b6d5540e Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 17 Oct 2021 21:28:33 -0400 Subject: [PATCH 16/23] CLN: reverting back to original is_matching_na check + adding time units for nat --- pandas/_libs/missing.pyx | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 7153626932c28..90f409d371e6b 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -25,6 +25,7 @@ from pandas._libs.tslibs.nattype cimport ( is_td64nat, ) from pandas._libs.tslibs.np_datetime cimport ( + get_datetime64_unit, get_datetime64_value, get_timedelta64_value, ) @@ -79,10 +80,20 @@ cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False and util.is_complex_object(right) and util.is_nan(right) ) - elif is_dt64nat(left): - return is_dt64nat(right) - elif is_td64nat(left): - return is_td64nat(right) + elif util.is_datetime64_object(left): + return ( + get_datetime64_value(left) == NPY_NAT + and util.is_datetime64_object(right) + and get_datetime64_value(right) == NPY_NAT + and get_datetime64_unit(left) == get_datetime64_unit(right) + ) + elif util.is_timedelta64_object(left): + return ( + get_timedelta64_value(left) == NPY_NAT + and util.is_timedelta64_object(right) + and get_timedelta64_value(right) == NPY_NAT + and get_datetime64_unit(left) == get_datetime64_unit(right) + ) elif is_decimal_na(left): return is_decimal_na(right) return False From 6b4179efe266920054e94a25815a92ed4cd67999 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 17 Oct 2021 21:29:54 -0400 Subject: [PATCH 17/23] TST: updating np_nat_objects fixtures with date units --- pandas/_testing/__init__.py | 31 +++++++++++++++++++- pandas/tests/indexes/object/test_indexing.py | 3 -- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 94d0072d989dc..bf5e0821d17ca 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -157,7 +157,36 @@ ) NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")] -NP_NAT_OBJECTS = [np.datetime64("NaT"), np.timedelta64("NaT")] +NP_NAT_OBJECTS = [ + np.datetime64("NaT"), + np.datetime64("NaT", "Y"), + np.datetime64("NaT", "M"), + np.datetime64("NaT", "W"), + np.datetime64("NaT", "D"), + np.datetime64("NaT", "h"), + np.datetime64("NaT", "m"), + np.datetime64("NaT", "s"), + np.datetime64("NaT", "ms"), + np.datetime64("NaT", "us"), + np.datetime64("NaT", "ns"), + np.datetime64("NaT", "ps"), + np.datetime64("NaT", "fs"), + np.datetime64("NaT", "as"), + np.timedelta64("NaT"), + np.timedelta64("NaT", "Y"), + np.timedelta64("NaT", "M"), + np.timedelta64("NaT", "W"), + np.timedelta64("NaT", "D"), + np.timedelta64("NaT", "h"), + np.timedelta64("NaT", "m"), + np.timedelta64("NaT", "s"), + np.timedelta64("NaT", "ms"), + np.timedelta64("NaT", "us"), + np.timedelta64("NaT", "ns"), + np.timedelta64("NaT", "ps"), + np.timedelta64("NaT", "fs"), + np.timedelta64("NaT", "as"), +] EMPTY_STRING_PATTERN = re.compile("^$") diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index e999d822e350e..ab65936efce83 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -96,9 +96,6 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture): tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) - # TODO: distinguish between different date/time units - # for datetime64("NaT") and timedelta64("NaT"): - # ie. np.datetime64("NaT") vs np.datetime64("NaT", "ns"), np.datetime64("NaT", "ms") @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_missing = np.array([], dtype=np.intp) From 94e7adde032908753e349aca9ae00d28f303ae24 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 17 Oct 2021 21:42:02 -0400 Subject: [PATCH 18/23] CLN: consoldiating all object na checks into one --- pandas/_libs/index.pyx | 120 +++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d30e9d5514f1d..e3a4d496642f7 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -23,11 +23,7 @@ cnp.import_array() from pandas._libs cimport util from pandas._libs.hashtable cimport HashTable -from pandas._libs.tslibs.nattype cimport ( - c_NaT as NaT, - is_dt64nat, - is_td64nat, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.timedeltas cimport _Timedelta from pandas._libs.tslibs.timestamps cimport _Timestamp @@ -319,18 +315,14 @@ cdef class IndexEngine: missing : np.ndarray[np.intp] """ cdef: - ndarray values, x + ndarray values ndarray[intp_t] result, missing - set stargets, remaining_stargets + set stargets, remaining_stargets, found_nas dict d = {} - object val, dt64nat, td64nat + object val Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end - bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True - d_has_dt64nat = False, stargets_has_dt64nat = False, - need_dt64nat_check = True, d_has_td64nat = False, - stargets_has_td64nat = False, need_td64nat_check = True, - need_np_nat_check = False + bint check_na_values = False values = self.values stargets = set(targets) @@ -365,84 +357,56 @@ cdef class IndexEngine: if stargets: # otherwise, map by iterating through all items in the index - # determine if we need to check for numpy nats - # ie. np.datetime64("NaT") np.timedelta64("NaT") + # short-circuting na check if values.dtype == object: - need_np_nat_check = True + check_na_values = True + # keep track of nas in values + found_nas = set() for i in range(n): val = values[i] + + # GH#43870 + # handle lookup for nas + # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat) + if check_na_values and checknull(val): + match = [na for na in found_nas if is_matching_na(val, na)] + + # matching na not found + if not len(match): + found_nas.add(val) + + # add na to stargets to utilize `in` for starget/d lookup + match_stargets = [ + x for x in stargets if is_matching_na(val, x) + ] + + if len(match_stargets): + # add our 'standardized' na + stargets.add(val) + + # matching na found + else: + assert len(match) == 1 + val = match[0] + if val in stargets: if val not in d: d[val] = [] d[val].append(i) - elif util.is_nan(val): - # GH#35392 - if need_nan_check: - # Do this check only once - stargets_has_nan = any(util.is_nan(val) for x in stargets) - need_nan_check = False - - if stargets_has_nan: - if not d_has_nan: - # use a canonical nan object - d[np.nan] = [] - d_has_nan = True - d[np.nan].append(i) - - elif need_np_nat_check: - if is_dt64nat(val): - if need_dt64nat_check: - # Do this check only once - stargets_has_dt64nat = any( - is_dt64nat(starget) for starget in stargets - ) - need_dt64nat_check = False - - if stargets_has_dt64nat: - if not d_has_dt64nat: - # store to ensure future access to `d` uses same key - dt64nat = np.datetime64("NaT") - d[dt64nat] = [] - d_has_dt64nat = True - d[dt64nat].append(i) - - elif is_td64nat(val): - if need_td64nat_check: - # Do this check only once - stargets_has_td64nat = any( - is_td64nat(starget) for starget in stargets - ) - need_td64nat_check = False - - if stargets_has_td64nat: - if not d_has_td64nat: - # store to ensure future access to `d` uses same key - td64nat = np.timedelta64("NaT") - d[td64nat] = [] - d_has_td64nat = True - d[td64nat].append(i) - for i in range(n_t): val = targets[i] + + # ensure there are nas in values before looking for a matching null + if check_na_values and checknull(val): + match = [na for na in found_nas if is_matching_na(val, na)] + assert len(match) == 1 + val = match[0] + # found - # cannot search for nan/nat target using `in`, - # need to lookup key using d_has_... - # and confirm na type via util function - if ( - val in d - or (d_has_nan and util.is_nan(val)) - or (d_has_dt64nat and is_dt64nat(val)) - or (d_has_td64nat and is_td64nat(val)) - ): + if val in d: key = val - if d_has_nan and util.is_nan(key): - key = np.nan - elif d_has_dt64nat and is_dt64nat(key): - key = dt64nat - elif d_has_td64nat and is_td64nat(key): - key = td64nat for j in d[key]: From 0d1e26097d435481f06da6021c20e1ea08dd93df Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sun, 17 Oct 2021 22:49:31 -0400 Subject: [PATCH 19/23] CLN: fixing typo in comment + check match target --- pandas/_libs/index.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c56db19348adc..59459cc781ddd 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -357,7 +357,7 @@ cdef class IndexEngine: if stargets: # otherwise, map by iterating through all items in the index - # short-circuting na check + # short-circuit na check if values.dtype == object: check_na_values = True # keep track of nas in values @@ -401,8 +401,9 @@ cdef class IndexEngine: # ensure there are nas in values before looking for a matching null if check_na_values and checknull(val): match = [na for na in found_nas if is_matching_na(val, na)] - assert len(match) == 1 - val = match[0] + if len(match): + assert len(match) == 1 + val = match[0] # found if val in d: From 802c2614000da1bf1905c20f5095a4b9cb4ef16d Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Mon, 18 Oct 2021 12:08:35 -0400 Subject: [PATCH 20/23] CLN: condensing np_nat_objects fixture --- pandas/_testing/__init__.py | 45 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index bf5e0821d17ca..e8283a222d86a 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -158,34 +158,23 @@ NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")] NP_NAT_OBJECTS = [ - np.datetime64("NaT"), - np.datetime64("NaT", "Y"), - np.datetime64("NaT", "M"), - np.datetime64("NaT", "W"), - np.datetime64("NaT", "D"), - np.datetime64("NaT", "h"), - np.datetime64("NaT", "m"), - np.datetime64("NaT", "s"), - np.datetime64("NaT", "ms"), - np.datetime64("NaT", "us"), - np.datetime64("NaT", "ns"), - np.datetime64("NaT", "ps"), - np.datetime64("NaT", "fs"), - np.datetime64("NaT", "as"), - np.timedelta64("NaT"), - np.timedelta64("NaT", "Y"), - np.timedelta64("NaT", "M"), - np.timedelta64("NaT", "W"), - np.timedelta64("NaT", "D"), - np.timedelta64("NaT", "h"), - np.timedelta64("NaT", "m"), - np.timedelta64("NaT", "s"), - np.timedelta64("NaT", "ms"), - np.timedelta64("NaT", "us"), - np.timedelta64("NaT", "ns"), - np.timedelta64("NaT", "ps"), - np.timedelta64("NaT", "fs"), - np.timedelta64("NaT", "as"), + cls("NaT", unit) + for cls in [np.datetime64, np.timedelta64] + for unit in [ + "Y", + "M", + "W", + "D", + "h", + "m", + "s", + "ms", + "us", + "ns", + "ps", + "fs", + "as", + ] ] EMPTY_STRING_PATTERN = re.compile("^$") From 14e0868636c273ae5390feece8697a7324c1887b Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Mon, 18 Oct 2021 12:11:25 -0400 Subject: [PATCH 21/23] CLN: fixing comment typos --- pandas/_libs/index.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 59459cc781ddd..92837a43e2b69 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -376,7 +376,7 @@ cdef class IndexEngine: if not len(match): found_nas.add(val) - # add na to stargets to utilize `in` for starget/d lookup + # add na to stargets to utilize `in` for stargets/d lookup match_stargets = [ x for x in stargets if is_matching_na(val, x) ] @@ -398,7 +398,7 @@ cdef class IndexEngine: for i in range(n_t): val = targets[i] - # ensure there are nas in values before looking for a matching null + # ensure there are nas in values before looking for a matching na if check_na_values and checknull(val): match = [na for na in found_nas if is_matching_na(val, na)] if len(match): From 14e5c0dbfe68b608d6f938ea65be4e7689041201 Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Mon, 18 Oct 2021 12:15:22 -0400 Subject: [PATCH 22/23] TST: added matching-but-not-identical for Decimal(NaN) --- pandas/tests/indexes/object/test_indexing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index ab65936efce83..38bd96921b991 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -1,3 +1,5 @@ +from decimal import Decimal + import numpy as np import pytest @@ -90,6 +92,14 @@ def test_get_indexer_non_unique_nas(self, nulls_fixture): # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): index = Index(["a", float("NaN"), "b", float("NaN")]) + match_but_not_identical = True + elif is_matching_na(nulls_fixture, Decimal("NaN")): + index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) + match_but_not_identical = True + else: + match_but_not_identical = False + + if match_but_not_identical: indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) From 54aa23d2e730c529cf660780a7a6eefc32e004aa Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Mon, 18 Oct 2021 12:25:23 -0400 Subject: [PATCH 23/23] TST: edge case with np.nan before float(NaN) while searching for np.nan --- pandas/tests/indexes/test_indexing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index ff2cd76ab6377..0a001008c2f1b 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -320,6 +320,11 @@ def test_maybe_cast_slice_bound_kind_deprecated(index): np.array([1, 2], dtype=np.intp), ), (["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)), + ( + np.array(["b", np.nan, float("NaN"), "b"], dtype=object), + Index([np.nan], dtype=object), + np.array([1, 2], dtype=np.intp), + ), ], ) def test_get_indexer_non_unique_multiple_nans(idx, target, expected):