From b66aed759c437322534462101c41c097b7e37f80 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 20 Jul 2024 11:06:34 +0200 Subject: [PATCH 01/13] cast all tuple subclass index keys to tuple --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/indexes/base.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c5c886912eae0..97f4fc4693cdf 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,6 +33,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). +- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as a keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5bffac5fa64b6..5a774867ff5b4 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3568,6 +3568,8 @@ def get_loc(self, key): array([False, True, False, True]) """ casted_key = self._maybe_cast_indexer(key) + if isinstance(casted_key, tuple): + casted_key = tuple(casted_key) try: return self._engine.get_loc(casted_key) except KeyError as err: From 4f191e919a83debea652d0054a449f082b398ab7 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 20 Jul 2024 11:08:16 +0200 Subject: [PATCH 02/13] fix docs typo --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 97f4fc4693cdf..0411cb96d592e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -33,7 +33,7 @@ Other enhancements - :func:`DataFrame.to_excel` now raises an ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). -- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as a keys (:issue:`57922`) +- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) From b5fdd7b45b4422aa40560ef36cc0dd463ab7d6a3 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sat, 20 Jul 2024 15:06:20 +0200 Subject: [PATCH 03/13] add multi-index namedtuple test --- pandas/tests/indexes/multi/test_indexing.py | 24 +++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index f08a7625e7f8a..d82203a53a60f 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -1,3 +1,4 @@ +from collections import namedtuple from datetime import timedelta import re @@ -1006,3 +1007,26 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture): result = idx1.get_indexer(idx2) expected = np.array([-1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + + +def test_get_loc_namedtuple_behaves_like_tuple(): + # GH57922 + NamedIndex = namedtuple("NamedIndex", ("a", "b")) + multi_idx = MultiIndex.from_tuples( + [NamedIndex("i1", "i2"), NamedIndex("i3", "i4"), NamedIndex("i5", "i6")] + ) + for idx in (multi_idx, multi_idx.to_flat_index()): + assert idx.get_loc(NamedIndex("i1", "i2")) == 0 + assert idx.get_loc(NamedIndex("i3", "i4")) == 1 + assert idx.get_loc(NamedIndex("i5", "i6")) == 2 + assert idx.get_loc(("i1", "i2")) == 0 + assert idx.get_loc(("i3", "i4")) == 1 + assert idx.get_loc(("i5", "i6")) == 2 + multi_idx = MultiIndex.from_tuples([("i1", "i2"), ("i3", "i4"), ("i5", "i6")]) + for idx in (multi_idx, multi_idx.to_flat_index()): + assert idx.get_loc(NamedIndex("i1", "i2")) == 0 + assert idx.get_loc(NamedIndex("i3", "i4")) == 1 + assert idx.get_loc(NamedIndex("i5", "i6")) == 2 + assert idx.get_loc(("i1", "i2")) == 0 + assert idx.get_loc(("i3", "i4")) == 1 + assert idx.get_loc(("i5", "i6")) == 2 From af115209d174bf9f833ba832317b64f8ff53de18 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sun, 21 Jul 2024 14:17:57 +0200 Subject: [PATCH 04/13] hash and compare all tuple subclasses as tuples --- pandas/_libs/include/pandas/vendored/klib/khash_python.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 8d4c382241d39..40ce2622fec66 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -207,7 +207,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { if (PyComplex_CheckExact(a)) { return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); } - if (PyTuple_CheckExact(a)) { + if (PyTuple_Check(a)) { return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } // frozenset isn't yet supported @@ -311,7 +311,7 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { // because complex(k,0) == k holds for any int-object k // and kh_complex128_hash_func doesn't respect it hash = complexobject_hash((PyComplexObject *)key); - } else if (PyTuple_CheckExact(key)) { + } else if (PyTuple_Check(key)) { hash = tupleobject_hash((PyTupleObject *)key); } else { hash = PyObject_Hash(key); From 68cabf313b2de4a57ca7bc7e47a192db6eca6356 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sun, 21 Jul 2024 14:18:29 +0200 Subject: [PATCH 05/13] test hashtable with namedtuples --- pandas/tests/libs/test_hashtable.py | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index b70386191d9d9..db127851d8ac7 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -1,3 +1,4 @@ +from collections import namedtuple from collections.abc import Generator from contextlib import contextmanager import re @@ -440,6 +441,27 @@ def test_nan_in_nested_tuple(self): table.get_item(other) assert str(error.value) == str(other) + def test_nan_in_namedtuple(self): + T = namedtuple("T", ["x"]) + nan1 = T(float("nan")) + nan2 = T(float("nan")) + assert nan1.x is not nan2.x + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_in_nested_namedtuple(self): + T = namedtuple("T", ["x", "y"]) + nan1 = T(1, (2, (float("nan"),))) + nan2 = T(1, (2, (float("nan"),))) + other = T(1, 2) + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + def test_hash_equal_tuple_with_nans(): a = (float("nan"), (float("nan"), float("nan"))) @@ -448,6 +470,14 @@ def test_hash_equal_tuple_with_nans(): assert ht.objects_are_equal(a, b) +def test_hash_equal_namedtuple_with_nans(): + T = namedtuple("T", ["x", "y"]) + a = T(float("nan"), (float("nan"), float("nan"))) + b = T(float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) From e800139714531a48de5ed8737b902e8125853ec9 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sun, 21 Jul 2024 14:22:53 +0200 Subject: [PATCH 06/13] remove redundant index key conversion --- pandas/core/indexes/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5a774867ff5b4..5bffac5fa64b6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3568,8 +3568,6 @@ def get_loc(self, key): array([False, True, False, True]) """ casted_key = self._maybe_cast_indexer(key) - if isinstance(casted_key, tuple): - casted_key = tuple(casted_key) try: return self._engine.get_loc(casted_key) except KeyError as err: From 54009c324d5a10ab26e0820ac07315c96d75d07c Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sun, 21 Jul 2024 19:45:08 +0200 Subject: [PATCH 07/13] add comments --- pandas/_libs/include/pandas/vendored/klib/khash_python.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h index 40ce2622fec66..2fa61642968cf 100644 --- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h +++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h @@ -208,6 +208,7 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) { return complexobject_cmp((PyComplexObject *)a, (PyComplexObject *)b); } if (PyTuple_Check(a)) { + // compare tuple subclasses as builtin tuples return tupleobject_cmp((PyTupleObject *)a, (PyTupleObject *)b); } // frozenset isn't yet supported @@ -312,6 +313,7 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) { // and kh_complex128_hash_func doesn't respect it hash = complexobject_hash((PyComplexObject *)key); } else if (PyTuple_Check(key)) { + // hash tuple subclasses as builtin tuples hash = tupleobject_hash((PyTupleObject *)key); } else { hash = PyObject_Hash(key); From 190ac69a8305113e3be1bdab5c41665432382fd6 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Sun, 21 Jul 2024 20:02:09 +0200 Subject: [PATCH 08/13] update whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0411cb96d592e..74a82bffb5e27 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,6 +35,7 @@ Other enhancements - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) +- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` (:issue:`57922`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) From 8ad9e4c61c693127de16c0e936a368c412cba746 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 22 Jul 2024 20:05:18 +0200 Subject: [PATCH 09/13] check key error message --- pandas/tests/libs/test_hashtable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index db127851d8ac7..710faad099c9b 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -458,7 +458,7 @@ def test_nan_in_nested_namedtuple(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))) as error: table.get_item(other) assert str(error.value) == str(other) From 70d6511c70d118368ef306b83ac921f1e7ef7a72 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 22 Jul 2024 20:06:10 +0200 Subject: [PATCH 10/13] fix whatsnew section --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9afa41097a9d2..78bed1b815864 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -35,7 +35,6 @@ Other enhancements - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) -- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` (:issue:`57922`) - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`) - Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`) - Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`) @@ -233,6 +232,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) +- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` (:issue:`57922`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) From 2ef52ebf296d91fc1381a9b43af48391180c1d60 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 22 Jul 2024 20:36:51 +0200 Subject: [PATCH 11/13] test namedtuple and tuple interchangeable in hashtable --- pandas/tests/libs/test_hashtable.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 710faad099c9b..223c4e3d2c8ab 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -478,6 +478,14 @@ def test_hash_equal_namedtuple_with_nans(): assert ht.objects_are_equal(a, b) +def test_hash_equal_namedtuple_and_tuple(): + T = namedtuple("T", ["x", "y"]) + a = T(1, (2, 3)) + b = (1, (2, 3)) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) From d5965029ded7578273d14fc4980f17b90e7ae732 Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Mon, 22 Jul 2024 22:30:14 +0200 Subject: [PATCH 12/13] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 78bed1b815864..d940d564b8df2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -232,7 +232,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`) - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`) -- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` (:issue:`57922`) +- All classes inheriting from builtin ``tuple`` (including types created with :func:`collections.namedtuple`) are now hashed and compared as builtin ``tuple`` during indexing operations (:issue:`57922`) - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) From 1554eb749c4aefc3957f57b969d8f95a9c16c086 Mon Sep 17 00:00:00 2001 From: Matias Lindgren Date: Mon, 22 Jul 2024 22:29:50 +0200 Subject: [PATCH 13/13] use pytest.raises regexp instead of str eq --- pandas/tests/libs/test_hashtable.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index 223c4e3d2c8ab..50b561aefcf49 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -406,9 +406,8 @@ def test_nan_complex_real(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_complex_imag(self): nan1 = complex(1, float("nan")) @@ -418,9 +417,8 @@ def test_nan_complex_imag(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_in_tuple(self): nan1 = (float("nan"),) @@ -437,9 +435,8 @@ def test_nan_in_nested_tuple(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=None) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_nan_in_namedtuple(self): T = namedtuple("T", ["x"]) @@ -458,9 +455,8 @@ def test_nan_in_nested_namedtuple(self): table = ht.PyObjectHashTable() table.set_item(nan1, 42) assert table.get_item(nan2) == 42 - with pytest.raises(KeyError, match=re.escape(repr(other))) as error: + with pytest.raises(KeyError, match=re.escape(repr(other))): table.get_item(other) - assert str(error.value) == str(other) def test_hash_equal_tuple_with_nans():