From bd3d440c6c0047141c802551c93184e7c531df7e Mon Sep 17 00:00:00 2001 From: fjetter Date: Sun, 13 May 2018 11:35:14 +0200 Subject: [PATCH 1/6] categorical: searchsorted returns a scalar if input was scalar --- doc/source/whatsnew/v0.23.0.txt | 3 ++ pandas/core/arrays/categorical.py | 2 + pandas/core/indexes/category.py | 9 ++-- pandas/tests/categorical/test_analytics.py | 6 +-- pandas/tests/indexing/test_categorical.py | 51 +++++++++++++++++++--- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index a099fb40c35a7..85bad54a5cdb8 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1289,6 +1289,9 @@ Indexing - Bug in performing in-place operations on a ``DataFrame`` with a duplicate ``Index`` (:issue:`17105`) - Bug in :meth:`IntervalIndex.get_loc` and :meth:`IntervalIndex.get_indexer` when used with an :class:`IntervalIndex` containing a single interval (:issue:`17284`, :issue:`20921`) - Bug in ``.loc`` with a ``uint64`` indexer (:issue:`20722`) +- Bug in ``CategoricalIndex.searchsorted`` where the method didn't return a scalar when the input values was scalar (:issue:`21019`) +- Bug in ``CategoricalIndex`` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) + MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 30f9c56d24f02..075f13f1bf9fb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1342,6 +1342,8 @@ def searchsorted(self, value, side='left', sorter=None): if -1 in values_as_codes: raise ValueError("Value(s) to be inserted must be in categories.") + if is_scalar(value): + values_as_codes = np.asscalar(values_as_codes) return self.codes.searchsorted(values_as_codes, side=side, sorter=sorter) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..f99f1ae32b4ef 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -432,13 +432,14 @@ def get_loc(self, key, method=None): >>> monotonic_index.get_loc('b') slice(1, 3, None) - >>> non_monotonic_index = p.dCategoricalIndex(list('abcb')) + >>> non_monotonic_index = pd.CategoricalIndex(list('abcb')) >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ - codes = self.categories.get_loc(key) - if (codes == -1): - raise KeyError(key) + try: + codes = self.categories.get_loc(key) + except KeyError: + raise KeyError("Category `{}` unknown".format(key)) return self._engine.get_loc(codes) def get_value(self, series, key): diff --git a/pandas/tests/categorical/test_analytics.py b/pandas/tests/categorical/test_analytics.py index 53d0e596a1d99..6557b98c467b7 100644 --- a/pandas/tests/categorical/test_analytics.py +++ b/pandas/tests/categorical/test_analytics.py @@ -86,9 +86,9 @@ def test_searchsorted(self): # Searching for single item argument, side='left' (default) res_cat = c1.searchsorted('apple') res_ser = s1.searchsorted('apple') - exp = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(res_cat, exp) - tm.assert_numpy_array_equal(res_ser, exp) + exp = np.int64(2) + assert res_cat == exp + assert res_ser == exp # Searching for single item array, side='left' (default) res_cat = c1.searchsorted(['bread']) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 634ad0d8160ed..de50f88bdd6c4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -627,15 +627,52 @@ def test_reindexing(self): lambda: self.df2.reindex(['a'], limit=2)) def test_loc_slice(self): - # slicing - # not implemented ATM - # GH9748 + # Raises KeyError since the left slice 'a' is not unique + pytest.raises(KeyError, lambda: self.df.loc["a":"b"]) + result = self.df.loc["b":"c"] - pytest.raises(TypeError, lambda: self.df.loc[1:5]) + expected = DataFrame( + {"A": [2, 3, 4]}, + index=CategoricalIndex( + ["b", "b", "c"], name="B", categories=list("cab") + ), + ) + + assert_frame_equal(result, expected) + + ordered_df = DataFrame( + {"A": range(0, 6)}, + index=CategoricalIndex(list("aabcde"), name="B", ordered=True), + ) + + result = ordered_df.loc["a":"b"] + expected = DataFrame( + {"A": range(0, 3)}, + index=CategoricalIndex( + list("aab"), categories=list("abcde"), name="B", ordered=True + ), + ) + assert_frame_equal(result, expected) + + # This should select the entire dataframe + result = ordered_df.loc["a":"e"] + assert_frame_equal(result, ordered_df) + + df_slice = ordered_df.loc["a":"b"] + # Although the edge is not within the slice, this should fall back + # to searchsorted slicing since the category is known + result = df_slice.loc["a":"e"] + assert_frame_equal(result, df_slice) - # result = df.loc[1:5] - # expected = df.iloc[[1,2,3,4]] - # assert_frame_equal(result, expected) + # If the categorical is not sorted and the requested edge + # is not in the slice we cannot perform slicing + df_slice.index = df_slice.index.as_unordered() + with pytest.raises(KeyError): + df_slice.loc["a":"e"] + + with pytest.raises(KeyError): + # If the category is not known, there is nothing we can do + ordered_df.loc["a":"z"] def test_boolean_selection(self): From c4249fadb45537c3790f069bb67587c4d388bfff Mon Sep 17 00:00:00 2001 From: fjetter Date: Sun, 13 May 2018 16:47:51 +0200 Subject: [PATCH 2/6] Address review comments --- doc/source/whatsnew/v0.23.0.txt | 4 ++-- pandas/core/indexes/category.py | 8 ++++---- pandas/tests/categorical/test_analytics.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 85bad54a5cdb8..912e12a1191ca 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1289,8 +1289,8 @@ Indexing - Bug in performing in-place operations on a ``DataFrame`` with a duplicate ``Index`` (:issue:`17105`) - Bug in :meth:`IntervalIndex.get_loc` and :meth:`IntervalIndex.get_indexer` when used with an :class:`IntervalIndex` containing a single interval (:issue:`17284`, :issue:`20921`) - Bug in ``.loc`` with a ``uint64`` indexer (:issue:`20722`) -- Bug in ``CategoricalIndex.searchsorted`` where the method didn't return a scalar when the input values was scalar (:issue:`21019`) -- Bug in ``CategoricalIndex`` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) +- Bug in :func:`CategoricalIndex.searchsorted` where the method did not return a scalar when the input values was scalar (:issue:`21019`) +- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) MultiIndex diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f99f1ae32b4ef..d0d5f3e9de971 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -436,10 +436,10 @@ def get_loc(self, key, method=None): >>> non_monotonic_index.get_loc('b') array([False, True, False, True], dtype=bool) """ - try: - codes = self.categories.get_loc(key) - except KeyError: - raise KeyError("Category `{}` unknown".format(key)) + codes = self.categories.get_loc(key) + if (codes == -1): + raise KeyError(key) + return self._engine.get_loc(codes) def get_value(self, series, key): diff --git a/pandas/tests/categorical/test_analytics.py b/pandas/tests/categorical/test_analytics.py index 6557b98c467b7..ab8d2f30f545a 100644 --- a/pandas/tests/categorical/test_analytics.py +++ b/pandas/tests/categorical/test_analytics.py @@ -86,7 +86,7 @@ def test_searchsorted(self): # Searching for single item argument, side='left' (default) res_cat = c1.searchsorted('apple') res_ser = s1.searchsorted('apple') - exp = np.int64(2) + exp = np.intp(2) assert res_cat == exp assert res_ser == exp From 1c25d659610a874364ce846fdba6cde163818e12 Mon Sep 17 00:00:00 2001 From: fjetter Date: Fri, 25 May 2018 08:48:53 +0200 Subject: [PATCH 3/6] Use item instead of np.asscalar to select value --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 075f13f1bf9fb..8f670d21d9c44 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1343,7 +1343,7 @@ def searchsorted(self, value, side='left', sorter=None): if -1 in values_as_codes: raise ValueError("Value(s) to be inserted must be in categories.") if is_scalar(value): - values_as_codes = np.asscalar(values_as_codes) + values_as_codes = values_as_codes.item() return self.codes.searchsorted(values_as_codes, side=side, sorter=sorter) From d4e9879fe2594a47ef7fc6943a5b3fa310eb50b2 Mon Sep 17 00:00:00 2001 From: fjetter Date: Fri, 25 May 2018 08:49:28 +0200 Subject: [PATCH 4/6] refactor categorical loc slicing tests --- pandas/tests/indexing/test_categorical.py | 45 ++++++++++++++++++----- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index de50f88bdd6c4..695fd82c6f07c 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -645,6 +645,12 @@ def test_loc_slice(self): index=CategoricalIndex(list("aabcde"), name="B", ordered=True), ) + # This should select the entire dataframe + result = ordered_df.loc["a":"e"] + assert_frame_equal(result, ordered_df) + result_iloc = ordered_df.iloc[0:6] + assert_frame_equal(result_iloc, result) + result = ordered_df.loc["a":"b"] expected = DataFrame( {"A": range(0, 3)}, @@ -654,21 +660,42 @@ def test_loc_slice(self): ) assert_frame_equal(result, expected) - # This should select the entire dataframe - result = ordered_df.loc["a":"e"] - assert_frame_equal(result, ordered_df) + @pytest.mark.parametrize( + "content", + [list("aab"), list("bbc"), list('bbc')], + ids=["right_edge", "left_edge", "both_edges"], + ) + def test_loc_beyond_edge_slicing(self, content): + """ + This test ensures that no `KeyError` is raised if trying to slice + beyond the edges of known, ordered categories. + """ + # This dataframe might be a slice of a larger categorical + # (i.e. more categories are known than there are in the column) + + ordered_df = DataFrame( + {"A": range(0, 3)}, + index=CategoricalIndex( + content, categories=list("abcde"), name="B", ordered=True + ), + ) - df_slice = ordered_df.loc["a":"b"] # Although the edge is not within the slice, this should fall back - # to searchsorted slicing since the category is known - result = df_slice.loc["a":"e"] - assert_frame_equal(result, df_slice) + # to searchsorted slicing since the category is known and the index + # is ordered. Since we're selecting a value larger/lower than the + # right/left edge we should get the original slice again. + result = ordered_df.loc["a": "d"] + assert_frame_equal(result, ordered_df) + + # Ensure that index based slicing gives the same result + result_iloc = ordered_df.iloc[0:4] + assert_frame_equal(result, result_iloc) # If the categorical is not sorted and the requested edge # is not in the slice we cannot perform slicing - df_slice.index = df_slice.index.as_unordered() + ordered_df.index = ordered_df.index.as_unordered() with pytest.raises(KeyError): - df_slice.loc["a":"e"] + ordered_df.loc["a": "d"] with pytest.raises(KeyError): # If the category is not known, there is nothing we can do From 25b5fd72e2dd4b5a42c5d8b6e447b23459b64f74 Mon Sep 17 00:00:00 2001 From: fjetter Date: Fri, 25 May 2018 08:55:21 +0200 Subject: [PATCH 5/6] Changelog entry --- doc/source/whatsnew/v0.23.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 9c29c34adb7dd..2e7a2f25833ae 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -88,6 +88,8 @@ Indexing - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) +- Bug in :func:`CategoricalIndex.searchsorted` where the method did not return a scalar when the input values was scalar (:issue:`21019`) +- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) - I/O From 04ca52f01c51fc009ddee75ee79f3d87b9435b95 Mon Sep 17 00:00:00 2001 From: fjetter Date: Wed, 6 Jun 2018 19:55:42 +0200 Subject: [PATCH 6/6] Address PR comments --- doc/source/whatsnew/v0.23.0.txt | 3 -- doc/source/whatsnew/v0.23.1.txt | 3 +- pandas/tests/indexing/test_categorical.py | 53 ++++++++++++----------- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 912e12a1191ca..a099fb40c35a7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1289,9 +1289,6 @@ Indexing - Bug in performing in-place operations on a ``DataFrame`` with a duplicate ``Index`` (:issue:`17105`) - Bug in :meth:`IntervalIndex.get_loc` and :meth:`IntervalIndex.get_indexer` when used with an :class:`IntervalIndex` containing a single interval (:issue:`17284`, :issue:`20921`) - Bug in ``.loc`` with a ``uint64`` indexer (:issue:`20722`) -- Bug in :func:`CategoricalIndex.searchsorted` where the method did not return a scalar when the input values was scalar (:issue:`21019`) -- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) - MultiIndex ^^^^^^^^^^ diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 2e7a2f25833ae..db39a1c558b97 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -89,8 +89,7 @@ Indexing - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) - Bug in :func:`CategoricalIndex.searchsorted` where the method did not return a scalar when the input values was scalar (:issue:`21019`) -- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a KeyError (:issue:`21019`) -- +- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a ``KeyError`` (:issue:`21019`) I/O ^^^ diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 695fd82c6f07c..9f745700049ae 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -627,38 +627,29 @@ def test_reindexing(self): lambda: self.df2.reindex(['a'], limit=2)) def test_loc_slice(self): - # Raises KeyError since the left slice 'a' is not unique - pytest.raises(KeyError, lambda: self.df.loc["a":"b"]) - result = self.df.loc["b":"c"] - - expected = DataFrame( - {"A": [2, 3, 4]}, - index=CategoricalIndex( - ["b", "b", "c"], name="B", categories=list("cab") - ), + df = DataFrame( + {"A": range(0, 6)}, + index=CategoricalIndex(list("aabcde"), name="B"), ) + # slice on an unordered categorical using in-sample, connected edges + result = df.loc["b":"d"] + expected = df.iloc[2:5] assert_frame_equal(result, expected) - ordered_df = DataFrame( - {"A": range(0, 6)}, - index=CategoricalIndex(list("aabcde"), name="B", ordered=True), - ) - - # This should select the entire dataframe - result = ordered_df.loc["a":"e"] - assert_frame_equal(result, ordered_df) - result_iloc = ordered_df.iloc[0:6] + # Slice the entire dataframe + result = df.loc["a":"e"] + assert_frame_equal(result, df) + result_iloc = df.iloc[0:6] assert_frame_equal(result_iloc, result) - result = ordered_df.loc["a":"b"] - expected = DataFrame( - {"A": range(0, 3)}, - index=CategoricalIndex( - list("aab"), categories=list("abcde"), name="B", ordered=True - ), - ) - assert_frame_equal(result, expected) + # check if the result is identical to an ordinary index + df_non_cat_index = df.copy() + df_non_cat_index.index = df_non_cat_index.index.astype(str) + result = df.loc["a":"e"] + result_non_cat = df_non_cat_index.loc["a": "e"] + result.index = result.index.astype(str) + assert_frame_equal(result_non_cat, result) @pytest.mark.parametrize( "content", @@ -669,6 +660,8 @@ def test_loc_beyond_edge_slicing(self, content): """ This test ensures that no `KeyError` is raised if trying to slice beyond the edges of known, ordered categories. + + see GH21019 """ # This dataframe might be a slice of a larger categorical # (i.e. more categories are known than there are in the column) @@ -701,6 +694,14 @@ def test_loc_beyond_edge_slicing(self, content): # If the category is not known, there is nothing we can do ordered_df.loc["a":"z"] + unordered_df = ordered_df.copy() + unordered_df.index = unordered_df.index.as_unordered() + with pytest.raises(KeyError): + # This operation previously succeeded for an ordered index. Since + # this index is no longer ordered, we cannot perfom out of range + # slicing / searchsorted + unordered_df.loc["a": "d"] + def test_boolean_selection(self): df3 = self.df3