diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index 9c29c34adb7dd..db39a1c558b97 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -88,7 +88,8 @@ Indexing - Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) - Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, issue:`21253`) - Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) -- +- Bug in :func:`CategoricalIndex.searchsorted` where the method did not return a scalar when the input value was a scalar (:issue:`21019`) +- Bug in :class:`CategoricalIndex` where slicing beyond the range of the data raised a ``KeyError`` (:issue:`21019`) I/O ^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 30f9c56d24f02..8f670d21d9c44 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1342,6 +1342,8 @@ def searchsorted(self, value, side='left', sorter=None): if -1 in values_as_codes: raise ValueError("Value(s) to be inserted must be in categories.") + if is_scalar(value): + values_as_codes = values_as_codes.item() return self.codes.searchsorted(values_as_codes, side=side, sorter=sorter) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 150eca32e229d..d0d5f3e9de971 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -432,13 +432,14 @@ def get_loc(self, key, method=None): >>> monotonic_index.get_loc('b') slice(1, 3, None) - >>> non_monotonic_index = p.dCategoricalIndex(list('abcb')) + >>> non_monotonic_index = pd.CategoricalIndex(list('abcb')) >>> non_monotonic_index.get_loc('b') 
array([False, True, False, True], dtype=bool) """ codes = self.categories.get_loc(key) if (codes == -1): raise KeyError(key) + return self._engine.get_loc(codes) def get_value(self, series, key): diff --git a/pandas/tests/categorical/test_analytics.py b/pandas/tests/categorical/test_analytics.py index 53d0e596a1d99..ab8d2f30f545a 100644 --- a/pandas/tests/categorical/test_analytics.py +++ b/pandas/tests/categorical/test_analytics.py @@ -86,9 +86,9 @@ def test_searchsorted(self): # Searching for single item argument, side='left' (default) res_cat = c1.searchsorted('apple') res_ser = s1.searchsorted('apple') - exp = np.array([2], dtype=np.intp) - tm.assert_numpy_array_equal(res_cat, exp) - tm.assert_numpy_array_equal(res_ser, exp) + exp = np.intp(2) + assert res_cat == exp + assert res_ser == exp # Searching for single item array, side='left' (default) res_cat = c1.searchsorted(['bread']) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 634ad0d8160ed..9f745700049ae 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -627,15 +627,80 @@ def test_reindexing(self): lambda: self.df2.reindex(['a'], limit=2)) def test_loc_slice(self): - # slicing - # not implemented ATM - # GH9748 + df = DataFrame( + {"A": range(0, 6)}, + index=CategoricalIndex(list("aabcde"), name="B"), + ) + + # slice on an unordered categorical using in-sample, connected edges + result = df.loc["b":"d"] + expected = df.iloc[2:5] + assert_frame_equal(result, expected) - pytest.raises(TypeError, lambda: self.df.loc[1:5]) + # Slice the entire dataframe + result = df.loc["a":"e"] + assert_frame_equal(result, df) + result_iloc = df.iloc[0:6] + assert_frame_equal(result_iloc, result) + + # check if the result is identical to an ordinary index + df_non_cat_index = df.copy() + df_non_cat_index.index = df_non_cat_index.index.astype(str) + result = df.loc["a":"e"] + result_non_cat = 
df_non_cat_index.loc["a": "e"] + result.index = result.index.astype(str) + assert_frame_equal(result_non_cat, result) + + @pytest.mark.parametrize( + "content", + [list("aab"), list("bbc"), list('bbc')], + ids=["right_edge", "left_edge", "both_edges"], + ) + def test_loc_beyond_edge_slicing(self, content): + """ + This test ensures that no `KeyError` is raised if trying to slice + beyond the edges of known, ordered categories. + + see GH21019 + """ + # This dataframe might be a slice of a larger categorical + # (i.e. more categories are known than there are in the column) + + ordered_df = DataFrame( + {"A": range(0, 3)}, + index=CategoricalIndex( + content, categories=list("abcde"), name="B", ordered=True + ), + ) + + # Although the edge is not within the slice, this should fall back + # to searchsorted slicing since the category is known and the index + # is ordered. Since we're selecting a value larger/lower than the + # right/left edge we should get the original slice again. + result = ordered_df.loc["a": "d"] + assert_frame_equal(result, ordered_df) + + # Ensure that index based slicing gives the same result + result_iloc = ordered_df.iloc[0:4] + assert_frame_equal(result, result_iloc) + + # If the categorical is not sorted and the requested edge + # is not in the slice we cannot perform slicing + ordered_df.index = ordered_df.index.as_unordered() + with pytest.raises(KeyError): + ordered_df.loc["a": "d"] - # result = df.loc[1:5] - # expected = df.iloc[[1,2,3,4]] - # assert_frame_equal(result, expected) + with pytest.raises(KeyError): + # If the category is not known, there is nothing we can do + ordered_df.loc["a":"z"] + + unordered_df = ordered_df.copy() + unordered_df.index = unordered_df.index.as_unordered() + with pytest.raises(KeyError): + # This operation previously succeeded for an ordered index. 
Since + # this index is no longer ordered, we cannot perform out of range + # slicing / searchsorted + unordered_df.loc["a": "d"] def test_boolean_selection(self):