-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: CategoricalIndex.searchsorted doesn't return a scalar if input was scalar #21019
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bd3d440
c4249fa
1c25d65
d4e9879
25b5fd7
04ca52f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1342,6 +1342,8 @@ def searchsorted(self, value, side='left', sorter=None): | |
|
||
if -1 in values_as_codes: | ||
raise ValueError("Value(s) to be inserted must be in categories.") | ||
if is_scalar(value): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would rather do this in use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok I c, change here is ok There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I found out in #21699, numpy.searchsorted doesn't like python ints, but needs numpy ints to achieve its speed. >>> n = 1_000_000
>>> c = pd.Categorical(list('a' * n + 'b' * n + 'c' * n), ordered=True)
>>> %timeit c.codes.searchsorted(1) # python int
7 ms ± 24.7 µs per loop
>>> c.codes.dtype
int8
>>> %timeit c.codes.searchsorted(np.int8(1))
2.46 µs ± 82.4 ns per loop So the scalar version should be |
||
values_as_codes = values_as_codes.item() | ||
|
||
return self.codes.searchsorted(values_as_codes, side=side, | ||
sorter=sorter) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -627,15 +627,80 @@ def test_reindexing(self): | |
lambda: self.df2.reindex(['a'], limit=2)) | ||
|
||
def test_loc_slice(self): | ||
# slicing | ||
# not implemented ATM | ||
# GH9748 | ||
df = DataFrame( | ||
{"A": range(0, 6)}, | ||
index=CategoricalIndex(list("aabcde"), name="B"), | ||
) | ||
|
||
# slice on an unordered categorical using in-sample, connected edges | ||
result = df.loc["b":"d"] | ||
expected = df.iloc[2:5] | ||
assert_frame_equal(result, expected) | ||
|
||
pytest.raises(TypeError, lambda: self.df.loc[1:5]) | ||
# Slice the entire dataframe | ||
result = df.loc["a":"e"] | ||
assert_frame_equal(result, df) | ||
result_iloc = df.iloc[0:6] | ||
assert_frame_equal(result_iloc, result) | ||
|
||
# check if the result is identical to an ordinary index | ||
df_non_cat_index = df.copy() | ||
df_non_cat_index.index = df_non_cat_index.index.astype(str) | ||
result = df.loc["a":"e"] | ||
result_non_cat = df_non_cat_index.loc["a": "e"] | ||
result.index = result.index.astype(str) | ||
assert_frame_equal(result_non_cat, result) | ||
|
||
@pytest.mark.parametrize( | ||
"content", | ||
[list("aab"), list("bbc"), list('bbc')], | ||
ids=["right_edge", "left_edge", "both_edges"], | ||
) | ||
def test_loc_beyond_edge_slicing(self, content): | ||
""" | ||
This test ensures that no `KeyError` is raised if trying to slice | ||
beyond the edges of known, ordered categories. | ||
|
||
see GH21019 | ||
""" | ||
# This dataframe might be a slice of a larger categorical | ||
# (i.e. more categories are known than there are in the column) | ||
|
||
ordered_df = DataFrame( | ||
{"A": range(0, 3)}, | ||
index=CategoricalIndex( | ||
content, categories=list("abcde"), name="B", ordered=True | ||
), | ||
) | ||
|
||
# Although the edge is not within the slice, this should fall back | ||
# to searchsorted slicing since the category is known and the index | ||
# is ordered. Since we're selecting a value larger/lower than the | ||
# right/left edge we should get the original slice again. | ||
result = ordered_df.loc["a": "d"] | ||
assert_frame_equal(result, ordered_df) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you also test the left edge as well |
||
# Ensure that index based slicing gives the same result | ||
result_iloc = ordered_df.iloc[0:4] | ||
assert_frame_equal(result, result_iloc) | ||
|
||
# If the categorical is not sorted and the requested edge | ||
# is not in the slice we cannot perform slicing | ||
ordered_df.index = ordered_df.index.as_unordered() | ||
with pytest.raises(KeyError): | ||
ordered_df.loc["a": "d"] | ||
|
||
# result = df.loc[1:5] | ||
# expected = df.iloc[[1,2,3,4]] | ||
# assert_frame_equal(result, expected) | ||
with pytest.raises(KeyError): | ||
# If the category is not known, there is nothing we can do | ||
ordered_df.loc["a":"z"] | ||
|
||
unordered_df = ordered_df.copy() | ||
unordered_df.index = unordered_df.index.as_unordered() | ||
with pytest.raises(KeyError): | ||
# This operation previously succeeded for an ordered index. Since | ||
# this index is no longer ordered, we cannot perform out of range | ||
# slicing / searchsorted | ||
unordered_df.loc["a": "d"] | ||
|
||
def test_boolean_selection(self): | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you move to 0.23.2