-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: IntervalIndex.get_loc/get_indexer wrong return value / error #25090
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
2aca389
2d12d2f
f357101
7cc4c37
8ec653a
878802e
4dabe0e
564d88d
f4c43e3
a09a07e
6c887e6
0a143f2
0730cd6
246eb57
93f75ea
268db81
a5aa1e8
d480872
6ed1080
120e2bc
2c48272
02127ff
ad13d9e
0ff356c
9f6b5c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -767,8 +767,13 @@ def get_loc(self, key, method=None): | |
key = Interval(left, right, key.closed) | ||
else: | ||
key = self._maybe_cast_slice_bound(key, 'left', None) | ||
|
||
start, stop = self._find_non_overlapping_monotonic_bounds(key) | ||
try: | ||
start, stop = self._find_non_overlapping_monotonic_bounds(key) | ||
except TypeError: | ||
# get_loc should raise KeyError | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add here a comment as Tom proposed ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can now use raise from (as we are PY3 only) |
||
# TODO(py3): use raise from. | ||
raise KeyError('Key {!r} is hashable but of incorrect type.' | ||
.format(key)) | ||
|
||
if start is None or stop is None: | ||
return slice(start, stop) | ||
|
@@ -786,7 +791,12 @@ def get_loc(self, key, method=None): | |
left, right = _get_interval_closed_bounds(key) | ||
return self._engine.get_loc_interval(left, right) | ||
else: | ||
return self._engine.get_loc(key) | ||
try: | ||
return self._engine.get_loc(key) | ||
except TypeError: | ||
msg = ('Key {!r} not found (does match index type {}).' | ||
.format(key, self.dtype)) | ||
raise KeyError(msg) | ||
|
||
def get_value(self, series, key): | ||
if com.is_bool_indexer(key): | ||
|
@@ -800,7 +810,7 @@ def get_value(self, series, key): | |
|
||
try: | ||
loc = self.get_loc(key) | ||
except TypeError: | ||
except KeyError: | ||
# we didn't find exact intervals or are non-unique | ||
msg = "unable to slice with this key: {key}".format(key=key) | ||
raise ValueError(msg) | ||
|
@@ -820,11 +830,21 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): | |
return np.arange(len(self), dtype='intp') | ||
|
||
if self.is_non_overlapping_monotonic: | ||
start, stop = self._find_non_overlapping_monotonic_bounds(target) | ||
|
||
start_plus_one = start + 1 | ||
if not ((start_plus_one < stop).any()): | ||
return np.where(start_plus_one == stop, start, -1) | ||
try: | ||
start, stop = ( | ||
self._find_non_overlapping_monotonic_bounds(target) | ||
) | ||
start_plus_one = start + 1 | ||
if not ((start_plus_one < stop).any()): | ||
return np.where(start_plus_one == stop, start, -1) | ||
except TypeError as err: | ||
# Only raise a type error when the types are not | ||
# orderable, such as when the caller is combining | ||
# an interval index with an integer index. | ||
# (see test_append_different_columns_types_raises | ||
# in pandas/tests/reshape/test_concat.py for more examples). | ||
if err.args and 'unorderable types:' in err.args[0]: | ||
raise | ||
|
||
if not self.is_unique: | ||
raise ValueError("cannot handle non-unique indices") | ||
|
@@ -835,7 +855,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): | |
|
||
# non IntervalIndex | ||
else: | ||
indexer = np.concatenate([self.get_loc(i) for i in target]) | ||
vals = [] | ||
for i in target: | ||
try: | ||
vals.append(self.get_loc(i)) | ||
except KeyError: | ||
vals.append(-1) | ||
indexer = np.array(vals).flatten() | ||
|
||
return ensure_platform_int(indexer) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -429,6 +429,15 @@ def test_get_loc_value(self): | |
with pytest.raises(KeyError, match=r"^1\.5$"): | ||
idx.get_loc(1.5) | ||
|
||
# GH25087, test get_loc returns key error for interval indexes | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you put this in a new test? (you can leave it in this place, but just put a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you do this one? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Have got a few broken tests to fix. Hoping to make the build green in the coming days. |
||
key = 'a' | ||
msg = 'Key {!r} is hashable but of incorrect type'.format(key) | ||
with pytest.raises(KeyError, match=msg): | ||
idx.get_loc(key) | ||
idx = pd.interval_range(0, 1.0) | ||
with pytest.raises(KeyError, match=msg): | ||
idx.get_loc('a') | ||
|
||
# To be removed, replaced by test_interval_new.py (see #16316, #16386) | ||
def slice_locs_cases(self, breaks): | ||
# TODO: same tests for more index types | ||
|
@@ -581,6 +590,14 @@ def test_get_indexer(self): | |
expected = np.array([-1, 1], dtype='intp') | ||
tm.assert_numpy_array_equal(actual, expected) | ||
|
||
actual = self.index.get_indexer(['a', 1]) | ||
expected = np.array([-1, 0], dtype='intp') | ||
tm.assert_numpy_array_equal(actual, expected) | ||
|
||
actual = self.index.get_indexer(['a', 1, 'b']) | ||
expected = np.array([-1, 0, -1], dtype='intp') | ||
tm.assert_numpy_array_equal(actual, expected) | ||
|
||
# To be removed, replaced by test_interval_new.py (see #16316, #16386) | ||
def test_get_indexer_subintervals(self): | ||
|
||
|
@@ -615,6 +632,24 @@ def test_get_indexer_length_one(self, item, closed): | |
expected = np.array([0] * len(item), dtype='intp') | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
@pytest.mark.parametrize('index,value,expected_index', [ | ||
(pd.interval_range(0, 1), 0.5, 0), | ||
(pd.interval_range(0, 3), 0.5, 0), | ||
(pd.IntervalIndex.from_tuples([(1, 3), (2, 4), (0, 2)]), 0.5, 2) | ||
]) | ||
def test_get_indexer_errors(self, index, value, expected_index): | ||
actual = index.get_indexer(['a']) | ||
expected = np.array([-1], dtype='intp') | ||
assert tm.assert_numpy_array_equal(actual, expected) | ||
|
||
actual = index.get_indexer(['a', 'b']) | ||
expected = np.array([-1, -1], dtype='intp') | ||
assert tm.assert_numpy_array_equal(actual, expected) | ||
|
||
actual = index.get_indexer(['a', value, 'b']) | ||
expected = np.array([-1, expected_index, -1], dtype='intp') | ||
assert tm.assert_numpy_array_equal(actual, expected) | ||
|
||
# Make consistent with test_interval_new.py (see #16316, #16386) | ||
@pytest.mark.parametrize('arrays', [ | ||
(date_range('20180101', periods=4), date_range('20180103', periods=4)), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You'll also need to do something similar in the
else
branch to cover the overlapping/non-monotonic case, e.g. I think something likepd.IntervalIndex.from_tuples([(1, 3), (2, 4), (0, 2)]).get_loc('foo')
will still fail.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it should be the
engine
that should properly raise a KeyError? (eg the int64 engine does that)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still need to look into @jorisvandenbossche's comment on raising the error in the engine itself (especially if that's the behaviour for int64), but I think I've addressed everything else.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jorisvandenbossche : There is code in place within the
engine
that raises aKeyError
, but strings queries fail before it gets there since the engine is expecting ascalar_t
type (fused type consisting of numeric types) forkey
:pandas/pandas/_libs/intervaltree.pxi.in
Line 104 in 2e38d55
I'm not super well versed in Cython. Is there a graceful way to force this to raise a
KeyError
within the Cython code? Removing thescalar_t
type gets a step further but still raises aTypeError
as the code expects things to be comparable (probably some perf implications to removing it too).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, the other engines have the key as
object
typed, and then afterwards do a check of that.But for me fine as well to leave that for now, and do the check here in the level above that. But on the long term would still be good to make the behaviour consistent throughout the different engines.