From cd16d12b10cc87e6cfe8352eea6ee31afccce901 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 30 Sep 2024 12:01:58 -0700 Subject: [PATCH] fix: don't always include first doc --- python/python/tests/test_scalar_index.py | 25 +++++++++++++++++++ rust/lance-index/src/scalar/inverted/index.rs | 2 +- rust/lance-index/src/scalar/inverted/wand.rs | 10 +++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 25257d8f34..6793b218db 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -245,6 +245,31 @@ def test_filter_with_fts_index(dataset): assert query == row.as_py() +def test_indexed_filter_with_fts_index(tmp_path): + data = pa.table( + { + "text": [ + "Frodo was a puppy", + "There were several kittens playing", + "Frodo was a happy puppy", + "Frodo was a very happy puppy", + ], + "sentiment": ["neutral", "neutral", "positive", "positive"], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index("text", "INVERTED") + ds.create_scalar_index("sentiment", "BITMAP") + + results = ds.to_table( + full_text_query="puppy", + filter="sentiment='positive'", + prefilter=True, + with_row_id=True, + ) + assert results["_rowid"].to_pylist() == [2, 3] + + def test_fts_with_postfilter(tmp_path): tab = pa.table({"text": ["Frodo the puppy"] * 100, "id": range(100)}) dataset = lance.write_dataset(tab, tmp_path) diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index a59b83f01e..dae1716947 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -151,7 +151,7 @@ impl InvertedIndex { position as i32, posting, self.docs.len(), - mask.clone(), + mask, )) }) // Use compute count since data hopefully cached diff --git a/rust/lance-index/src/scalar/inverted/wand.rs 
b/rust/lance-index/src/scalar/inverted/wand.rs index 23ba86530f..2901d0e900 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -63,11 +63,19 @@ impl PostingIterator { Some(max_score) => max_score, None => idf(list.len(), num_doc) * (K1 + 1.0), }; + + // Move the iterator to the first selected document. This is important + // because the caller might directly call `doc()` without calling `next()`. + let mut index = 0; + while index < list.len() && !mask.selected(list.row_id(index)) { + index += 1; + } + Self { token_id, position, list, - index: 0, + index, mask, approximate_upper_bound, }