From 239a349be76f89eb5a9468216ac19d498ccd15b3 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 10 Sep 2024 12:09:09 +0800 Subject: [PATCH 1/2] fix: phrase query may return incorrect results if there's unknown word Signed-off-by: BubbleCal --- rust/lance-index/src/scalar/inverted/builder.rs | 8 ++++++++ rust/lance-index/src/scalar/inverted/index.rs | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index c6ceba8c88..9f42d20cf3 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -586,6 +586,14 @@ mod tests { .await .unwrap(); assert_eq!(row_ids.len(), Some(0)); + + let row_ids = invert_index + .search(&SargableQuery::FullTextSearch( + FullTextSearchQuery::new("\"lance exists\"".to_owned()).limit(Some(10)), + )) + .await + .unwrap(); + assert_eq!(row_ids.len(), Some(0)); } #[tokio::test] diff --git a/rust/lance-index/src/scalar/inverted/index.rs b/rust/lance-index/src/scalar/inverted/index.rs index 2cab09e37f..8aa1349ca9 100644 --- a/rust/lance-index/src/scalar/inverted/index.rs +++ b/rust/lance-index/src/scalar/inverted/index.rs @@ -108,7 +108,12 @@ impl InvertedIndex { let token_ids = if !is_phrase_query(&query.query) { token_ids.sorted_unstable().dedup().collect() } else { - token_ids.collect() + let token_ids = token_ids.collect::<Vec<_>>(); + // for phrase query, all tokens must be present + if token_ids.len() != tokens.len() { + return Ok(Vec::new()); + } + token_ids }; self.bm25_search(token_ids, query, prefilter).await } From d9f4406c609a5ba0cff9a032165ddb6f3215eab0 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Tue, 10 Sep 2024 12:09:57 +0800 Subject: [PATCH 2/2] fix Signed-off-by: BubbleCal --- rust/lance-index/src/scalar/inverted/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs 
b/rust/lance-index/src/scalar/inverted/builder.rs index 9f42d20cf3..459c8e73a2 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -589,7 +589,7 @@ mod tests { let row_ids = invert_index .search(&SargableQuery::FullTextSearch( - FullTextSearchQuery::new("\"lance exists\"".to_owned()).limit(Some(10)), + FullTextSearchQuery::new("\"lance unknown\"".to_owned()).limit(Some(10)), )) .await .unwrap();