# Patch summary (text before the first "diff --git" header is not part of the patch):
#   Adds two regression tests for the LABEL_LIST scalar index and NULLs.
#   * test_label_list_index_null_element_match: runs three list filters before
#     and after creating the index and asserts the results are identical.
#   * test_label_list_index_explain_null_literals: calls explain_plan() for
#     filters whose list literal contains NULL and asserts a string is returned.
# NOTE(review): line breaks and leading whitespace were reconstructed from a
# whitespace-collapsed copy; if context lines do not match the target file,
# apply with `git apply --ignore-whitespace`.
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index 8ca3e01d95..c919228909 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -2039,6 +2039,44 @@ def test_label_list_index_array_contains(tmp_path: Path):
     assert "ScalarIndexQuery" not in explain
 
 
+def test_label_list_index_null_element_match(tmp_path: Path):
+    """Ensure LABEL_LIST index keeps scan semantics when lists contain NULLs."""
+    tbl = pa.table({"labels": [["foo", None], ["foo"], None]})
+    dataset = lance.write_dataset(tbl, tmp_path / "dataset")
+
+    filters = [
+        "array_has_any(labels, ['foo'])",
+        "array_has_all(labels, ['foo'])",
+        "array_contains(labels, 'foo')",
+    ]
+    expected = {
+        f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters
+    }
+
+    dataset.create_scalar_index("labels", index_type="LABEL_LIST")
+
+    actual = {
+        f: dataset.to_table(filter=f).column("labels").to_pylist() for f in filters
+    }
+    assert actual == expected
+
+
+def test_label_list_index_explain_null_literals(tmp_path: Path):
+    tbl = pa.table({"labels": [["foo", None], ["foo"]]})
+    dataset = lance.write_dataset(tbl, tmp_path / "dataset")
+    dataset.create_scalar_index("labels", index_type="LABEL_LIST")
+
+    # explain_plan should not panic when list literals include NULLs.
+    for expr in [
+        "array_has_any(labels, [NULL])",
+        "array_has_all(labels, [NULL])",
+        "array_has_any(labels, ['foo', NULL])",
+        "array_has_all(labels, ['foo', NULL])",
+    ]:
+        explain = dataset.scanner(filter=expr).explain_plan()
+        assert isinstance(explain, str)
+
+
 def test_create_index_empty_dataset(tmp_path: Path):
     # Creating an index on an empty dataset is (currently) not terribly useful but
     # we shouldn't return strange errors.
# Patch summary (text outside the "diff --git" sections is not part of the patch):
#   * scalar.rs, impl AnyQuery for LabelListQuery: in two places the "item"
#     Field passed to ListArray::try_new changes its nullable flag from false
#     to true.
#   * scalar/bitmap.rs, impl ScalarIndex for BitmapIndex: before building the
#     NullableRowAddrSet, rows already present in row_ids are subtracted from
#     the null set, so a row that is both TRUE and NULL is treated as TRUE.
# NOTE(review): line breaks and leading whitespace were reconstructed from a
# whitespace-collapsed copy, and the stripped generic argument in
# `ScalarBuffer::<i32>::from` was restored from the `as i32` offsets; if
# context lines do not match the target files, apply with
# `git apply --ignore-whitespace`.
diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs
index 6d07b5b821..98aebe96e8 100644
--- a/rust/lance-index/src/scalar.rs
+++ b/rust/lance-index/src/scalar.rs
@@ -549,7 +549,7 @@ impl AnyQuery for LabelListQuery {
                 let offsets_buffer =
                     OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
                 let labels_list = ListArray::try_new(
-                    Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
+                    Arc::new(Field::new("item", labels_arr.data_type().clone(), true)),
                     offsets_buffer,
                     labels_arr,
                     None,
@@ -569,7 +569,7 @@ impl AnyQuery for LabelListQuery {
                 let offsets_buffer =
                     OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, labels_arr.len() as i32]));
                 let labels_list = ListArray::try_new(
-                    Arc::new(Field::new("item", labels_arr.data_type().clone(), false)),
+                    Arc::new(Field::new("item", labels_arr.data_type().clone(), true)),
                     offsets_buffer,
                     labels_arr,
                     None,
diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs
index 12d0b6232a..b046412fe3 100644
--- a/rust/lance-index/src/scalar/bitmap.rs
+++ b/rust/lance-index/src/scalar/bitmap.rs
@@ -534,7 +534,12 @@ impl ScalarIndex for BitmapIndex {
             }
         };
 
-        let selection = NullableRowAddrSet::new(row_ids, null_row_ids.unwrap_or_default());
+        let mut null_rows = null_row_ids.unwrap_or_default();
+        if !null_rows.is_empty() {
+            // A row can be both TRUE and NULL after list flattening; treat it as TRUE.
+            null_rows -= &row_ids;
+        }
+        let selection = NullableRowAddrSet::new(row_ids, null_rows);
         Ok(SearchResult::Exact(selection))
     }
 