Skip to content

Commit

Permalink
fix: optimize cosine IVF_PQ index (#2698)
Browse files Browse the repository at this point in the history
  • Loading branch information
chebbyChefNEQ authored Aug 7, 2024
1 parent d5247d0 commit 293758e
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 1 deletion.
2 changes: 1 addition & 1 deletion python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2322,7 +2322,7 @@ def fast_search(self, flag: bool) -> ScannerBuilder:
Users can use `Table::optimize()` or `create_index()` to include the new data
into index, thus make new data searchable.
"""
self.fast_search = flag
self._fast_search = flag
return self

def full_text_search(
Expand Down
68 changes: 68 additions & 0 deletions python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,74 @@ def test_optimize_index(dataset, tmp_path):
assert len(list(indices_dir.iterdir())) == 2


def test_optimize_index_cosine(dataset, tmp_path):
    """Fast search on a cosine IVF_PQ index returns more rows once appended
    data has been folded in by ``optimize_indices()``."""

    def fast_search_row_count(target):
        # Ask for as many neighbours as the dataset has rows, probing a single
        # partition, with fast_search enabled.
        query = {
            "q": [0.1] * 128,
            "column": "vector",
            "k": len(target),
            "nprobes": 1,
        }
        result = target.to_table(nearest=query, fast_search=True)
        return result.num_rows

    def index_dir_entries(path):
        return sum(1 for _ in path.iterdir())

    dataset_uri = tmp_path / "dataset.lance"
    assert not dataset.has_index

    written = lance.write_dataset(dataset.to_table(), dataset_uri)
    ds = written.create_index(
        "vector",
        metric="cosine",
        index_type="IVF_PQ",
        num_partitions=4,
        num_sub_vectors=2,
    )
    assert len(ds) == 1000
    assert ds.has_index

    n_results_before_append = fast_search_row_count(ds)

    # Append a batch of new vectors to the same dataset.
    extra_rows = create_table(nvec=200)
    ds = lance.write_dataset(extra_rows, dataset_uri, mode="append")
    assert len(ds) == 1200
    assert ds.has_index

    indices_dir = dataset_uri / "_indices"
    assert index_dir_entries(indices_dir) == 1

    # Before optimizing, fast search still sees only the rows that were indexed.
    assert fast_search_row_count(ds) == n_results_before_append

    ds.optimize.optimize_indices()
    assert index_dir_entries(indices_dir) == 2

    # Reopen the dataset; the result count must now exceed the pre-append count.
    ds = lance.dataset(dataset_uri)
    assert fast_search_row_count(ds) > n_results_before_append


def test_create_index_dot(dataset, tmp_path):
dataset_uri = tmp_path / "dataset.lance"
assert not dataset.has_index
Expand Down
4 changes: 4 additions & 0 deletions rust/lance-index/src/vector/pq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ impl ProductQuantizer {
}

pub fn from_proto(proto: &pb::Pq, distance_type: DistanceType) -> Result<Self> {
let distance_type = match distance_type {
DistanceType::Cosine => DistanceType::L2,
_ => distance_type,
};
let codebook = match proto.codebook_tensor.as_ref() {
Some(tensor) => FixedSizeListArray::try_from(tensor)?,
None => FixedSizeListArray::try_new_from_values(
Expand Down

0 comments on commit 293758e

Please sign in to comment.