
Commit

Randomize KnnVector codec params in RandomCodec; addresses gh-14047 (#…
msokolov authored Dec 9, 2024
1 parent a833887 commit 6b0112c
Showing 10 changed files with 232 additions and 77 deletions.
3 changes: 2 additions & 1 deletion lucene/CHANGES.txt
@@ -24,7 +24,8 @@ Optimizations

Bug Fixes
---------------------
(No changes)
* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
when all values are identical. (Mike Sokolov)

Other
---------------------
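Not part of this commit's diff: a minimal sketch of the failure mode the CHANGES entry describes, using the formulas from the ScalarQuantizer change further below. When every indexed value is identical, the lower and upper quantiles coincide and the unguarded scale calculation divides by zero; the class name and numbers here are purely illustrative.

// Illustrative only: the arithmetic behind the reported div-by-zero.
public class QuantileDivByZeroSketch {
  public static void main(String[] args) {
    byte bits = 7;                                        // example bit width
    float minQuantile = 0.5f, maxQuantile = 0.5f;         // all indexed values identical
    float divisor = (1 << bits) - 1;                      // 127
    float scale = divisor / (maxQuantile - minQuantile);  // 127 / 0 -> Infinity
    float alpha = (maxQuantile - minQuantile) / divisor;  // 0.0
    System.out.println("scale=" + scale + " alpha=" + alpha); // scale=Infinity alpha=0.0
  }
}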
@@ -541,6 +541,7 @@ public void changeIndexWithAdds(Random random, Directory dir, Version nameVersio
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
.setMergePolicy(newLogMergePolicy()));
// add 10 docs
@@ -579,6 +580,7 @@ public void changeIndexWithAdds(Random random, Directory dir, Version nameVersio
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
.setMergePolicy(newLogMergePolicy()));
writer.forceMerge(1);
@@ -630,6 +632,7 @@ public void changeIndexNoAdds(Random random, Directory dir, Version nameVersion)
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND));
writer.forceMerge(1);
writer.close();
@@ -93,6 +93,7 @@ static RandomVectorScorer fromVectorSimilarity(
VectorSimilarityFunction sim,
float constMultiplier,
QuantizedByteVectorValues values) {
checkDimensions(targetBytes.length, values.dimension());
return switch (sim) {
case EUCLIDEAN -> new Euclidean(values, constMultiplier, targetBytes);
case COSINE, DOT_PRODUCT ->
@@ -112,6 +113,13 @@ static RandomVectorScorer fromVectorSimilarity(
};
}

static void checkDimensions(int queryLen, int fieldLen) {
if (queryLen != fieldLen) {
throw new IllegalArgumentException(
"vector query dimension: " + queryLen + " differs from field dimension: " + fieldLen);
}
}

private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory(
byte[] targetBytes,
float offsetCorrection,
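Not part of the diff: a small usage sketch of the dimension guard added above, showing the exception a caller would see when the query vector's length differs from the field's dimension. The standalone class is made up for illustration; the helper is copied verbatim so the snippet is self-contained.

// Illustrative sketch of the checkDimensions guard shown above.
public class CheckDimensionsSketch {
  static void checkDimensions(int queryLen, int fieldLen) {
    if (queryLen != fieldLen) {
      throw new IllegalArgumentException(
          "vector query dimension: " + queryLen + " differs from field dimension: " + fieldLen);
    }
  }

  public static void main(String[] args) {
    checkDimensions(4, 4); // same dimensions: returns silently
    try {
      checkDimensions(3, 4); // mismatched dimensions
    } catch (IllegalArgumentException e) {
      // prints: vector query dimension: 3 differs from field dimension: 4
      System.out.println(e.getMessage());
    }
  }
}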
@@ -97,12 +97,18 @@ public ScalarQuantizer(float minQuantile, float maxQuantile, byte bits) {
}
assert maxQuantile >= minQuantile;
assert bits > 0 && bits <= 8;
this.minQuantile = minQuantile;
this.maxQuantile = maxQuantile;
this.bits = bits;
final float divisor = (float) ((1 << bits) - 1);
this.scale = divisor / (maxQuantile - minQuantile);
this.alpha = (maxQuantile - minQuantile) / divisor;
if (minQuantile == maxQuantile) {
// avoid divide-by-zero with an arbitrary but plausible choice (leads to alpha = scale = 1)
      this.minQuantile = minQuantile - divisor / 2;
      this.maxQuantile = maxQuantile + divisor / 2;
} else {
this.minQuantile = minQuantile;
this.maxQuantile = maxQuantile;
}
this.scale = divisor / (this.maxQuantile - this.minQuantile);
this.alpha = (this.maxQuantile - this.minQuantile) / divisor;
}

/**
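Not part of the diff: a worked example of the degenerate branch above. With 8-bit quantization the divisor is 255; widening the collapsed quantile range by divisor / 2 on each side makes the range exactly divisor wide, so scale and alpha both come out to 1, as the comment states. The class name and quantile value are illustrative.

// Illustrative arithmetic only, mirroring the constructor above.
public class DegenerateQuantileSketch {
  public static void main(String[] args) {
    byte bits = 8;
    float divisor = (1 << bits) - 1;      // 255
    float min = 0.3f, max = 0.3f;         // identical quantiles
    min -= divisor / 2;                   // 0.3 - 127.5 = -127.2
    max += divisor / 2;                   // 0.3 + 127.5 =  127.8
    float scale = divisor / (max - min);  // 255 / 255 ~ 1.0
    float alpha = (max - min) / divisor;  // 255 / 255 ~ 1.0
    System.out.println("scale=" + scale + " alpha=" + alpha);
  }
}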
@@ -19,7 +19,6 @@
import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static org.apache.lucene.index.VectorSimilarityFunction.COSINE;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
@@ -61,6 +60,8 @@

/** Test cases for AbstractKnnVectorQuery objects. */
abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase {
// handle quantization noise
static final float EPSILON = 0.001f;

abstract AbstractKnnVectorQuery getKnnVectorQuery(
String field, float[] query, int k, Query queryFilter);
@@ -86,6 +87,10 @@ protected BaseDirectoryWrapper newDirectoryForTest() {
return LuceneTestCase.newDirectory(random());
}

protected IndexWriterConfig configStandardCodec() throws IOException {
return new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
}

public void testEquals() {
AbstractKnnVectorQuery q1 = getKnnVectorQuery("f1", new float[] {0, 1}, 10);
Query filter1 = new TermQuery(new Term("id", "id1"));
@@ -317,66 +322,58 @@ public void testScoreEuclidean() throws IOException {
}

public void testScoreCosine() throws IOException {
try (Directory d = newDirectoryForTest()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
for (int j = 1; j <= 5; j++) {
Document doc = new Document();
doc.add(getKnnVectorField("field", new float[] {j, j * j}, COSINE));
w.addDocument(doc);
}
}
try (IndexReader reader = DirectoryReader.open(d)) {
assertEquals(1, reader.leaves().size());
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Query rewritten = query.rewrite(searcher);
Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE, 1);
Scorer scorer = weight.scorer(reader.leaves().get(0));

// prior to advancing, score is undefined
assertEquals(-1, scorer.docID());
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);

/* score0 = ((2,3) * (1, 1) = 5) / (||2, 3|| * ||1, 1|| = sqrt(26)), then
* normalized by (1 + x) /2.
*/
float score0 =
(float) ((1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2);

/* score1 = ((2,3) * (2, 4) = 16) / (||2, 3|| * ||2, 4|| = sqrt(260)), then
* normalized by (1 + x) /2
*/
float score1 =
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the maximum score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();
assertEquals(3, it.cost());
assertEquals(0, it.nextDoc());
// doc 0 has (1, 1)
assertEquals(score0, scorer.score(), 0.0001);
assertEquals(1, it.advance(1));
assertEquals(score1, scorer.score(), 0.0001);

// since topK was 3
assertEquals(NO_MORE_DOCS, it.advance(4));
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);
}
float[][] vectors = new float[5][];
for (int j = 1; j <= 5; j++) {
vectors[j - 1] = new float[] {j, j * j};
}
try (Directory d = getStableIndexStore("field", VectorSimilarityFunction.COSINE, vectors);
IndexReader reader = DirectoryReader.open(d)) {
assertEquals(1, reader.leaves().size());
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Query rewritten = query.rewrite(searcher);
Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE, 1);
Scorer scorer = weight.scorer(reader.leaves().get(0));

// prior to advancing, score is undefined
assertEquals(-1, scorer.docID());
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);

/* score0 = ((2,3) * (1, 1) = 5) / (||2, 3|| * ||1, 1|| = sqrt(26)), then
* normalized by (1 + x) /2.
*/
float score0 =
(float) ((1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2);

/* score1 = ((2,3) * (2, 4) = 16) / (||2, 3|| * ||2, 4|| = sqrt(260)), then
* normalized by (1 + x) /2
*/
float score1 =
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the maximum score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();
assertEquals(3, it.cost());
assertEquals(0, it.nextDoc());
// doc 0 has (1, 1)
assertEquals(score0, scorer.score(), 0.0001);
assertEquals(1, it.advance(1));
assertEquals(score1, scorer.score(), 0.0001);

// since topK was 3
assertEquals(NO_MORE_DOCS, it.advance(4));
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);
}
}
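Not part of the diff: evaluating the two score formulas in the comments above numerically may help when reading the assertions; score0 comes out to about 0.9903 and score1 to about 0.9961, which is why doc 1 holds the maximum score. The class is a throwaway sketch.

// Illustrative check of the expected values asserted in testScoreCosine.
public class CosineScoreSketch {
  public static void main(String[] args) {
    // doc 0 has vector (1, 1); the query is (2, 3)
    double score0 = (1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2;
    // doc 1 has vector (2, 4)
    double score1 = (1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2;
    System.out.printf("score0=%.4f score1=%.4f%n", score0, score1); // score0=0.9903 score1=0.9961
  }
}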

public void testScoreMIP() throws IOException {
try (Directory indexStore =
getIndexStore(
"field",
VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT,
new float[] {0, 1},
new float[] {1, 2},
new float[] {0, 0});
IndexReader reader = DirectoryReader.open(indexStore)) {
float[][] vectors = {{0, 1}, {1, 2}, {0, 0}};
try (Directory d =
getStableIndexStore("field", VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT, vectors);
IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = newSearcher(reader);
AbstractKnnVectorQuery kvq = getKnnVectorQuery("field", new float[] {0, -1}, 10);
assertMatches(searcher, kvq, 3);
@@ -405,7 +402,8 @@ public void testExplain() throws IOException {
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Explanation matched = searcher.explain(query, 2);
assertTrue(matched.isMatch());
assertEquals(1 / 2f, matched.getValue());
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());

@@ -431,9 +429,10 @@ public void testExplainMultipleSegments() throws IOException {
try (IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Explanation matched = searcher.explain(query, 2);
Explanation matched = searcher.explain(query, 2); // (2, 2)
assertTrue(matched.isMatch());
assertEquals(1 / 2f, matched.getValue());
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());

Expand All @@ -453,7 +452,7 @@ public void testSkewedIndex() throws IOException {
* randomly fail to find one).
*/
try (Directory d = newDirectoryForTest()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
try (IndexWriter w = new IndexWriter(d, configStandardCodec())) {
int r = 0;
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
@@ -532,7 +531,7 @@ public void testRandomWithFilter() throws IOException {
// visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN
// format
// implementation.
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
IndexWriterConfig iwc = configStandardCodec();
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
@@ -618,7 +617,7 @@ public void testFilterWithSameScore() throws IOException {
// visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN
// format
// implementation.
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
IndexWriterConfig iwc = configStandardCodec();
IndexWriter w = new IndexWriter(d, iwc);
float[] vector = randomVector(dimension);
for (int i = 0; i < numDocs; i++) {
@@ -933,11 +932,17 @@ Directory getIndexStore(
* preserving the order of the added documents.
*/
private Directory getStableIndexStore(String field, float[]... contents) throws IOException {
return getStableIndexStore(field, VectorSimilarityFunction.EUCLIDEAN, contents);
}

private Directory getStableIndexStore(
String field, VectorSimilarityFunction similarityFunction, float[][] contents)
throws IOException {
Directory indexStore = newDirectoryForTest();
try (IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig())) {
try (IndexWriter writer = new IndexWriter(indexStore, configStandardCodec())) {
for (int i = 0; i < contents.length; ++i) {
Document doc = new Document();
doc.add(getKnnVectorField(field, contents[i]));
doc.add(getKnnVectorField(field, contents[i], similarityFunction));
doc.add(new StringField("id", "id" + i, Field.Store.YES));
writer.addDocument(doc);
}
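Not part of the diff: a sketch of the pattern the changed tests in this class now follow, assuming it sits inside the same LuceneTestCase subclass (so newDirectory, TestUtil, and the document classes are in scope). The helper name is hypothetical; the point is that pinning the default codec keeps RandomCodec from substituting a quantized KNN format, which would perturb exact-score assertions.

// Sketch only: build a deterministic index with a fixed codec, mirroring
// getStableIndexStore / configStandardCodec above.
private Directory buildFixedCodecIndex(float[][] vectors) throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
  try (IndexWriter writer = new IndexWriter(dir, iwc)) {
    for (int i = 0; i < vectors.length; i++) {
      Document doc = new Document();
      doc.add(new KnnFloatVectorField("field", vectors[i], VectorSimilarityFunction.COSINE));
      doc.add(new StringField("id", "id" + i, Field.Store.YES));
      writer.addDocument(doc);
    }
  }
  return dir;
}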
@@ -34,13 +34,15 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.QueryTimeout;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.hnsw.HnswUtil;

@LuceneTestCase.SuppressCodecs("SimpleText")
@@ -381,7 +383,7 @@ public void testBoostQuery() throws IOException {
}
}

public void testVectorsAboveSimilarity() throws IOException {
void testVectorsAboveSimilarity() throws IOException {
// Pick number of docs to accept
int numAccepted = random().nextInt(numDocs / 3, numDocs / 2);
float delta = 1e-3f;
@@ -401,7 +403,10 @@ public void testVectorsAboveSimilarity() throws IOException {
}
}

try (Directory indexStore = getIndexStore(vectors);
// TODO test with random codec params via getIndexStore(vectors);
// this is challenging because scores will vary in a quantized index
// and precomputing as above will not be accurate
try (Directory indexStore = getStableIndexStore(vectors);
IndexReader reader = DirectoryReader.open(indexStore)) {
IndexSearcher searcher = newSearcher(reader);

@@ -587,6 +592,21 @@ final Directory getIndexStore(V... vectors) throws IOException {
return dir;
}

@SafeVarargs
final Directory getStableIndexStore(V... vectors) throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
try (IndexWriter writer = new IndexWriter(dir, iwc)) {
for (int i = 0; i < vectors.length; ++i) {
Document doc = new Document();
doc.add(getVectorField(vectorField, vectors[i], function));
doc.add(new IntField(idField, i, Field.Store.YES));
writer.addDocument(doc);
}
}
return dir;
}

private static class CountingQueryTimeout implements QueryTimeout {
private int remaining;

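Not part of the diff: a rough illustration of why the TODO above is hard to resolve. Once vector components are stored with reduced precision, a score precomputed from the exact floats can drift by more than the 1e-3 delta used in testVectorsAboveSimilarity. Crude rounding stands in for real scalar quantization here; the vectors and class name are made up.

// Illustrative only: precision loss shifts a precomputed dot product.
public class QuantizationDriftSketch {
  public static void main(String[] args) {
    float[] a = {0.123f, 0.456f, 0.789f};
    float[] b = {0.321f, 0.654f, 0.987f};
    double exact = 0, rounded = 0;
    for (int i = 0; i < a.length; i++) {
      exact += a[i] * b[i];
      // keep roughly two decimal digits per component
      rounded += (Math.round(a[i] * 100) / 100.0) * (Math.round(b[i] * 100) / 100.0);
    }
    // the two dot products differ by about 0.003, more than a 1e-3 delta
    System.out.println("exact=" + exact + " rounded=" + rounded);
  }
}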
@@ -44,6 +44,7 @@
import org.apache.lucene.util.VectorUtil;

public class TestKnnFloatVectorQuery extends BaseKnnVectorQueryTestCase {

@Override
KnnFloatVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) {
return new KnnFloatVectorQuery(field, query, k, queryFilter);
@@ -130,16 +131,16 @@ public void testScoreNegativeDotProduct() throws IOException {
DocIdSetIterator it = scorer.iterator();
assertEquals(2, it.cost());
assertEquals(0, it.nextDoc());
assertEquals(0, scorer.score(), 0);
assertEquals(0, scorer.score(), EPSILON);
assertEquals(1, it.advance(1));
assertEquals(1, scorer.score(), 0);
assertEquals(1, scorer.score(), EPSILON);
}
}
}

public void testScoreDotProduct() throws IOException {
try (Directory d = newDirectory()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
try (IndexWriter w = new IndexWriter(d, configStandardCodec())) {
for (int j = 1; j <= 5; j++) {
Document doc = new Document();
doc.add(
@@ -174,7 +175,7 @@ public void testScoreDotProduct() throws IOException {
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the max score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(2), 0.001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();