
Commit

Randomize KnnVector codec params in RandomCodec; addresses gh-14047 (#…
msokolov authored Dec 9, 2024
1 parent a833887 commit 6b0112c
Showing 10 changed files with 232 additions and 77 deletions.
3 changes: 2 additions & 1 deletion lucene/CHANGES.txt
@@ -24,7 +24,8 @@ Optimizations

Bug Fixes
---------------------
(No changes)
* GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero
when all values are identical. (Mike Sokolov)

Other
---------------------
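Not part of this commit's diff: a minimal sketch of the failure mode the CHANGES entry describes, using the formulas from the ScalarQuantizer change further below. When every indexed value is identical, the lower and upper quantiles coincide and the unguarded scale calculation divides by zero; the class name and numbers here are purely illustrative.

// Illustrative only: the arithmetic behind the reported div-by-zero.
public class QuantileDivByZeroSketch {
  public static void main(String[] args) {
    byte bits = 7;                                        // example bit width
    float minQuantile = 0.5f, maxQuantile = 0.5f;         // all indexed values identical
    float divisor = (1 << bits) - 1;                      // 127
    float scale = divisor / (maxQuantile - minQuantile);  // 127 / 0 -> Infinity
    float alpha = (maxQuantile - minQuantile) / divisor;  // 0.0
    System.out.println("scale=" + scale + " alpha=" + alpha); // scale=Infinity alpha=0.0
  }
}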
@@ -541,6 +541,7 @@ public void changeIndexWithAdds(Random random, Directory dir, Version nameVersio
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
.setMergePolicy(newLogMergePolicy()));
// add 10 docs
@@ -579,6 +580,7 @@ public void changeIndexWithAdds(Random random, Directory dir, Version nameVersio
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
.setMergePolicy(newLogMergePolicy()));
writer.forceMerge(1);
@@ -630,6 +632,7 @@ public void changeIndexNoAdds(Random random, Directory dir, Version nameVersion)
new IndexWriter(
dir,
newIndexWriterConfig(new MockAnalyzer(random))
.setCodec(TestUtil.getDefaultCodec())
.setOpenMode(IndexWriterConfig.OpenMode.APPEND));
writer.forceMerge(1);
writer.close();
@@ -93,6 +93,7 @@ static RandomVectorScorer fromVectorSimilarity(
VectorSimilarityFunction sim,
float constMultiplier,
QuantizedByteVectorValues values) {
checkDimensions(targetBytes.length, values.dimension());
return switch (sim) {
case EUCLIDEAN -> new Euclidean(values, constMultiplier, targetBytes);
case COSINE, DOT_PRODUCT ->
@@ -112,6 +113,13 @@ static RandomVectorScorer fromVectorSimilarity(
};
}

static void checkDimensions(int queryLen, int fieldLen) {
if (queryLen != fieldLen) {
throw new IllegalArgumentException(
"vector query dimension: " + queryLen + " differs from field dimension: " + fieldLen);
}
}

private static RandomVectorScorer.AbstractRandomVectorScorer dotProductFactory(
byte[] targetBytes,
float offsetCorrection,
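Not part of the diff: a small usage sketch of the dimension guard added above, showing the exception a caller would see when the query vector's length differs from the field's dimension. The standalone class is made up for illustration; the helper is copied verbatim so the snippet is self-contained.

// Illustrative sketch of the checkDimensions guard shown above.
public class CheckDimensionsSketch {
  static void checkDimensions(int queryLen, int fieldLen) {
    if (queryLen != fieldLen) {
      throw new IllegalArgumentException(
          "vector query dimension: " + queryLen + " differs from field dimension: " + fieldLen);
    }
  }

  public static void main(String[] args) {
    checkDimensions(4, 4); // same dimensions: returns silently
    try {
      checkDimensions(3, 4); // mismatched dimensions
    } catch (IllegalArgumentException e) {
      // prints: vector query dimension: 3 differs from field dimension: 4
      System.out.println(e.getMessage());
    }
  }
}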
@@ -97,12 +97,18 @@ public ScalarQuantizer(float minQuantile, float maxQuantile, byte bits) {
}
assert maxQuantile >= minQuantile;
assert bits > 0 && bits <= 8;
this.minQuantile = minQuantile;
this.maxQuantile = maxQuantile;
this.bits = bits;
final float divisor = (float) ((1 << bits) - 1);
this.scale = divisor / (maxQuantile - minQuantile);
this.alpha = (maxQuantile - minQuantile) / divisor;
if (minQuantile == maxQuantile) {
// avoid divide-by-zero with an arbitrary but plausible choice (leads to alpha = scale = 1)
      this.minQuantile = minQuantile - divisor / 2;
      this.maxQuantile = maxQuantile + divisor / 2;
} else {
this.minQuantile = minQuantile;
this.maxQuantile = maxQuantile;
}
this.scale = divisor / (this.maxQuantile - this.minQuantile);
this.alpha = (this.maxQuantile - this.minQuantile) / divisor;
}

/**
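Not part of the diff: a worked example of the degenerate branch above. With 8-bit quantization the divisor is 255; widening the collapsed quantile range by divisor / 2 on each side makes the range exactly divisor wide, so scale and alpha both come out to 1, as the comment states. The class name and quantile value are illustrative.

// Illustrative arithmetic only, mirroring the constructor above.
public class DegenerateQuantileSketch {
  public static void main(String[] args) {
    byte bits = 8;
    float divisor = (1 << bits) - 1;      // 255
    float min = 0.3f, max = 0.3f;         // identical quantiles
    min -= divisor / 2;                   // 0.3 - 127.5 = -127.2
    max += divisor / 2;                   // 0.3 + 127.5 =  127.8
    float scale = divisor / (max - min);  // 255 / 255 ~ 1.0
    float alpha = (max - min) / divisor;  // 255 / 255 ~ 1.0
    System.out.println("scale=" + scale + " alpha=" + alpha);
  }
}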
@@ -19,7 +19,6 @@
import static com.carrotsearch.randomizedtesting.RandomizedTest.frequently;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static org.apache.lucene.index.VectorSimilarityFunction.COSINE;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
@@ -61,6 +60,8 @@

/** Test cases for AbstractKnnVectorQuery objects. */
abstract class BaseKnnVectorQueryTestCase extends LuceneTestCase {
// handle quantization noise
static final float EPSILON = 0.001f;

abstract AbstractKnnVectorQuery getKnnVectorQuery(
String field, float[] query, int k, Query queryFilter);
@@ -86,6 +87,10 @@ protected BaseDirectoryWrapper newDirectoryForTest() {
return LuceneTestCase.newDirectory(random());
}

protected IndexWriterConfig configStandardCodec() throws IOException {
return new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
}

public void testEquals() {
AbstractKnnVectorQuery q1 = getKnnVectorQuery("f1", new float[] {0, 1}, 10);
Query filter1 = new TermQuery(new Term("id", "id1"));
@@ -317,66 +322,58 @@ public void testScoreEuclidean() throws IOException {
}

public void testScoreCosine() throws IOException {
try (Directory d = newDirectoryForTest()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
for (int j = 1; j <= 5; j++) {
Document doc = new Document();
doc.add(getKnnVectorField("field", new float[] {j, j * j}, COSINE));
w.addDocument(doc);
}
}
try (IndexReader reader = DirectoryReader.open(d)) {
assertEquals(1, reader.leaves().size());
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Query rewritten = query.rewrite(searcher);
Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE, 1);
Scorer scorer = weight.scorer(reader.leaves().get(0));

// prior to advancing, score is undefined
assertEquals(-1, scorer.docID());
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);

/* score0 = ((2,3) * (1, 1) = 5) / (||2, 3|| * ||1, 1|| = sqrt(26)), then
* normalized by (1 + x) /2.
*/
float score0 =
(float) ((1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2);

/* score1 = ((2,3) * (2, 4) = 16) / (||2, 3|| * ||2, 4|| = sqrt(260)), then
* normalized by (1 + x) /2
*/
float score1 =
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the maximum score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();
assertEquals(3, it.cost());
assertEquals(0, it.nextDoc());
// doc 0 has (1, 1)
assertEquals(score0, scorer.score(), 0.0001);
assertEquals(1, it.advance(1));
assertEquals(score1, scorer.score(), 0.0001);

// since topK was 3
assertEquals(NO_MORE_DOCS, it.advance(4));
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);
}
float[][] vectors = new float[5][];
for (int j = 1; j <= 5; j++) {
vectors[j - 1] = new float[] {j, j * j};
}
try (Directory d = getStableIndexStore("field", VectorSimilarityFunction.COSINE, vectors);
IndexReader reader = DirectoryReader.open(d)) {
assertEquals(1, reader.leaves().size());
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Query rewritten = query.rewrite(searcher);
Weight weight = searcher.createWeight(rewritten, ScoreMode.COMPLETE, 1);
Scorer scorer = weight.scorer(reader.leaves().get(0));

// prior to advancing, score is undefined
assertEquals(-1, scorer.docID());
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);

/* score0 = ((2,3) * (1, 1) = 5) / (||2, 3|| * ||1, 1|| = sqrt(26)), then
* normalized by (1 + x) /2.
*/
float score0 =
(float) ((1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2);

/* score1 = ((2,3) * (2, 4) = 16) / (||2, 3|| * ||2, 4|| = sqrt(260)), then
* normalized by (1 + x) /2
*/
float score1 =
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the maximum score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();
assertEquals(3, it.cost());
assertEquals(0, it.nextDoc());
// doc 0 has (1, 1)
assertEquals(score0, scorer.score(), 0.0001);
assertEquals(1, it.advance(1));
assertEquals(score1, scorer.score(), 0.0001);

// since topK was 3
assertEquals(NO_MORE_DOCS, it.advance(4));
expectThrows(ArrayIndexOutOfBoundsException.class, scorer::score);
}
}
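Not part of the diff: evaluating the two score formulas in the comments above numerically may help when reading the assertions; score0 comes out to about 0.9903 and score1 to about 0.9961, which is why doc 1 holds the maximum score. The class is a throwaway sketch.

// Illustrative check of the expected values asserted in testScoreCosine.
public class CosineScoreSketch {
  public static void main(String[] args) {
    // doc 0 has vector (1, 1); the query is (2, 3)
    double score0 = (1 + (2 * 1 + 3 * 1) / Math.sqrt((2 * 2 + 3 * 3) * (1 * 1 + 1 * 1))) / 2;
    // doc 1 has vector (2, 4)
    double score1 = (1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2;
    System.out.printf("score0=%.4f score1=%.4f%n", score0, score1); // score0=0.9903 score1=0.9961
  }
}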

public void testScoreMIP() throws IOException {
try (Directory indexStore =
getIndexStore(
"field",
VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT,
new float[] {0, 1},
new float[] {1, 2},
new float[] {0, 0});
IndexReader reader = DirectoryReader.open(indexStore)) {
float[][] vectors = {{0, 1}, {1, 2}, {0, 0}};
try (Directory d =
getStableIndexStore("field", VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT, vectors);
IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = newSearcher(reader);
AbstractKnnVectorQuery kvq = getKnnVectorQuery("field", new float[] {0, -1}, 10);
assertMatches(searcher, kvq, 3);
@@ -405,7 +402,8 @@ public void testExplain() throws IOException {
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Explanation matched = searcher.explain(query, 2);
assertTrue(matched.isMatch());
assertEquals(1 / 2f, matched.getValue());
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());

@@ -431,9 +429,10 @@ public void testExplainMultipleSegments() throws IOException {
try (IndexReader reader = DirectoryReader.open(d)) {
IndexSearcher searcher = new IndexSearcher(reader);
AbstractKnnVectorQuery query = getKnnVectorQuery("field", new float[] {2, 3}, 3);
Explanation matched = searcher.explain(query, 2);
Explanation matched = searcher.explain(query, 2); // (2, 2)
assertTrue(matched.isMatch());
assertEquals(1 / 2f, matched.getValue());
// scores vary widely due to quantization
assertEquals(1 / 2f, matched.getValue().doubleValue(), 0.5);
assertEquals(0, matched.getDetails().length);
assertEquals("within top 3 docs", matched.getDescription());

Expand All @@ -453,7 +452,7 @@ public void testSkewedIndex() throws IOException {
* randomly fail to find one).
*/
try (Directory d = newDirectoryForTest()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
try (IndexWriter w = new IndexWriter(d, configStandardCodec())) {
int r = 0;
for (int i = 0; i < 5; i++) {
for (int j = 0; j < 5; j++) {
@@ -532,7 +531,7 @@ public void testRandomWithFilter() throws IOException {
// visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN
// format
// implementation.
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
IndexWriterConfig iwc = configStandardCodec();
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
@@ -618,7 +617,7 @@ public void testFilterWithSameScore() throws IOException {
// visitedLimit. This is fine since the test targets AbstractKnnVectorQuery logic, not the kNN
// format
// implementation.
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
IndexWriterConfig iwc = configStandardCodec();
IndexWriter w = new IndexWriter(d, iwc);
float[] vector = randomVector(dimension);
for (int i = 0; i < numDocs; i++) {
@@ -933,11 +932,17 @@ Directory getIndexStore(
* preserving the order of the added documents.
*/
private Directory getStableIndexStore(String field, float[]... contents) throws IOException {
return getStableIndexStore(field, VectorSimilarityFunction.EUCLIDEAN, contents);
}

private Directory getStableIndexStore(
String field, VectorSimilarityFunction similarityFunction, float[][] contents)
throws IOException {
Directory indexStore = newDirectoryForTest();
try (IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig())) {
try (IndexWriter writer = new IndexWriter(indexStore, configStandardCodec())) {
for (int i = 0; i < contents.length; ++i) {
Document doc = new Document();
doc.add(getKnnVectorField(field, contents[i]));
doc.add(getKnnVectorField(field, contents[i], similarityFunction));
doc.add(new StringField("id", "id" + i, Field.Store.YES));
writer.addDocument(doc);
}
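Not part of the diff: a sketch of the pattern the changed tests in this class now follow, assuming it sits inside the same LuceneTestCase subclass (so newDirectory, TestUtil, and the document classes are in scope). The helper name is hypothetical; the point is that pinning the default codec keeps RandomCodec from substituting a quantized KNN format, which would perturb exact-score assertions.

// Sketch only: build a deterministic index with a fixed codec, mirroring
// getStableIndexStore / configStandardCodec above.
private Directory buildFixedCodecIndex(float[][] vectors) throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
  try (IndexWriter writer = new IndexWriter(dir, iwc)) {
    for (int i = 0; i < vectors.length; i++) {
      Document doc = new Document();
      doc.add(new KnnFloatVectorField("field", vectors[i], VectorSimilarityFunction.COSINE));
      doc.add(new StringField("id", "id" + i, Field.Store.YES));
      writer.addDocument(doc);
    }
  }
  return dir;
}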
@@ -34,13 +34,15 @@
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.QueryTimeout;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.hnsw.HnswUtil;

@LuceneTestCase.SuppressCodecs("SimpleText")
@@ -381,7 +383,7 @@ public void testBoostQuery() throws IOException {
}
}

public void testVectorsAboveSimilarity() throws IOException {
void testVectorsAboveSimilarity() throws IOException {
// Pick number of docs to accept
int numAccepted = random().nextInt(numDocs / 3, numDocs / 2);
float delta = 1e-3f;
@@ -401,7 +403,10 @@ public void testVectorsAboveSimilarity() throws IOException {
}
}

try (Directory indexStore = getIndexStore(vectors);
// TODO test with random codec params via getIndexStore(vectors);
// this is challenging because scores will vary in a quantized index
// and precomputing as above will not be accurate
try (Directory indexStore = getStableIndexStore(vectors);
IndexReader reader = DirectoryReader.open(indexStore)) {
IndexSearcher searcher = newSearcher(reader);

@@ -587,6 +592,21 @@ final Directory getIndexStore(V... vectors) throws IOException {
return dir;
}

@SafeVarargs
final Directory getStableIndexStore(V... vectors) throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig().setCodec(TestUtil.getDefaultCodec());
try (IndexWriter writer = new IndexWriter(dir, iwc)) {
for (int i = 0; i < vectors.length; ++i) {
Document doc = new Document();
doc.add(getVectorField(vectorField, vectors[i], function));
doc.add(new IntField(idField, i, Field.Store.YES));
writer.addDocument(doc);
}
}
return dir;
}

private static class CountingQueryTimeout implements QueryTimeout {
private int remaining;

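Not part of the diff: a rough illustration of why the TODO above is hard to resolve. Once vector components are stored with reduced precision, a score precomputed from the exact floats can drift by more than the 1e-3 delta used in testVectorsAboveSimilarity. Crude rounding stands in for real scalar quantization here; the vectors and class name are made up.

// Illustrative only: precision loss shifts a precomputed dot product.
public class QuantizationDriftSketch {
  public static void main(String[] args) {
    float[] a = {0.123f, 0.456f, 0.789f};
    float[] b = {0.321f, 0.654f, 0.987f};
    double exact = 0, rounded = 0;
    for (int i = 0; i < a.length; i++) {
      exact += a[i] * b[i];
      // keep roughly two decimal digits per component
      rounded += (Math.round(a[i] * 100) / 100.0) * (Math.round(b[i] * 100) / 100.0);
    }
    // the two dot products differ by about 0.003, more than a 1e-3 delta
    System.out.println("exact=" + exact + " rounded=" + rounded);
  }
}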
@@ -44,6 +44,7 @@
import org.apache.lucene.util.VectorUtil;

public class TestKnnFloatVectorQuery extends BaseKnnVectorQueryTestCase {

@Override
KnnFloatVectorQuery getKnnVectorQuery(String field, float[] query, int k, Query queryFilter) {
return new KnnFloatVectorQuery(field, query, k, queryFilter);
@@ -130,16 +131,16 @@ public void testScoreNegativeDotProduct() throws IOException {
DocIdSetIterator it = scorer.iterator();
assertEquals(2, it.cost());
assertEquals(0, it.nextDoc());
assertEquals(0, scorer.score(), 0);
assertEquals(0, scorer.score(), EPSILON);
assertEquals(1, it.advance(1));
assertEquals(1, scorer.score(), 0);
assertEquals(1, scorer.score(), EPSILON);
}
}
}

public void testScoreDotProduct() throws IOException {
try (Directory d = newDirectory()) {
try (IndexWriter w = new IndexWriter(d, new IndexWriterConfig())) {
try (IndexWriter w = new IndexWriter(d, configStandardCodec())) {
for (int j = 1; j <= 5; j++) {
Document doc = new Document();
doc.add(
@@ -174,7 +175,7 @@ public void testScoreDotProduct() throws IOException {
(float) ((1 + (2 * 2 + 3 * 4) / Math.sqrt((2 * 2 + 3 * 3) * (2 * 2 + 4 * 4))) / 2);

// doc 1 happens to have the max score
assertEquals(score1, scorer.getMaxScore(2), 0.0001);
assertEquals(score1, scorer.getMaxScore(2), 0.001);
assertEquals(score1, scorer.getMaxScore(Integer.MAX_VALUE), 0.0001);

DocIdSetIterator it = scorer.iterator();