Added rescorer in hybrid query (#917)

* Initial version for rescorer Signed-off-by: Martin Gaievski <gaievski@amazon.com> (cherry picked from commit 9f4a49a) Signed-off-by: Martin Gaievski <gaievski@amazon.com>
opensearch-project · Oct 4, 2024 · ec73d2b · ec73d2b
1 parent 8a786fe
commit ec73d2b
Show file tree

Hide file tree

Showing 8 changed files with 674 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Features
 ### Enhancements
 - Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
+- Added rescorer in hybrid query ([#917](https://github.com/opensearch-project/neural-search/pull/917))
 ### Bug Fixes
 ### Infrastructure
 ### Documentation

diff --git a/qa/rolling-upgrade/build.gradle b/qa/rolling-upgrade/build.gradle
@@ -76,6 +76,15 @@ task testAgainstOldCluster(type: StandaloneRestIntegTestTask) {
         }
     }
 
+    // Hybrid query with rescore is not compatible with 2.14 and lower, so the rescore BWC test
+    // is excluded whenever the old cluster runs one of the versions listed below
+    if (["2.9", "2.10", "2.11", "2.12", "2.13", "2.14"].any { prefix -> ext.neural_search_bwc_version.startsWith(prefix) }) {
+        filter {
+            excludeTestsMatching "org.opensearch.neuralsearch.bwc.HybridSearchWithRescoreIT.*"
+        }
+    }
+
     // Excluding the test because we introduce this feature in 2.13
     if (ext.neural_search_bwc_version.startsWith("2.11") || ext.neural_search_bwc_version.startsWith("2.12")){
         filter {

diff --git a/...ling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/HybridSearchWithRescoreIT.java b/...ling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/HybridSearchWithRescoreIT.java
@@ -0,0 +1,150 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.bwc;
+
+import org.opensearch.index.query.MatchQueryBuilder;
+import org.opensearch.index.query.QueryBuilder;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.knn.index.query.rescore.RescoreContext;
+import org.opensearch.neuralsearch.query.HybridQueryBuilder;
+import org.opensearch.neuralsearch.query.NeuralQueryBuilder;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+
+import static org.opensearch.neuralsearch.util.TestUtils.NODES_BWC_CLUSTER;
+import static org.opensearch.neuralsearch.util.TestUtils.PARAM_NAME_WEIGHTS;
+import static org.opensearch.neuralsearch.util.TestUtils.TEXT_EMBEDDING_PROCESSOR;
+import static org.opensearch.neuralsearch.util.TestUtils.DEFAULT_NORMALIZATION_METHOD;
+import static org.opensearch.neuralsearch.util.TestUtils.DEFAULT_COMBINATION_METHOD;
+import static org.opensearch.neuralsearch.util.TestUtils.getModelId;
+
+/**
+ * Rolling-upgrade test that runs a hybrid query together with a rescore query while the cluster
+ * moves through the OLD, MIXED and UPGRADED stages.
+ */
+public class HybridSearchWithRescoreIT extends AbstractRollingUpgradeTestCase {
+
+    private static final String PIPELINE_NAME = "nlp-hybrid-with_rescore-pipeline";
+    private static final String SEARCH_PIPELINE_NAME = "nlp-search-with_rescore-pipeline";
+    private static final String TEST_FIELD = "passage_text";
+    private static final String TEXT = "Hello world";
+    private static final String TEXT_MIXED = "Hi planet";
+    private static final String TEXT_UPGRADED = "Hi earth";
+    private static final String QUERY = "Hi world";
+    private static final int NUM_DOCS_PER_ROUND = 1;
+    private static final String VECTOR_EMBEDDING_FIELD = "passage_embedding";
+    protected static final String RESCORE_QUERY = "hi";
+    // id of the text embedding model; set at the start of every stage and used for cleanup in the last one
+    private static String modelId = "";
+
+    /**
+     * Test normalization with hybrid query and rescore. This test is required as rescore will not be compatible with version lower than 2.15
+     */
+    public void testNormalizationProcessorWithRescore_whenIndexWithMultipleShards_E2EFlow() throws Exception {
+        waitForClusterHealthGreen(NODES_BWC_CLUSTER);
+        switch (getClusterType()) {
+            case OLD:
+                prepareOldCluster();
+                break;
+            case MIXED:
+                verifyMixedCluster();
+                break;
+            case UPGRADED:
+                verifyUpgradedClusterAndCleanUp();
+                break;
+            default:
+                throw new IllegalStateException("Unexpected value: " + getClusterType());
+        }
+    }
+
+    /**
+     * OLD stage: uploads and loads the model, creates the ingest pipeline and the index, indexes the
+     * first document and registers the search pipeline with sub-query weights 0.3 and 0.7.
+     */
+    private void prepareOldCluster() throws Exception {
+        modelId = uploadTextEmbeddingModel();
+        loadModel(modelId);
+        createPipelineProcessor(modelId, PIPELINE_NAME);
+        createIndexWithConfiguration(
+            getIndexNameForTest(),
+            Files.readString(Path.of(classLoader.getResource("processor/IndexMappings.json").toURI())),
+            PIPELINE_NAME
+        );
+        addDocument(getIndexNameForTest(), "0", TEST_FIELD, TEXT, null, null);
+        createSearchPipeline(
+            SEARCH_PIPELINE_NAME,
+            DEFAULT_NORMALIZATION_METHOD,
+            DEFAULT_COMBINATION_METHOD,
+            Map.of(PARAM_NAME_WEIGHTS, Arrays.toString(new float[] { 0.3f, 0.7f }))
+        );
+    }
+
+    /**
+     * MIXED stage: the first round searches with a rescorer and then indexes the second document;
+     * any later round searches without a rescorer and expects two documents.
+     */
+    private void verifyMixedCluster() throws Exception {
+        modelId = getModelId(getIngestionPipeline(PIPELINE_NAME), TEXT_EMBEDDING_PROCESSOR);
+        if (isFirstMixedRound()) {
+            validateTestIndexOnUpgrade(NUM_DOCS_PER_ROUND, modelId, getQueryBuilder(modelId, null, null), buildRescorer());
+            addDocument(getIndexNameForTest(), "1", TEST_FIELD, TEXT_MIXED, null, null);
+            return;
+        }
+        validateTestIndexOnUpgrade(2 * NUM_DOCS_PER_ROUND, modelId, getQueryBuilder(modelId, null, null), null);
+    }
+
+    /**
+     * UPGRADED stage: indexes the third document, searches with a rescorer twice (plain neural query,
+     * then neural query with method parameters and a neural rescore context) and always wipes the
+     * test resources afterwards.
+     */
+    private void verifyUpgradedClusterAndCleanUp() throws Exception {
+        try {
+            modelId = getModelId(getIngestionPipeline(PIPELINE_NAME), TEXT_EMBEDDING_PROCESSOR);
+            loadModel(modelId);
+            addDocument(getIndexNameForTest(), "2", TEST_FIELD, TEXT_UPGRADED, null, null);
+            final int expectedDocCount = 3 * NUM_DOCS_PER_ROUND;
+            final QueryBuilder rescorer = buildRescorer();
+            validateTestIndexOnUpgrade(expectedDocCount, modelId, getQueryBuilder(modelId, null, null), rescorer);
+            validateTestIndexOnUpgrade(
+                expectedDocCount,
+                modelId,
+                getQueryBuilder(modelId, Map.of("ef_search", 100), RescoreContext.getDefault()),
+                rescorer
+            );
+        } finally {
+            wipeOfTestResources(getIndexNameForTest(), PIPELINE_NAME, modelId, SEARCH_PIPELINE_NAME);
+        }
+    }
+
+    /**
+     * Creates the rescore query used by this test: a match query on the text field with boost 0.3.
+     */
+    private QueryBuilder buildRescorer() {
+        return QueryBuilders.matchQuery(TEST_FIELD, RESCORE_QUERY).boost(0.3f);
+    }
+
+    /**
+     * Checks the document count of the test index, then runs the hybrid query (optionally with a rescorer)
+     * through the search pipeline and verifies there is exactly one hit and every score lies in [0, 2].
+     *
+     * @param numberOfDocs expected number of documents in the test index
+     * @param modelId id of the model to load before searching
+     * @param hybridQueryBuilder hybrid query to execute
+     * @param rescorer rescore query, or null to search without rescoring
+     */
+    private void validateTestIndexOnUpgrade(
+        final int numberOfDocs,
+        final String modelId,
+        HybridQueryBuilder hybridQueryBuilder,
+        QueryBuilder rescorer
+    ) throws Exception {
+        final int actualDocCount = getDocCount(getIndexNameForTest());
+        assertEquals(numberOfDocs, actualDocCount);
+        loadModel(modelId);
+        final Map<String, Object> response = search(
+            getIndexNameForTest(),
+            hybridQueryBuilder,
+            rescorer,
+            1,
+            Map.of("search_pipeline", SEARCH_PIPELINE_NAME)
+        );
+        assertNotNull(response);
+        final int hitCount = getHitCount(response);
+        assertEquals(1, hitCount);
+        final List<Double> scores = getNormalizationScoreList(response);
+        scores.forEach(score -> assertTrue(0 <= score && score <= 2));
+    }
+
+    /**
+     * Builds a hybrid query with two sub-queries: a match query followed by a neural query.
+     *
+     * @param modelId model used by the neural sub-query
+     * @param methodParameters optional method parameters for the neural sub-query, skipped when null
+     * @param rescoreContextForNeuralQuery optional rescore context for the neural sub-query, skipped when null
+     * @return hybrid query builder holding both sub-queries
+     */
+    private HybridQueryBuilder getQueryBuilder(
+        final String modelId,
+        final Map<String, ?> methodParameters,
+        final RescoreContext rescoreContextForNeuralQuery
+    ) {
+        final NeuralQueryBuilder neuralQuery = new NeuralQueryBuilder();
+        neuralQuery.fieldName(VECTOR_EMBEDDING_FIELD);
+        neuralQuery.modelId(modelId);
+        neuralQuery.queryText(QUERY);
+        neuralQuery.k(5);
+        if (Objects.nonNull(methodParameters)) {
+            neuralQuery.methodParameters(methodParameters);
+        }
+        if (Objects.nonNull(rescoreContextForNeuralQuery)) {
+            neuralQuery.rescoreContext(rescoreContextForNeuralQuery);
+        }
+
+        final HybridQueryBuilder hybridQuery = new HybridQueryBuilder();
+        // NOTE(review): the match sub-query targets field "text" while documents are indexed into
+        // TEST_FIELD ("passage_text") - confirm this is intentional
+        hybridQuery.add(new MatchQueryBuilder("text", QUERY));
+        hybridQuery.add(neuralQuery);
+        return hybridQuery;
+    }
+}
diff --git a/src/main/java/org/opensearch/neuralsearch/search/query/HybridCollectorManager.java b/src/main/java/org/opensearch/neuralsearch/search/query/HybridCollectorManager.java
@@ -6,6 +6,7 @@
 
 import java.util.Locale;
 import lombok.RequiredArgsConstructor;
+import lombok.extern.log4j.Log4j2;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.CollectorManager;
@@ -33,7 +34,9 @@
 import org.opensearch.search.query.MultiCollectorWrapper;
 import org.opensearch.search.query.QuerySearchResult;
 import org.opensearch.search.query.ReduceableSearchResult;
+import org.opensearch.search.rescore.RescoreContext;
 import org.opensearch.search.sort.SortAndFormats;
+import org.opensearch.neuralsearch.search.query.exception.HybridSearchRescoreQueryException;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -55,6 +58,7 @@
  * In most cases it will be wrapped in MultiCollectorManager.
  */
 @RequiredArgsConstructor
+@Log4j2
 public abstract class HybridCollectorManager implements CollectorManager<Collector, ReduceableSearchResult> {
 
     private final int numHits;
@@ -67,6 +71,7 @@ public abstract class HybridCollectorManager implements CollectorManager<Collect
     private final TopDocsMerger topDocsMerger;
     @Nullable
     private final FieldDoc after;
+    private final SearchContext searchContext;
 
     /**
      * Create new instance of HybridCollectorManager depending on the concurrent search being enabled or disabled.
@@ -101,17 +106,15 @@ public static CollectorManager createHybridCollectorManager(final SearchContext
                 numDocs,
                 new HitsThresholdChecker(Math.max(numDocs, searchContext.trackTotalHitsUpTo())),
                 trackTotalHitsUpTo,
-                searchContext.sort(),
                 filteringWeight,
-                searchContext.searchAfter()
+                searchContext
             )
             : new HybridCollectorNonConcurrentManager(
                 numDocs,
                 new HitsThresholdChecker(Math.max(numDocs, searchContext.trackTotalHitsUpTo())),
                 trackTotalHitsUpTo,
-                searchContext.sort(),
                 filteringWeight,
-                searchContext.searchAfter()
+                searchContext
             );
     }
 
@@ -161,28 +164,82 @@ private List<ReduceableSearchResult> getSearchResults(final List<HybridSearchCol
         List<ReduceableSearchResult> results = new ArrayList<>();
         DocValueFormat[] docValueFormats = getSortValueFormats(sortAndFormats);
         for (HybridSearchCollector collector : hybridSearchCollectors) {
-            TopDocsAndMaxScore topDocsAndMaxScore = getTopDocsAndAndMaxScore(collector, docValueFormats);
+            boolean isSortEnabled = docValueFormats != null;
+            TopDocsAndMaxScore topDocsAndMaxScore = getTopDocsAndAndMaxScore(collector, isSortEnabled);
             results.add((QuerySearchResult result) -> reduceCollectorResults(result, topDocsAndMaxScore, docValueFormats));
         }
         return results;
     }
 
-    private TopDocsAndMaxScore getTopDocsAndAndMaxScore(
-        final HybridSearchCollector hybridSearchCollector,
-        final DocValueFormat[] docValueFormats
-    ) {
-        TopDocs newTopDocs;
+    private TopDocsAndMaxScore getTopDocsAndAndMaxScore(final HybridSearchCollector hybridSearchCollector, final boolean isSortEnabled) {
         List topDocs = hybridSearchCollector.topDocs();
-        if (docValueFormats != null) {
-            newTopDocs = getNewTopFieldDocs(
-                getTotalHits(this.trackTotalHitsUpTo, topDocs, hybridSearchCollector.getTotalHits()),
-                topDocs,
-                sortAndFormats.sort.getSort()
-            );
-        } else {
-            newTopDocs = getNewTopDocs(getTotalHits(this.trackTotalHitsUpTo, topDocs, hybridSearchCollector.getTotalHits()), topDocs);
+        if (isSortEnabled) {
+            return getSortedTopDocsAndMaxScore(topDocs, hybridSearchCollector);
+        }
+        return getTopDocsAndMaxScore(topDocs, hybridSearchCollector);
+    }
+
+    /**
+     * Builds the result for a sorted request: the collector's field docs are passed to getNewTopFieldDocs
+     * together with the total hits and the sort fields, and the outcome is paired with the collector's max score.
+     */
+    private TopDocsAndMaxScore getSortedTopDocsAndMaxScore(List<TopFieldDocs> topDocs, HybridSearchCollector hybridSearchCollector) {
+        return new TopDocsAndMaxScore(
+            getNewTopFieldDocs(
+                getTotalHits(this.trackTotalHitsUpTo, topDocs, hybridSearchCollector.getTotalHits()),
+                topDocs,
+                sortAndFormats.sort.getSort()
+            ),
+            hybridSearchCollector.getMaxScore()
+        );
+    }
+
+    /**
+     * Builds the result for a request without sorting. When the search context carries rescore contexts the
+     * top docs are rescored first; the max score and the final top docs are then derived from that list.
+     */
+    private TopDocsAndMaxScore getTopDocsAndMaxScore(List<TopDocs> topDocs, HybridSearchCollector hybridSearchCollector) {
+        final List<TopDocs> effectiveTopDocs = shouldRescore() ? rescore(topDocs) : topDocs;
+        final float maxScore = calculateMaxScore(effectiveTopDocs, hybridSearchCollector.getMaxScore());
+        return new TopDocsAndMaxScore(
+            getNewTopDocs(
+                getTotalHits(this.trackTotalHitsUpTo, effectiveTopDocs, hybridSearchCollector.getTotalHits()),
+                effectiveTopDocs
+            ),
+            maxScore
+        );
+    }
+
+    /**
+     * Tells whether the search context holds at least one rescore context.
+     */
+    private boolean shouldRescore() {
+        final List<RescoreContext> contexts = searchContext.rescore();
+        if (contexts == null) {
+            return false;
+        }
+        return contexts.isEmpty() == false;
+    }
+
+    /**
+     * Applies every rescore context of the search context in order; the output of one rescore pass
+     * is the input of the next one.
+     */
+    private List<TopDocs> rescore(List<TopDocs> topDocs) {
+        List<TopDocs> current = topDocs;
+        for (final RescoreContext rescoreContext : searchContext.rescore()) {
+            current = rescoredTopDocs(rescoreContext, current);
+        }
+        return current;
+    }
+
+    /**
+     * Runs the rescorer of the given context over each element of topDocs and collects the results in the
+     * same order. The input topDocs may be modified during this process. An IOException raised by the
+     * rescorer is logged and rethrown as HybridSearchRescoreQueryException.
+     */
+    private List<TopDocs> rescoredTopDocs(final RescoreContext ctx, final List<TopDocs> topDocs) {
+        final List<TopDocs> rescored = new ArrayList<>(topDocs.size());
+        // a single try around the loop is equivalent to one per element because the handler always rethrows
+        try {
+            for (final TopDocs subQueryTopDocs : topDocs) {
+                rescored.add(ctx.rescorer().rescore(subQueryTopDocs, searchContext.searcher(), ctx));
+            }
+        } catch (IOException exception) {
+            log.error("rescore failed for hybrid query in collector_manager.reduce call", exception);
+            throw new HybridSearchRescoreQueryException(exception);
+        }
+        return rescored;
+    }
+
+    /**
+     * Calculates the maximum score from the provided TopDocs, considering rescoring. Without rescore
+     * contexts the initial max score is returned unchanged; otherwise it is raised to the highest
+     * leading score found among the given top docs.
+     */
+    private float calculateMaxScore(List<TopDocs> topDocsList, float initialMaxScore) {
+        if (shouldRescore() == false) {
+            return initialMaxScore;
+        }
+        float maxScore = initialMaxScore;
+        for (final TopDocs topDocs : topDocsList) {
+            if (topDocs.scoreDocs == null || topDocs.scoreDocs.length == 0) {
+                continue;
+            }
+            // first top doc for each sub-query has the max score because top docs are sorted by score desc
+            maxScore = Math.max(maxScore, topDocs.scoreDocs[0].score);
+        }
+        return maxScore;
+    }
 
     private List<HybridSearchCollector> getHybridSearchCollectors(final Collection<Collector> collectors) {
@@ -415,18 +472,18 @@ public HybridCollectorNonConcurrentManager(
             int numHits,
             HitsThresholdChecker hitsThresholdChecker,
             int trackTotalHitsUpTo,
-            SortAndFormats sortAndFormats,
             Weight filteringWeight,
-            ScoreDoc searchAfter
+            SearchContext searchContext
         ) {
             super(
                 numHits,
                 hitsThresholdChecker,
                 trackTotalHitsUpTo,
-                sortAndFormats,
+                searchContext.sort(),
                 filteringWeight,
-                new TopDocsMerger(sortAndFormats),
-                (FieldDoc) searchAfter
+                new TopDocsMerger(searchContext.sort()),
+                searchContext.searchAfter(),
+                searchContext
             );
             scoreCollector = Objects.requireNonNull(super.newCollector(), "collector for hybrid query cannot be null");
         }
@@ -453,18 +510,18 @@ public HybridCollectorConcurrentSearchManager(
             int numHits,
             HitsThresholdChecker hitsThresholdChecker,
             int trackTotalHitsUpTo,
-            SortAndFormats sortAndFormats,
             Weight filteringWeight,
-            ScoreDoc searchAfter
+            SearchContext searchContext
         ) {
             super(
                 numHits,
                 hitsThresholdChecker,
                 trackTotalHitsUpTo,
-                sortAndFormats,
+                searchContext.sort(),
                 filteringWeight,
-                new TopDocsMerger(sortAndFormats),
-                (FieldDoc) searchAfter
+                new TopDocsMerger(searchContext.sort()),
+                searchContext.searchAfter(),
+                searchContext
             );
         }
     }

diff --git a/src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryPhaseSearcher.java b/src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryPhaseSearcher.java
@@ -66,7 +66,9 @@ public boolean searchWith(
             }
             Query hybridQuery = extractHybridQuery(searchContext, query);
             QueryPhaseSearcher queryPhaseSearcher = getQueryPhaseSearcher(searchContext);
-            return queryPhaseSearcher.searchWith(searchContext, searcher, hybridQuery, collectors, hasFilterCollector, hasTimeout);
+            queryPhaseSearcher.searchWith(searchContext, searcher, hybridQuery, collectors, hasFilterCollector, hasTimeout);
+            // we decide on rescore later in collector manager
+            return false;
         }
     }