
Commit

Adding L2 norm technique (#236)
* Adding L2 norm technique

Signed-off-by: Martin Gaievski <gaievski@amazon.com>
martin-gaievski authored Jul 31, 2023
1 parent fe72dbc commit 6ad641a
Showing 8 changed files with 559 additions and 20 deletions.
@@ -0,0 +1,104 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.neuralsearch.processor.combination;

import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

/**
* Abstracts combination of scores based on arithmetic mean method
*/
public class ArithmeticMeanScoreCombinationTechnique implements ScoreCombinationTechnique {

public static final String TECHNIQUE_NAME = "arithmetic_mean";
public static final String PARAM_NAME_WEIGHTS = "weights";
private static final Set<String> SUPPORTED_PARAMS = Set.of(PARAM_NAME_WEIGHTS);
private static final Float ZERO_SCORE = 0.0f;
private final List<Float> weights;

public ArithmeticMeanScoreCombinationTechnique(final Map<String, Object> params) {
validateParams(params);
weights = getWeights(params);
}

private List<Float> getWeights(final Map<String, Object> params) {
if (Objects.isNull(params) || params.isEmpty()) {
return List.of();
}
// get the weights; no instanceof check is needed here because it was already done during validation
return ((List<Double>) params.getOrDefault(PARAM_NAME_WEIGHTS, List.of())).stream()
.map(Double::floatValue)
.collect(Collectors.toUnmodifiableList());
}

/**
* Arithmetic mean method for combining scores.
* score = (weight1*score1 + weight2*score2 +...+ weightN*scoreN)/(weight1 + weight2 + ... + weightN)
*
* Zero (0.0) scores are excluded from number of scores N
*/
@Override
public float combine(final float[] scores) {
float combinedScore = 0.0f;
float weights = 0;
for (int indexOfSubQuery = 0; indexOfSubQuery < scores.length; indexOfSubQuery++) {
float score = scores[indexOfSubQuery];
if (score >= 0.0) {
float weight = getWeightForSubQuery(indexOfSubQuery);
score = score * weight;
combinedScore += score;
weights += weight;
}
}
if (weights == 0.0f) {
return ZERO_SCORE;
}
return combinedScore / weights;
}

private void validateParams(final Map<String, Object> params) {
if (Objects.isNull(params) || params.isEmpty()) {
return;
}
// check if only supported params are passed
Optional<String> optionalNotSupportedParam = params.keySet()
.stream()
.filter(paramName -> !SUPPORTED_PARAMS.contains(paramName))
.findFirst();
if (optionalNotSupportedParam.isPresent()) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"provided parameter for combination technique is not supported. supported parameters are [%s]",
SUPPORTED_PARAMS.stream().collect(Collectors.joining(","))
)
);
}

// check param types
if (params.keySet().stream().anyMatch(PARAM_NAME_WEIGHTS::equalsIgnoreCase)) {
if (!(params.get(PARAM_NAME_WEIGHTS) instanceof List)) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "parameter [%s] must be a collection of numbers", PARAM_NAME_WEIGHTS)
);
}
}
}

/**
* Get weight for sub-query based on its index in the hybrid search query. Use user provided weight or 1.0 otherwise
* @param indexOfSubQuery 0-based index of sub-query in the Hybrid Search query
* @return weight for sub-query, use one that is set in processor/pipeline definition or 1.0 as default
*/
private float getWeightForSubQuery(int indexOfSubQuery) {
return indexOfSubQuery < weights.size() ? weights.get(indexOfSubQuery) : 1.0f;
}
}
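
For reviewers, a minimal usage sketch, not part of this commit, of how the weighted arithmetic mean above combines two sub-query scores. The standalone class, the direct instantiation, and the literal weights are illustrative assumptions; in the plugin the technique is created from the processor/pipeline configuration rather than constructed by hand.

// Illustrative sketch only (not part of the commit); assumes direct instantiation of the technique above.
import java.util.List;
import java.util.Map;

import org.opensearch.neuralsearch.processor.combination.ArithmeticMeanScoreCombinationTechnique;

public final class ArithmeticMeanCombinationExample {
    public static void main(String[] args) {
        // weights are provided as doubles, which is what getWeights() expects to cast
        Map<String, Object> params = Map.of("weights", List.of(0.4d, 0.7d));
        ArithmeticMeanScoreCombinationTechnique technique = new ArithmeticMeanScoreCombinationTechnique(params);
        // scores of the two sub-queries of a hybrid query for one document
        float combined = technique.combine(new float[] { 0.5f, 1.0f });
        // (0.4 * 0.5 + 0.7 * 1.0) / (0.4 + 0.7) ≈ 0.818
        System.out.println(combined);
    }
}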
@@ -0,0 +1,88 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/

package org.opensearch.neuralsearch.processor.normalization;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.opensearch.neuralsearch.search.CompoundTopDocs;

/**
* Abstracts normalization of scores based on L2 method
*/
public class L2ScoreNormalizationTechnique implements ScoreNormalizationTechnique {

public static final String TECHNIQUE_NAME = "l2";
private static final float MIN_SCORE = 0.001f;

/**
* L2 normalization method.
* n_score_i = score_i / sqrt(score_1^2 + score_2^2 + ... + score_n^2)
* Main algorithm steps:
* - calculate the sum of squares of all scores
* - iterate over each result and update its score using the formula above, where "score" is the raw score returned by the Hybrid query
*/
@Override
public void normalize(final List<CompoundTopDocs> queryTopDocs) {
// get l2 norms for each sub-query
List<Float> normsPerSubquery = getL2Norm(queryTopDocs);

// do normalization using actual score and l2 norm
for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
if (Objects.isNull(compoundQueryTopDocs)) {
continue;
}
List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getCompoundTopDocs();
for (int j = 0; j < topDocsPerSubQuery.size(); j++) {
TopDocs subQueryTopDoc = topDocsPerSubQuery.get(j);
for (ScoreDoc scoreDoc : subQueryTopDoc.scoreDocs) {
scoreDoc.score = normalizeSingleScore(scoreDoc.score, normsPerSubquery.get(j));
}
}
}
}

private List<Float> getL2Norm(final List<CompoundTopDocs> queryTopDocs) {
// find any non-empty compound top docs; it is either empty (the shard has no results for any of the sub-queries)
// or it has results for all sub-queries. In the edge case of a shard having results for only one sub-query, there
// will still be a TopDocs entry with zero total hits for each of the remaining sub-queries
int numOfSubqueries = queryTopDocs.stream()
.filter(Objects::nonNull)
.filter(topDocs -> topDocs.getCompoundTopDocs().size() > 0)
.findAny()
.get()
.getCompoundTopDocs()
.size();
float[] l2Norms = new float[numOfSubqueries];
for (CompoundTopDocs compoundQueryTopDocs : queryTopDocs) {
if (Objects.isNull(compoundQueryTopDocs)) {
continue;
}
List<TopDocs> topDocsPerSubQuery = compoundQueryTopDocs.getCompoundTopDocs();
int bound = topDocsPerSubQuery.size();
for (int index = 0; index < bound; index++) {
for (ScoreDoc scoreDocs : topDocsPerSubQuery.get(index).scoreDocs) {
l2Norms[index] += scoreDocs.score * scoreDocs.score;
}
}
}
for (int index = 0; index < l2Norms.length; index++) {
l2Norms[index] = (float) Math.sqrt(l2Norms[index]);
}
List<Float> l2NormList = new ArrayList<>();
for (int index = 0; index < numOfSubqueries; index++) {
l2NormList.add(l2Norms[index]);
}
return l2NormList;
}

private float normalizeSingleScore(final float score, final float l2Norm) {
return l2Norm == 0 ? MIN_SCORE : score / l2Norm;
}
}
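
A small worked sketch, not part of this commit, of the L2 formula from the javadoc applied to one sub-query's raw scores; the standalone class is illustrative and mirrors getL2Norm() and normalizeSingleScore() above.

// Illustrative sketch only (not part of the commit); mirrors the L2 normalization steps above.
import java.util.Arrays;

public final class L2NormalizationExample {
    public static void main(String[] args) {
        float[] scores = { 3.0f, 4.0f };                 // raw scores of a single sub-query
        double sumOfSquares = 0.0;
        for (float score : scores) {
            sumOfSquares += score * score;               // 3^2 + 4^2 = 25
        }
        float l2Norm = (float) Math.sqrt(sumOfSquares);  // sqrt(25) = 5
        for (int i = 0; i < scores.length; i++) {
            // same guard as normalizeSingleScore(): fall back to a small constant when the norm is zero
            scores[i] = l2Norm == 0.0f ? 0.001f : scores[i] / l2Norm;
        }
        System.out.println(Arrays.toString(scores));     // [0.6, 0.8]
    }
}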
@@ -17,7 +17,9 @@ public class ScoreNormalizationFactory {

private final Map<String, ScoreNormalizationTechnique> scoreNormalizationMethodsMap = Map.of(
MinMaxScoreNormalizationTechnique.TECHNIQUE_NAME,
- new MinMaxScoreNormalizationTechnique()
+ new MinMaxScoreNormalizationTechnique(),
+ L2ScoreNormalizationTechnique.TECHNIQUE_NAME,
+ new L2ScoreNormalizationTechnique()
);
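
With this change the factory's map covers both normalization names that a normalization-processor config can reference. A hedged sketch, not part of this commit, of how such a name-to-technique lookup resolves "l2"; the standalone map and class below are assumptions for illustration, since the factory's actual accessor is outside this hunk.

// Illustrative sketch only (not part of the commit); the factory's real lookup method is not shown in this hunk.
import java.util.Map;

import org.opensearch.neuralsearch.processor.normalization.L2ScoreNormalizationTechnique;
import org.opensearch.neuralsearch.processor.normalization.MinMaxScoreNormalizationTechnique;
import org.opensearch.neuralsearch.processor.normalization.ScoreNormalizationTechnique;

public final class NormalizationLookupExample {
    public static void main(String[] args) {
        Map<String, ScoreNormalizationTechnique> techniques = Map.of(
            MinMaxScoreNormalizationTechnique.TECHNIQUE_NAME, new MinMaxScoreNormalizationTechnique(),
            L2ScoreNormalizationTechnique.TECHNIQUE_NAME, new L2ScoreNormalizationTechnique()
        );
        // "l2" is the value of L2ScoreNormalizationTechnique.TECHNIQUE_NAME referenced by the pipeline config
        ScoreNormalizationTechnique normalizer = techniques.get("l2");
        System.out.println(normalizer.getClass().getSimpleName());
    }
}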

/**
@@ -107,7 +107,7 @@ protected String uploadModel(String requestBody) throws Exception {
ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
);
Map<String, Object> uploadResJson = XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
+ XContentType.JSON.xContent(),
EntityUtils.toString(uploadResponse.getEntity()),
false
);
@@ -136,7 +136,7 @@ protected void loadModel(String modelId) throws Exception {
ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
);
Map<String, Object> uploadResJson = XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
+ XContentType.JSON.xContent(),
EntityUtils.toString(uploadResponse.getEntity()),
false
);
@@ -185,7 +185,7 @@ protected float[] runInference(String modelId, String queryText) {
);

Map<String, Object> inferenceResJson = XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
+ XContentType.JSON.xContent(),
EntityUtils.toString(inferenceResponse.getEntity()),
false
);
@@ -215,7 +215,7 @@ protected void createIndexWithConfiguration(String indexName, String indexConfig
ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
);
Map<String, Object> node = XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
+ XContentType.JSON.xContent(),
EntityUtils.toString(response.getEntity()),
false
);
@@ -239,7 +239,7 @@ protected void createPipelineProcessor(String modelId, String pipelineName) thro
ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
);
Map<String, Object> node = XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
+ XContentType.JSON.xContent(),
EntityUtils.toString(pipelineCreateResponse.getEntity()),
false
);
@@ -329,7 +329,7 @@ protected Map<String, Object> search(

String responseBody = EntityUtils.toString(response.getEntity());

- return XContentHelper.convertToMap(XContentFactory.xContent(XContentType.JSON), responseBody, false);
+ return XContentHelper.convertToMap(XContentType.JSON.xContent(), responseBody, false);
}

/**
@@ -445,11 +445,7 @@ protected Map<String, Object> getTaskQueryResponse(String taskId) throws Excepti
toHttpEntity(""),
ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
);
- return XContentHelper.convertToMap(
- XContentFactory.xContent(XContentType.JSON),
- EntityUtils.toString(taskQueryResponse.getEntity()),
- false
- );
+ return XContentHelper.convertToMap(XContentType.JSON.xContent(), EntityUtils.toString(taskQueryResponse.getEntity()), false);
}

protected boolean checkComplete(Map<String, Object> node) {
@@ -27,6 +27,7 @@
import org.opensearch.index.query.TermQueryBuilder;
import org.opensearch.knn.index.SpaceType;
import org.opensearch.neuralsearch.common.BaseNeuralSearchIT;
import org.opensearch.neuralsearch.processor.normalization.L2ScoreNormalizationTechnique;
import org.opensearch.neuralsearch.query.HybridQueryBuilder;
import org.opensearch.neuralsearch.query.NeuralQueryBuilder;

@@ -93,12 +94,7 @@ protected boolean preserveClusterUponCompletion() {
* "technique": "min-max"
* },
* "combination": {
* "technique": "sum",
* "parameters": {
* "weights": [
* 0.4, 0.7
* ]
* }
* "technique": "arithmetic_mean"
* }
* }
* }
@@ -251,6 +247,29 @@ public void testResultProcessor_whenMultipleShardsAndPartialMatches_thenSuccessf
assertQueryResults(searchResponseAsMap, 4, true);
}

/**
* Using search pipelines with result processor configs like below:
* {
* "description": "Post processor for hybrid search",
* "phase_results_processors": [
* {
* "normalization-processor": {
* "normalization": {
* "technique": "min-max"
* },
* "combination": {
* "technique": "arithmetic_mean",
* "parameters": {
* "weights": [
* 0.4, 0.7
* ]
* }
* }
* }
* }
* ]
* }
*/
@SneakyThrows
public void testArithmeticWeightedMean_whenWeightsPassed_thenSuccessful() {
initializeIndexIfNotExist(TEST_MULTI_DOC_INDEX_THREE_SHARDS_NAME);
@@ -337,6 +356,74 @@ public void testArithmeticWeightedMean_whenWeightsPassed_thenSuccessful() {
assertWeightedScores(searchResponseWithWeights4AsMap, 1.0, 1.0, 0.001);
}

/**
* Using search pipelines with config for l2 norm:
* {
* "description": "Post processor for hybrid search",
* "phase_results_processors": [
* {
* "normalization-processor": {
* "normalization": {
* "technique": "l2"
* },
* "combination": {
* "technique": "arithmetic_mean"
* }
* }
* }
* ]
* }
*/
@SneakyThrows
public void testL2Norm_whenOneShardAndQueryMatches_thenSuccessful() {
initializeIndexIfNotExist(TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME);
createSearchPipeline(
SEARCH_PIPELINE,
L2ScoreNormalizationTechnique.TECHNIQUE_NAME,
COMBINATION_METHOD,
Map.of(PARAM_NAME_WEIGHTS, Arrays.toString(new float[] { 0.8f, 0.7f }))
);

NeuralQueryBuilder neuralQueryBuilder = new NeuralQueryBuilder(TEST_KNN_VECTOR_FIELD_NAME_1, "", modelId.get(), 5, null, null);
TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery(TEST_TEXT_FIELD_NAME_1, TEST_QUERY_TEXT3);

HybridQueryBuilder hybridQueryBuilder = new HybridQueryBuilder();
hybridQueryBuilder.add(neuralQueryBuilder);
hybridQueryBuilder.add(termQueryBuilder);

Map<String, Object> searchResponseAsMap = search(
TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME,
hybridQueryBuilder,
null,
5,
Map.of("search_pipeline", SEARCH_PIPELINE)
);
int totalExpectedDocQty = 5;
assertNotNull(searchResponseAsMap);
Map<String, Object> total = getTotalHits(searchResponseAsMap);
assertNotNull(total.get("value"));
assertEquals(totalExpectedDocQty, total.get("value"));
assertNotNull(total.get("relation"));
assertEquals(RELATION_EQUAL_TO, total.get("relation"));
assertTrue(getMaxScore(searchResponseAsMap).isPresent());
assertTrue(Range.between(.6f, 1.0f).contains(getMaxScore(searchResponseAsMap).get()));

List<Map<String, Object>> hitsNestedList = getNestedHits(searchResponseAsMap);
List<String> ids = new ArrayList<>();
List<Double> scores = new ArrayList<>();
for (Map<String, Object> oneHit : hitsNestedList) {
ids.add((String) oneHit.get("_id"));
scores.add((Double) oneHit.get("_score"));
}
// verify scores order
assertTrue(IntStream.range(0, scores.size() - 1).noneMatch(idx -> scores.get(idx) < scores.get(idx + 1)));
// verify the scores are normalized. For L2 normalization the max score will not be 1.0, so we check against a range
assertTrue(Range.between(.6f, 1.0f).contains((float) scores.stream().map(Double::floatValue).max(Double::compare).get()));

// verify that all ids are unique
assertEquals(Set.copyOf(ids).size(), ids.size());
}

private void initializeIndexIfNotExist(String indexName) throws IOException {
if (TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME.equalsIgnoreCase(indexName) && !indexExists(TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME)) {
prepareKnnIndex(