From 5e697329560bfd12914d49b8ee2498cb8dc067d3 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 20 Mar 2019 08:27:22 -0400 Subject: [PATCH] Add l1norm and l2norm distances for vectors Add L1norm - Manhattan distance Add L2norm - Euclidean distance --- .../query-dsl/script-score-query.asciidoc | 104 +++++++++++ .../index/query/ScoreScriptUtils.java | 163 ++++++++++++++++++ .../index/query/docvalues_whitelist.txt | 4 + .../index/query/ScoreScriptUtilsTests.java | 23 +++ .../test/dense-vector/10_basic.yml | 61 +++++++ .../test/sparse-vector/10_basic.yml | 63 ++++++- 6 files changed, 417 insertions(+), 1 deletion(-) diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index ee68d3e40fe13..c053a1d6b73bb 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -173,6 +173,110 @@ between a given query vector and document vectors. -------------------------------------------------- // NOTCONSOLE +For dense_vector fields, `l1norm` calculates L^1^ distance +(Manhattan distance) between a given query vector and +document vectors. + +[source,js] +-------------------------------------------------- +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "l1norm(params.queryVector, doc['my_dense_vector'])", + "params": { + "queryVector": [4, 3.4, -0.2] + } + } + } + } +} +-------------------------------------------------- +// NOTCONSOLE + +Note that, unlike `cosineSimilarity`, which represents +similarity, `l1norm` and `l2norm` (shown below) represent distances or +differences. This means that the more similar the vectors are, +the lower the scores produced by the `l1norm` and `l2norm` functions will be. 
+Thus, if you need more similar vectors to score higher, you should +reverse the output from `l1norm` and `l2norm`: + +`"source": " 1/ l1norm(params.queryVector, doc['my_dense_vector'])"` + +For sparse_vector fields, `l1normSparse` calculates L^1^ distance +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "l1normSparse(params.queryVector, doc['my_sparse_vector'])", + "params": { + "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// NOTCONSOLE + +For dense_vector fields, `l2norm` calculates L^2^ distance +(Euclidean distance) between a given query vector and +document vectors. + +[source,js] +-------------------------------------------------- +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "l2norm(params.queryVector, doc['my_dense_vector'])", + "params": { + "queryVector": [4, 3.4, -0.2] + } + } + } + } +} +-------------------------------------------------- +// NOTCONSOLE + +Similarly, for sparse_vector fields, `l2normSparse` calculates L^2^ distance +between a given query vector and document vectors. + +[source,js] +-------------------------------------------------- +{ + "query": { + "script_score": { + "query": { + "match_all": {} + }, + "script": { + "source": "l2normSparse(params.queryVector, doc['my_sparse_vector'])", + "params": { + "queryVector": {"2": 0.5, "10" : 111.3, "50": -1.3, "113": 14.8, "4545": 156.0} + } + } + } + } +} +-------------------------------------------------- +// NOTCONSOLE + + NOTE: If a document doesn't have a value for a vector field on which a vector function is executed, 0 is returned as a result for this document. 
diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/query/ScoreScriptUtils.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/query/ScoreScriptUtils.java index 93e80d2a653fb..145ba115b0591 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/query/ScoreScriptUtils.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/query/ScoreScriptUtils.java @@ -32,6 +32,53 @@ public class ScoreScriptUtils { //**************FUNCTIONS FOR DENSE VECTORS + /** + * Calculate l1 norm - Manhattan distance + * between a query's dense vector and documents' dense vectors + * + * @param queryVector the query vector parsed as {@code List} from json + * @param dvs VectorScriptDocValues representing encoded documents' vectors + */ + public static double l1norm(List queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){ + BytesRef value = dvs.getEncodedValue(); + if (value == null) return 0; + float[] docVector = VectorEncoderDecoder.decodeDenseVector(value); + + int dims = Math.min(queryVector.size(), docVector.length); + int dim = 0; + double l1norm = 0; + Iterator queryVectorIter = queryVector.iterator(); + while(dim < dims) { + l1norm += Math.abs(queryVectorIter.next().doubleValue() - docVector[dim]); + dim++; + } + return l1norm; + } + + /** + * Calculate l2 norm - Euclidean distance + * between a query's dense vector and documents' dense vectors + * + * @param queryVector the query vector parsed as {@code List} from json + * @param dvs VectorScriptDocValues representing encoded documents' vectors + */ + public static double l2norm(List queryVector, VectorScriptDocValues.DenseVectorScriptDocValues dvs){ + BytesRef value = dvs.getEncodedValue(); + if (value == null) return 0; + float[] docVector = VectorEncoderDecoder.decodeDenseVector(value); + + int dims = Math.min(queryVector.size(), docVector.length); + int dim = 0; + double l2norm = 0; + Iterator queryVectorIter = queryVector.iterator(); + 
while(dim < dims) { + double diff = queryVectorIter.next().doubleValue() - docVector[dim]; + l2norm += diff * diff; + dim++; + } + return Math.sqrt(l2norm); + } + /** * Calculate a dot product between a query's dense vector and documents' dense vectors * @@ -100,6 +147,122 @@ private static double intDotProduct(List v1, float[] v2){ //**************FUNCTIONS FOR SPARSE VECTORS + /** + * Calculate l1 norm - Manhattan distance + * between a query's sparse vector and documents' sparse vectors + * + * L1NormSparse is implemented as a class to use + * painless script caching to prepare queryVector + * only once per script execution for all documents. + * A user will call `l1normSparse(params.queryVector, doc['my_vector'])` + */ + public static final class L1NormSparse { + final double[] queryValues; + final int[] queryDims; + + // prepare queryVector once per script execution + // queryVector represents a map of dimensions to values + public L1NormSparse(Map queryVector) { + //break vector into two arrays dims and values + int n = queryVector.size(); + queryDims = new int[n]; + queryValues = new double[n]; + int i = 0; + for (Map.Entry dimValue : queryVector.entrySet()) { + try { + queryDims[i] = Integer.parseInt(dimValue.getKey()); + } catch (final NumberFormatException e) { + throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e); + } + queryValues[i] = dimValue.getValue().doubleValue(); + i++; + } + // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions + sortSparseDimsDoubleValues(queryDims, queryValues, n); + } + + public double l1normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { + BytesRef value = dvs.getEncodedValue(); + if (value == null) return 0; + int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value); + float[] docValues = VectorEncoderDecoder.decodeSparseVector(value); + int queryIndex = 0; + int docIndex = 0; + double 
l1norm = 0; + // find common dimensions among vectors v1 and v2 and calculate l1norm based on common dimensions + while (queryIndex < queryDims.length && docIndex < docDims.length) { + if (queryDims[queryIndex] == docDims[docIndex]) { + l1norm += Math.abs(queryValues[queryIndex] - docValues[docIndex]); + queryIndex++; + docIndex++; + } else if (queryDims[queryIndex] > docDims[docIndex]) { + docIndex++; + } else { + queryIndex++; + } + } + return l1norm; + } + } + + /** + * Calculate l2 norm - Euclidean distance + * between a query's sparse vector and documents' sparse vectors + * + * L2NormSparse is implemented as a class to use + * painless script caching to prepare queryVector + * only once per script execution for all documents. + * A user will call `l2normSparse(params.queryVector, doc['my_vector'])` + */ + public static final class L2NormSparse { + final double[] queryValues; + final int[] queryDims; + + // prepare queryVector once per script execution + // queryVector represents a map of dimensions to values + public L2NormSparse(Map queryVector) { + //break vector into two arrays dims and values + int n = queryVector.size(); + queryDims = new int[n]; + queryValues = new double[n]; + int i = 0; + for (Map.Entry dimValue : queryVector.entrySet()) { + try { + queryDims[i] = Integer.parseInt(dimValue.getKey()); + } catch (final NumberFormatException e) { + throw new IllegalArgumentException("Failed to parse a query vector dimension, it must be an integer!", e); + } + queryValues[i] = dimValue.getValue().doubleValue(); + i++; + } + // Sort dimensions in the ascending order and sort values in the same order as their corresponding dimensions + sortSparseDimsDoubleValues(queryDims, queryValues, n); + } + + public double l2normSparse(VectorScriptDocValues.SparseVectorScriptDocValues dvs) { + BytesRef value = dvs.getEncodedValue(); + if (value == null) return 0; + int[] docDims = VectorEncoderDecoder.decodeSparseVectorDims(value); + float[] docValues = 
VectorEncoderDecoder.decodeSparseVector(value); + int queryIndex = 0; + int docIndex = 0; + double l2norm = 0; + // find common dimensions among the query and document vectors and calculate l2norm based on common dimensions + while (queryIndex < queryDims.length && docIndex < docDims.length) { + if (queryDims[queryIndex] == docDims[docIndex]) { + double diff = queryValues[queryIndex] - docValues[docIndex]; + l2norm += diff * diff; + queryIndex++; + docIndex++; + } else if (queryDims[queryIndex] > docDims[docIndex]) { + docIndex++; + } else { + queryIndex++; + } + } + return Math.sqrt(l2norm); + } + } /** * Calculate a dot product between a query's sparse vector and documents' sparse vectors diff --git a/modules/mapper-extras/src/main/resources/org/elasticsearch/index/query/docvalues_whitelist.txt b/modules/mapper-extras/src/main/resources/org/elasticsearch/index/query/docvalues_whitelist.txt index 3a8989e20b020..cac48257fc32c 100644 --- a/modules/mapper-extras/src/main/resources/org/elasticsearch/index/query/docvalues_whitelist.txt +++ b/modules/mapper-extras/src/main/resources/org/elasticsearch/index/query/docvalues_whitelist.txt @@ -25,8 +25,12 @@ class org.elasticsearch.index.query.VectorScriptDocValues$SparseVectorScriptDocV } static_import { + double l1norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.index.query.ScoreScriptUtils + double l2norm(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.index.query.ScoreScriptUtils double cosineSimilarity(List, VectorScriptDocValues.DenseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$CosineSimilarity double dotProduct(List, VectorScriptDocValues.DenseVectorScriptDocValues) from_class org.elasticsearch.index.query.ScoreScriptUtils + double l1normSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$L1NormSparse + double l2normSparse(Map, 
VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$L2NormSparse double dotProductSparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$DotProductSparse double cosineSimilaritySparse(Map, VectorScriptDocValues.SparseVectorScriptDocValues) bound_to org.elasticsearch.index.query.ScoreScriptUtils$CosineSimilaritySparse } \ No newline at end of file diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/query/ScoreScriptUtilsTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/query/ScoreScriptUtilsTests.java index bcdf0387c3f71..1d816eea3bd32 100644 --- a/modules/mapper-extras/src/test/java/org/elasticsearch/index/query/ScoreScriptUtilsTests.java +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/query/ScoreScriptUtilsTests.java @@ -25,6 +25,8 @@ import org.elasticsearch.index.query.ScoreScriptUtils.CosineSimilarity; import org.elasticsearch.index.query.ScoreScriptUtils.DotProductSparse; import org.elasticsearch.index.query.ScoreScriptUtils.CosineSimilaritySparse; +import org.elasticsearch.index.query.ScoreScriptUtils.L1NormSparse; +import org.elasticsearch.index.query.ScoreScriptUtils.L2NormSparse; import java.util.Arrays; import java.util.HashMap; @@ -33,6 +35,9 @@ import static org.elasticsearch.index.mapper.VectorEncoderDecoderTests.mockEncodeDenseVector; import static org.elasticsearch.index.query.ScoreScriptUtils.dotProduct; +import static org.elasticsearch.index.query.ScoreScriptUtils.l1norm; +import static org.elasticsearch.index.query.ScoreScriptUtils.l2norm; + import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -53,6 +58,14 @@ public void testDenseVectorFunctions() { CosineSimilarity cosineSimilarity = new CosineSimilarity(queryVector); double result2 = cosineSimilarity.cosineSimilarity(dvs); assertEquals("cosineSimilarity result is not equal to the expected 
value!", 0.78, result2, 0.1); + + // test l1Norm + double result3 = l1norm(queryVector, dvs); + assertEquals("l1norm result is not equal to the expected value!", 485.18, result3, 0.1); + + // test l2norm + double result4 = l2norm(queryVector, dvs); + assertEquals("l2norm result is not equal to the expected value!", 301.36, result4, 0.1); } public void testSparseVectorFunctions() { @@ -78,5 +91,15 @@ public void testSparseVectorFunctions() { CosineSimilaritySparse cosineSimilaritySparse = new CosineSimilaritySparse(queryVector); double result2 = cosineSimilaritySparse.cosineSimilaritySparse(dvs); assertEquals("cosineSimilaritySparse result is not equal to the expected value!", 0.78, result2, 0.1); + + // test l1norm + L1NormSparse l1Norm = new L1NormSparse(queryVector); + double result3 = l1Norm.l1normSparse(dvs); + assertEquals("l1normSparse result is not equal to the expected value!", 485.18, result3, 0.1); + + // test l2norm + L2NormSparse l2Norm = new L2NormSparse(queryVector); + double result4 = l2Norm.l2normSparse(dvs); + assertEquals("l2normSparse result is not equal to the expected value!", 301.36, result4, 0.1); } } diff --git a/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_basic.yml b/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_basic.yml index e5db535b69b80..7649de62d839f 100644 --- a/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_basic.yml +++ b/modules/mapper-extras/src/test/resources/rest-api-spec/test/dense-vector/10_basic.yml @@ -98,3 +98,64 @@ setup: - match: {hits.hits.2._id: "1"} - gte: {hits.hits.2._score: 0.78} - lte: {hits.hits.2._score: 0.791} + + +--- +"L1 norm": + - do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l1norm(params.query_vector, doc['my_dense_vector'])" + params: + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + + - 
match: {hits.total: 3} + + - match: {hits.hits.0._id: "1"} + - gte: {hits.hits.0._score: 485.18} + - lte: {hits.hits.0._score: 485.19} + + - match: {hits.hits.1._id: "2"} + - gte: {hits.hits.1._score: 12.25} + - lte: {hits.hits.1._score: 12.35} + + - match: {hits.hits.2._id: "3"} + - gte: {hits.hits.2._score: 0.00} + - lte: {hits.hits.2._score: 0.01} + +--- +"L2 norm": +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l2norm(params.query_vector, doc['my_dense_vector'])" + params: + query_vector: [0.5, 111.3, -13.0, 14.8, -156.0] + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "1"} +- gte: {hits.hits.0._score: 301.36} +- lte: {hits.hits.0._score: 301.37} + +- match: {hits.hits.1._id: "2"} +- gte: {hits.hits.1._score: 11.34} +- lte: {hits.hits.1._score: 11.35} + +- match: {hits.hits.2._id: "3"} +- gte: {hits.hits.2._score: 0.00} +- lte: {hits.hits.2._score: 0.01} diff --git a/modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_basic.yml b/modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_basic.yml index 142a80291aebf..329c372196bac 100644 --- a/modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_basic.yml +++ b/modules/mapper-extras/src/test/resources/rest-api-spec/test/sparse-vector/10_basic.yml @@ -83,7 +83,7 @@ setup: script: source: "cosineSimilaritySparse(params.query_vector, doc['my_sparse_vector'])" params: - query_vector: {"2": -0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} - match: {hits.total: 3} @@ -98,3 +98,64 @@ setup: - match: {hits.hits.2._id: "1"} - gte: {hits.hits.2._score: 0.78} - lte: {hits.hits.2._score: 0.791} + +--- +"L1 norm": +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + 
script_score: + query: {match_all: {} } + script: + source: "l1normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "1"} +- gte: {hits.hits.0._score: 485.18} +- lte: {hits.hits.0._score: 485.19} + +- match: {hits.hits.1._id: "2"} +- gte: {hits.hits.1._score: 12.25} +- lte: {hits.hits.1._score: 12.35} + +- match: {hits.hits.2._id: "3"} +- gte: {hits.hits.2._score: 0.00} +- lte: {hits.hits.2._score: 0.01} + + +--- +"L2 norm": +- do: + headers: + Content-Type: application/json + search: + rest_total_hits_as_int: true + body: + query: + script_score: + query: {match_all: {} } + script: + source: "l2normSparse(params.query_vector, doc['my_sparse_vector'])" + params: + query_vector: {"2": 0.5, "10" : 111.3, "50": -13.0, "113": 14.8, "4545": -156.0} + +- match: {hits.total: 3} + +- match: {hits.hits.0._id: "1"} +- gte: {hits.hits.0._score: 301.36} +- lte: {hits.hits.0._score: 301.37} + +- match: {hits.hits.1._id: "2"} +- gte: {hits.hits.1._score: 11.34} +- lte: {hits.hits.1._score: 11.35} + +- match: {hits.hits.2._id: "3"} +- gte: {hits.hits.2._score: 0.00} +- lte: {hits.hits.2._score: 0.01}