Add cross encoder support (opensearch-project#1615)

* add text similarity inputs and function name Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add text similarity cross encoder model Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add text similarity unit tests Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add text similarity input unittests Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add text similarity dataset unittests Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add function name annotation Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * refactor API to use single query Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * omit private from class vars Co-authored-by: Navneet Verma <vermanavneet003@gmail.com> Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * change output name from logits to similarity Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * hashify isDLModel Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add error message for non-torchscript cross encoders Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * allow onnx, actually. Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * apply spotless after rebase Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add unittest for new mlinput toXcontent clause Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * static DLModels Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * add tests and error message tweaks Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * name test models w framework Signed-off-by: HenryL27 <hmlindeman@yahoo.com> * change pt->torch_script Signed-off-by: HenryL27 <hmlindeman@yahoo.com> --------- Signed-off-by: HenryL27 <hmlindeman@yahoo.com> Co-authored-by: Navneet Verma <vermanavneet003@gmail.com>
HenryL27 · Dec 7, 2023 · 2761d7d · 2761d7d
1 parent d71c77f
commit 2761d7d
Show file tree

Hide file tree

Showing 13 changed files with 932 additions and 4 deletions.
diff --git a/common/src/main/java/org/opensearch/ml/common/FunctionName.java b/common/src/main/java/org/opensearch/ml/common/FunctionName.java
@@ -5,6 +5,9 @@
 
 package org.opensearch.ml.common;
 
+import java.util.HashSet;
+import java.util.Set;
+
 public enum FunctionName {
     LINEAR_REGRESSION,
     KMEANS,
@@ -17,6 +20,7 @@ public enum FunctionName {
     RCF_SUMMARIZE,
     LOGISTIC_REGRESSION,
     TEXT_EMBEDDING,
+    TEXT_SIMILARITY,
     SPARSE_ENCODING,
     SPARSE_TOKENIZE,
     METRICS_CORRELATION,
@@ -30,14 +34,18 @@ public static FunctionName from(String value) {
         }
     }
 
+    // Function names that isDLModel reports as deep learning models. Note for maintainers: the
+    // HashSet copy makes contains(null) return false, whereas the immutable Set.of(...) view may
+    // throw NullPointerException for a null argument.
+    private static final Set<FunctionName> DL_MODELS = new HashSet<>(
+        Set.of(TEXT_EMBEDDING, TEXT_SIMILARITY, SPARSE_ENCODING, SPARSE_TOKENIZE)
+    );
+
     /**
      * Check if model is deep learning model.
      * @return true for deep learning model.
      */
     public static boolean isDLModel(FunctionName functionName) {
-        if (functionName == TEXT_EMBEDDING || functionName == SPARSE_ENCODING || functionName == SPARSE_TOKENIZE) {
-            return true;
-        }
-        return false;
+        return DL_MODELS.contains(functionName);
     }
 }
diff --git a/common/src/main/java/org/opensearch/ml/common/dataset/MLInputDataType.java b/common/src/main/java/org/opensearch/ml/common/dataset/MLInputDataType.java
@@ -9,5 +9,6 @@ public enum MLInputDataType {
     SEARCH_QUERY,
     DATA_FRAME,
     TEXT_DOCS,
+    TEXT_SIMILARITY,
     REMOTE
 }
diff --git a/common/src/main/java/org/opensearch/ml/common/dataset/TextSimilarityInputDataSet.java b/common/src/main/java/org/opensearch/ml/common/dataset/TextSimilarityInputDataSet.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2023 Aryn
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.opensearch.ml.common.dataset;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import org.opensearch.core.common.io.stream.StreamInput;
+import org.opensearch.core.common.io.stream.StreamOutput;
+import org.opensearch.ml.common.annotation.InputDataSet;
+
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.experimental.FieldDefaults;
+
+/** Input dataset for text similarity: a single query text plus the documents to compare it against. */
+@Getter
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+@InputDataSet(MLInputDataType.TEXT_SIMILARITY)
+public class TextSimilarityInputDataSet extends MLInputDataset {
+    List<String> textDocs;
+    String queryText;
+
+    /** Builds the dataset; a null argument raises NullPointerException, an empty doc list IllegalArgumentException. */
+    @Builder(toBuilder = true)
+    public TextSimilarityInputDataSet(String queryText, List<String> textDocs) {
+        super(MLInputDataType.TEXT_SIMILARITY);
+        Objects.requireNonNull(textDocs, "textDocs must not be null");
+        this.queryText = Objects.requireNonNull(queryText, "queryText must not be null");
+        if (textDocs.isEmpty()) {
+            throw new IllegalArgumentException("No text documents were provided");
+        }
+        this.textDocs = textDocs;
+    }
+
+    /** Reads the query text, an int doc count, then that many docs: the fields writeTo emits after super.writeTo. */
+    public TextSimilarityInputDataSet(StreamInput in) throws IOException {
+        super(MLInputDataType.TEXT_SIMILARITY);
+        this.queryText = in.readString();
+        int size = in.readInt();
+        this.textDocs = new ArrayList<>();
+        for (int i = 0; i < size; i++) {
+            this.textDocs.add(in.readString());
+        }
+    }
+
+    /** Writes the parent's content first, then the query text, the doc count as an int, and each doc in order. */
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+        out.writeString(queryText);
+        out.writeInt(textDocs.size());
+        for (String doc : textDocs) {
+            out.writeString(doc);
+        }
+    }
+}
diff --git a/common/src/main/java/org/opensearch/ml/common/input/MLInput.java b/common/src/main/java/org/opensearch/ml/common/input/MLInput.java
@@ -8,6 +8,7 @@
 import lombok.Builder;
 import lombok.Data;
 import lombok.NoArgsConstructor;
+
 import org.opensearch.core.common.io.stream.StreamInput;
 import org.opensearch.core.common.io.stream.StreamOutput;
 import org.opensearch.core.xcontent.XContentBuilder;
@@ -21,6 +22,7 @@
 import org.opensearch.ml.common.dataset.SearchQueryInputDataset;
 import org.opensearch.ml.common.FunctionName;
 import org.opensearch.ml.common.dataset.TextDocsInputDataSet;
+import org.opensearch.ml.common.dataset.TextSimilarityInputDataSet;
 import org.opensearch.ml.common.input.parameter.MLAlgoParams;
 import org.opensearch.search.builder.SearchSourceBuilder;
 
@@ -55,6 +57,8 @@ public class MLInput implements Input {
     public static final String TARGET_RESPONSE_POSITIONS_FIELD = "target_response_positions";
     // Input text sentences for text embedding model
     public static final String TEXT_DOCS_FIELD = "text_docs";
+    // Input query text to compare against for text similarity model
+    public static final String QUERY_TEXT_FIELD = "query_text";
 
     // Algorithm name
     protected FunctionName algorithm;
@@ -157,6 +161,20 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
                             builder.field(TARGET_RESPONSE_POSITIONS_FIELD, targetPositions.toArray(new Integer[0]));
                         }
                     }
+                    break;
+                case TEXT_SIMILARITY:
+                    TextSimilarityInputDataSet ds = (TextSimilarityInputDataSet) this.inputDataset;
+                    List<String> tdocs = ds.getTextDocs();
+                    String queryText = ds.getQueryText();
+                    builder.field(QUERY_TEXT_FIELD, queryText);
+                    if (tdocs != null && !tdocs.isEmpty()) {
+                        builder.startArray(TEXT_DOCS_FIELD);
+                        for(String d : tdocs) {
+                            builder.value(d);
+                        }
+                        builder.endArray();
+                    }
+                    break;
                 default:
                     break;
             }
@@ -186,6 +204,7 @@ public static MLInput parse(XContentParser parser, String inputAlgoName) throws
         List<String> targetResponse = new ArrayList<>();
         List<Integer> targetResponsePositions = new ArrayList<>();
         List<String> textDocs = new ArrayList<>();
+        String queryText = null;
 
         ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.currentToken(), parser);
         while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
@@ -233,6 +252,9 @@ public static MLInput parse(XContentParser parser, String inputAlgoName) throws
                         textDocs.add(parser.text());
                     }
                     break;
+                case QUERY_TEXT_FIELD:
+                    queryText = parser.text();
+                    break;
                 default:
                     parser.skipChildren();
                     break;
@@ -243,6 +265,9 @@ public static MLInput parse(XContentParser parser, String inputAlgoName) throws
             ModelResultFilter filter = new ModelResultFilter(returnBytes, returnNumber, targetResponse, targetResponsePositions);
             inputDataSet = new TextDocsInputDataSet(textDocs, filter);
         }
+        if (algorithm == FunctionName.TEXT_SIMILARITY) {
+            inputDataSet = new TextSimilarityInputDataSet(queryText, textDocs);
+        }
         return new MLInput(algorithm, mlParameters, searchSourceBuilder, sourceIndices, dataFrame, inputDataSet);
     }
 

diff --git a/common/src/main/java/org/opensearch/ml/common/input/nlp/TextSimilarityMLInput.java b/common/src/main/java/org/opensearch/ml/common/input/nlp/TextSimilarityMLInput.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2023 Aryn
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.opensearch.ml.common.input.nlp;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.opensearch.core.common.io.stream.StreamInput;
+import org.opensearch.core.common.io.stream.StreamOutput;
+import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.core.xcontent.XContentParser;
+import org.opensearch.ml.common.FunctionName;
+import org.opensearch.ml.common.dataset.MLInputDataset;
+import org.opensearch.ml.common.dataset.TextSimilarityInputDataSet;
+import org.opensearch.ml.common.input.MLInput;
+
+import static org.opensearch.core.xcontent.XContentParserUtils.ensureExpectedToken;
+
+
+/**
+ * MLInput for text similarity algorithms; use this for cross encoder models.
+ * Inputs are a query text and a list of text documents; outputs are real numbers.
+ */
+@org.opensearch.ml.common.annotation.MLInput(functionNames = {FunctionName.TEXT_SIMILARITY})
+public class TextSimilarityMLInput extends MLInput {
+    public TextSimilarityMLInput(FunctionName algorithm, MLInputDataset dataset) {
+        super(algorithm, null, dataset);
+    }
+
+    public TextSimilarityMLInput(StreamInput in) throws IOException {
+        super(in);
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        super.writeTo(out);
+    }
+
+    /** Writes algorithm, parameters (if set) and, when a dataset is present, its query text plus any text docs. */
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject();
+        builder.field(ALGORITHM_FIELD, algorithm.name());
+        if (parameters != null) {
+            builder.field(ML_PARAMETERS_FIELD, parameters);
+        }
+        if (inputDataset != null) {
+            TextSimilarityInputDataSet ds = (TextSimilarityInputDataSet) this.inputDataset;
+            builder.field(QUERY_TEXT_FIELD, ds.getQueryText());
+            List<String> docs = ds.getTextDocs();
+            if (docs != null && !docs.isEmpty()) {
+                builder.startArray(TEXT_DOCS_FIELD);
+                for (String d : docs) {
+                    builder.value(d);
+                }
+                builder.endArray();
+            }
+        }
+        builder.endObject();
+        return builder;
+    }
+
+    /**
+     * Parses the text docs array and the query text from the current object; other fields are skipped.
+     * @throws IllegalArgumentException if no text documents or no query text were provided
+     */
+    public TextSimilarityMLInput(XContentParser parser, FunctionName functionName) throws IOException {
+        super();
+        this.algorithm = functionName;
+        List<String> docs = new ArrayList<>();
+        String queryText = null;
+
+        ensureExpectedToken(XContentParser.Token.START_OBJECT, parser.currentToken(), parser);
+        while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
+            String fieldName = parser.currentName();
+            parser.nextToken();
+            switch (fieldName) {
+                case TEXT_DOCS_FIELD:
+                    ensureExpectedToken(XContentParser.Token.START_ARRAY, parser.currentToken(), parser);
+                    while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
+                        docs.add(parser.text());
+                    }
+                    break;
+                case QUERY_TEXT_FIELD:
+                    queryText = parser.text();
+                    break; // without this, control falls through into the default branch
+                default:
+                    parser.skipChildren();
+                    break;
+            }
+        }
+        if (docs.isEmpty()) {
+            throw new IllegalArgumentException("No text documents were provided");
+        }
+        if (queryText == null) {
+            throw new IllegalArgumentException("No query text was provided");
+        }
+        inputDataset = new TextSimilarityInputDataSet(queryText, docs);
+    }
+}
diff --git a/common/src/test/java/org/opensearch/ml/common/dataset/TextSimilarityInputDatasetTest.java b/common/src/test/java/org/opensearch/ml/common/dataset/TextSimilarityInputDatasetTest.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2023 Aryn
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.opensearch.ml.common.dataset;
+
+import static org.junit.Assert.assertThrows;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.Test;
+import org.opensearch.common.io.stream.BytesStreamOutput;
+import org.opensearch.core.common.bytes.BytesReference;
+import org.opensearch.core.common.io.stream.BytesStreamInput;
+import org.opensearch.core.common.io.stream.OutputStreamStreamOutput;
+import org.opensearch.core.common.io.stream.StreamInput;
+import org.opensearch.core.common.io.stream.StreamOutput;
+
+/** Tests for {@link TextSimilarityInputDataSet}: stream round trip and constructor validation. */
+public class TextSimilarityInputDatasetTest {
+
+    @Test
+    public void testStreaming() throws IOException {
+        List<String> docs = List.of("That is a happy dog", "it's summer");
+        String queryText = "today is sunny";
+        TextSimilarityInputDataSet dataset = TextSimilarityInputDataSet.builder().queryText(queryText).textDocs(docs).build();
+        BytesStreamOutput outbytes = new BytesStreamOutput();
+        StreamOutput osso = new OutputStreamStreamOutput(outbytes);
+        dataset.writeTo(osso);
+        StreamInput in = new BytesStreamInput(BytesReference.toBytes(outbytes.bytes()));
+        TextSimilarityInputDataSet newDs = (TextSimilarityInputDataSet) MLInputDataset.fromStream(in);
+        org.junit.Assert.assertEquals(dataset.getTextDocs(), newDs.getTextDocs());
+        org.junit.Assert.assertEquals(dataset.getQueryText(), newDs.getQueryText());
+    }
+
+    @Test
+    public void noPairs_ThenFail() {
+        List<String> docs = List.of();
+        String queryText = "today is sunny";
+        IllegalArgumentException e = assertThrows(IllegalArgumentException.class,
+            () -> TextSimilarityInputDataSet.builder().textDocs(docs).queryText(queryText).build());
+        org.junit.Assert.assertEquals("No text documents were provided", e.getMessage());
+    }
+
+    @Test
+    public void noQuery_ThenFail() {
+        List<String> docs = List.of("That is a happy dog", "it's summer");
+        assertThrows(NullPointerException.class,
+            () -> TextSimilarityInputDataSet.builder().textDocs(docs).queryText(null).build());
+    }
+}