diff --git a/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle b/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
index 9237c3ae8918c..895cca2af7967 100644
--- a/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
+++ b/build-tools-internal/src/main/groovy/elasticsearch.ide.gradle
@@ -142,13 +142,18 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
     description = 'Enables preview features on native library module'
     dependsOn tasks.named("enableExternalConfiguration")
 
-    doLast {
-      ['main', 'test'].each { sourceSet ->
-        modifyXml(".idea/modules/libs/native/elasticsearch.libs.native.${sourceSet}.iml") { xml ->
-          xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = 'JDK_21_PREVIEW'
+    ext {
+      enablePreview = { moduleFile, languageLevel ->
+        modifyXml(moduleFile) { xml ->
+          xml.component.find { it.'@name' == 'NewModuleRootManager' }?.'@LANGUAGE_LEVEL' = languageLevel
         }
       }
     }
+
+    doLast {
+      enablePreview('.idea/modules/libs/native/elasticsearch.libs.native.main.iml', 'JDK_21_PREVIEW')
+      enablePreview('.idea/modules/libs/native/elasticsearch.libs.native.test.iml', 'JDK_21_PREVIEW')
+    }
   }
 
   tasks.register('buildDependencyArtifacts') {
diff --git a/docs/changelog/117595.yaml b/docs/changelog/117595.yaml
new file mode 100644
index 0000000000000..9360c372ac97e
--- /dev/null
+++ b/docs/changelog/117595.yaml
@@ -0,0 +1,5 @@
+pr: 117595
+summary: Fix for Deberta tokenizer when input sequence exceeds 512 tokens
+area: Machine Learning
+type: bug
+issues: []
diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
index 0b4a5b651d8d4..930dbee304790 100644
--- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
+++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/NlpTokenizer.java
@@ -331,6 +331,29 @@ public List<TokenizationResult.Tokens> tokenize(String seq1, String seq2, Tokeni
                 tokenIdsSeq2 = tokenIdsSeq2.subList(0, maxSequenceLength() - extraTokens - tokenIdsSeq1.size());
                 tokenPositionMapSeq2 = tokenPositionMapSeq2.subList(0, maxSequenceLength() - extraTokens - tokenIdsSeq1.size());
             }
+            case BALANCED -> {
+                isTruncated = true;
+                int firstSequenceLength = 0;
+
+                if (tokenIdsSeq2.size() > (maxSequenceLength() - getNumExtraTokensForSeqPair()) / 2) {
+                    firstSequenceLength = min(tokenIdsSeq1.size(), (maxSequenceLength() - getNumExtraTokensForSeqPair()) / 2);
+                } else {
+                    firstSequenceLength = min(
+                        tokenIdsSeq1.size(),
+                        maxSequenceLength() - tokenIdsSeq2.size() - getNumExtraTokensForSeqPair()
+                    );
+                }
+                int secondSequenceLength = min(
+                    tokenIdsSeq2.size(),
+                    maxSequenceLength() - firstSequenceLength - getNumExtraTokensForSeqPair()
+                );
+
+                tokenIdsSeq1 = tokenIdsSeq1.subList(0, firstSequenceLength);
+                tokenPositionMapSeq1 = tokenPositionMapSeq1.subList(0, firstSequenceLength);
+
+                tokenIdsSeq2 = tokenIdsSeq2.subList(0, secondSequenceLength);
+                tokenPositionMapSeq2 = tokenPositionMapSeq2.subList(0, secondSequenceLength);
+            }
             case NONE -> throw ExceptionsHelper.badRequestException(
                 "Input too large. The tokenized input length [{}] exceeds the maximum sequence length [{}]",
                 numTokens,
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
index 3590793b81abd..7460e17055a00 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/TextSimilarityProcessorTests.java
@@ -10,11 +10,13 @@
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.xpack.core.ml.inference.results.TextSimilarityInferenceResults;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.BertTokenization;
+import org.elasticsearch.xpack.core.ml.inference.trainedmodel.DebertaV2Tokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.TextSimilarityConfig;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
 import org.elasticsearch.xpack.core.ml.inference.trainedmodel.VocabularyConfig;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizationResult;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizer;
+import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2Tokenizer;
 import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;
 import org.elasticsearch.xpack.ml.inference.pytorch.results.PyTorchInferenceResult;
 
@@ -22,6 +24,8 @@
 import java.util.List;
 
 import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.BertTokenizerTests.TEST_CASED_VOCAB;
+import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2TokenizerTests.TEST_CASE_SCORES;
+import static org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DebertaV2TokenizerTests.TEST_CASE_VOCAB;
 import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.is;
@@ -62,6 +66,33 @@ public void testProcessor() throws IOException {
         assertThat(result.predictedValue(), closeTo(42, 1e-6));
     }
 
+    public void testBalancedTruncationWithLongInput() throws IOException {
+        String question = "Is Elasticsearch scalable?";
+        StringBuilder longInputBuilder = new StringBuilder();
+        for (int i = 0; i < 1000; i++) {
+            longInputBuilder.append(TEST_CASE_VOCAB.get(randomIntBetween(0, TEST_CASE_VOCAB.size() - 1))).append(i).append(" ");
+        }
+        String longInput = longInputBuilder.toString().trim();
+
+        DebertaV2Tokenization tokenization = new DebertaV2Tokenization(false, true, null, Tokenization.Truncate.BALANCED, -1);
+        DebertaV2Tokenizer tokenizer = DebertaV2Tokenizer.builder(TEST_CASE_VOCAB, TEST_CASE_SCORES, tokenization).build();
+        TextSimilarityConfig textSimilarityConfig = new TextSimilarityConfig(
+            question,
+            new VocabularyConfig(""),
+            tokenization,
+            "result",
+            TextSimilarityConfig.SpanScoreFunction.MAX
+        );
+        TextSimilarityProcessor processor = new TextSimilarityProcessor(tokenizer);
+        TokenizationResult tokenizationResult = processor.getRequestBuilder(textSimilarityConfig)
+            .buildRequest(List.of(longInput), "1", Tokenization.Truncate.BALANCED, -1, null)
+            .tokenization();
+
+        // Assert that the tokenization result is as expected
+        assertThat(tokenizationResult.anyTruncated(), is(true));
+        assertThat(tokenizationResult.getTokenization(0).tokenIds().length, equalTo(512));
+    }
+
     public void testResultFunctions() {
         BertTokenization tokenization = new BertTokenization(false, true, 384, Tokenization.Truncate.NONE, 128);
         BertTokenizer tokenizer = BertTokenizer.builder(TEST_CASED_VOCAB, tokenization).build();
diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
index a8461de8630ae..fc070ec25dc68 100644
--- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
+++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/DebertaV2TokenizerTests.java
@@ -23,7 +23,7 @@
 
 public class DebertaV2TokenizerTests extends ESTestCase {
 
-    private static final List<String> TEST_CASE_VOCAB = List.of(
+    public static final List<String> TEST_CASE_VOCAB = List.of(
        DebertaV2Tokenizer.CLASS_TOKEN,
         DebertaV2Tokenizer.PAD_TOKEN,
         DebertaV2Tokenizer.SEPARATOR_TOKEN,
@@ -48,7 +48,7 @@ public class DebertaV2TokenizerTests extends ESTestCase {
         "<0xAD>",
         "▁"
     );
-    private static final List<Double> TEST_CASE_SCORES = List.of(
+    public static final List<Double> TEST_CASE_SCORES = List.of(
         0.0,
         0.0,
         0.0,
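
Note on the BALANCED case added to NlpTokenizer above: the two sequences share the token budget that remains after the special tokens, and the half-budget cap on the first sequence applies only when the second sequence is itself too long to fit in its half; otherwise the first sequence absorbs the unused remainder. The standalone sketch below reproduces that arithmetic for illustration only. It is not part of the patch: the class and method names are invented, and the 3 special tokens assumed for a DeBERTa sequence pair ([CLS] seq1 [SEP] seq2 [SEP]) are an assumption standing in for getNumExtraTokensForSeqPair().

// Illustrative sketch of the BALANCED truncation arithmetic (hypothetical names).
final class BalancedTruncation {
    // Returns {len1, len2}: tokens kept from each sequence, so that
    // len1 + len2 + extraTokens <= maxSeqLen.
    static int[] truncatedLengths(int size1, int size2, int maxSeqLen, int extraTokens) {
        int budget = maxSeqLen - extraTokens;
        int len1;
        if (size2 > budget / 2) {
            // Both sequences compete for space: cap the first at half the budget.
            len1 = Math.min(size1, budget / 2);
        } else {
            // The second sequence fits in its half: give the remainder to the first.
            len1 = Math.min(size1, budget - size2);
        }
        int len2 = Math.min(size2, budget - len1);
        return new int[] { len1, len2 };
    }

    public static void main(String[] args) {
        // Two 1000-token sequences, 512-token limit, 3 special tokens assumed:
        // prints "254 + 255 + 3 = 512", matching the test's expected length.
        int[] lens = truncatedLengths(1000, 1000, 512, 3);
        System.out.println(lens[0] + " + " + lens[1] + " + 3 = " + (lens[0] + lens[1] + 3));
    }
}

With integer division, an odd budget gives the second sequence the extra token (254 vs. 255 here), which is why the test asserts an exact total of 512 rather than checking each half independently.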