From c70cf1d037d5b78f98787e9fd4ace78beff9a456 Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:58:06 -0700 Subject: [PATCH] Add SplitResponseProcessor to Search Pipelines (#14800) (#14863) * Add SplitResponseProcessor for search pipelines * Register the split processor factory * Address code review comments * Avoid list copy by casting array --------- (cherry picked from commit 45c5f8d3154db84e28a6dc201d9b31ff91288fde) Signed-off-by: Daniel Widdis Signed-off-by: github-actions[bot] Co-authored-by: github-actions[bot] Signed-off-by: kkewwei --- CHANGELOG.md | 1 + .../SearchPipelineCommonModulePlugin.java | 4 +- .../common/SplitResponseProcessor.java | 162 +++++++++++++ ...SearchPipelineCommonModulePluginTests.java | 2 +- .../common/SplitResponseProcessorTests.java | 213 ++++++++++++++++++ 5 files changed, 380 insertions(+), 2 deletions(-) create mode 100644 modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SplitResponseProcessor.java create mode 100644 modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SplitResponseProcessorTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 43f18a4b07057..89e1b6bd5e1ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add matchesPluginSystemIndexPattern to SystemIndexRegistry ([#14750](https://github.com/opensearch-project/OpenSearch/pull/14750)) - Add Plugin interface for loading application based configuration templates (([#14659](https://github.com/opensearch-project/OpenSearch/issues/14659))) - Add prefix mode verification setting for repository verification (([#14790](https://github.com/opensearch-project/OpenSearch/pull/14790))) +- Add SplitResponseProcessor to Search Pipelines (([#14800](https://github.com/opensearch-project/OpenSearch/issues/14800))) - Optimize TransportNodesAction to not send DiscoveryNodes for NodeStats, NodesInfo and ClusterStats call ([14749](https://github.com/opensearch-project/OpenSearch/pull/14749)) ### Dependencies diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePlugin.java b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePlugin.java index 1574621a8200e..d05101da2817c 100644 --- a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePlugin.java +++ b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePlugin.java @@ -96,7 +96,9 @@ public Map> getResponseProces TruncateHitsResponseProcessor.TYPE, new TruncateHitsResponseProcessor.Factory(), CollapseResponseProcessor.TYPE, - new CollapseResponseProcessor.Factory() + new CollapseResponseProcessor.Factory(), + SplitResponseProcessor.TYPE, + new SplitResponseProcessor.Factory() ) ); } diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SplitResponseProcessor.java b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SplitResponseProcessor.java new file mode 100644 index 0000000000000..0762f8f59b76e --- /dev/null +++ b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/SplitResponseProcessor.java @@ -0,0 +1,162 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.search.pipeline.common; + +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.common.collect.Tuple; +import org.opensearch.common.document.DocumentField; +import org.opensearch.common.xcontent.XContentHelper; +import org.opensearch.core.common.bytes.BytesReference; +import org.opensearch.core.xcontent.MediaType; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.ingest.ConfigurationUtils; +import org.opensearch.search.SearchHit; +import org.opensearch.search.pipeline.AbstractProcessor; +import org.opensearch.search.pipeline.Processor; +import org.opensearch.search.pipeline.SearchResponseProcessor; + +import java.util.Arrays; +import java.util.Map; +import java.util.Objects; + +/** + * Processor that sorts an array of items. + * Throws exception is the specified field is not an array. + */ +public class SplitResponseProcessor extends AbstractProcessor implements SearchResponseProcessor { + /** Key to reference this processor type from a search pipeline. */ + public static final String TYPE = "split"; + /** Key defining the string field to be split. */ + public static final String SPLIT_FIELD = "field"; + /** Key defining the delimiter used to split the string. This can be a regular expression pattern. */ + public static final String SEPARATOR = "separator"; + /** Optional key for handling empty trailing fields. */ + public static final String PRESERVE_TRAILING = "preserve_trailing"; + /** Optional key to put the split values in a different field. */ + public static final String TARGET_FIELD = "target_field"; + + private final String splitField; + private final String separator; + private final boolean preserveTrailing; + private final String targetField; + + SplitResponseProcessor( + String tag, + String description, + boolean ignoreFailure, + String splitField, + String separator, + boolean preserveTrailing, + String targetField + ) { + super(tag, description, ignoreFailure); + this.splitField = Objects.requireNonNull(splitField); + this.separator = Objects.requireNonNull(separator); + this.preserveTrailing = preserveTrailing; + this.targetField = targetField == null ? splitField : targetField; + } + + /** + * Getter function for splitField + * @return sortField + */ + public String getSplitField() { + return splitField; + } + + /** + * Getter function for separator + * @return separator + */ + public String getSeparator() { + return separator; + } + + /** + * Getter function for preserveTrailing + * @return preserveTrailing; + */ + public boolean isPreserveTrailing() { + return preserveTrailing; + } + + /** + * Getter function for targetField + * @return targetField + */ + public String getTargetField() { + return targetField; + } + + @Override + public String getType() { + return TYPE; + } + + @Override + public SearchResponse processResponse(SearchRequest request, SearchResponse response) throws Exception { + SearchHit[] hits = response.getHits().getHits(); + for (SearchHit hit : hits) { + Map fields = hit.getFields(); + if (fields.containsKey(splitField)) { + DocumentField docField = hit.getFields().get(splitField); + if (docField == null) { + throw new IllegalArgumentException("field [" + splitField + "] is null, cannot split."); + } + Object val = docField.getValue(); + if (val == null || !String.class.isAssignableFrom(val.getClass())) { + throw new IllegalArgumentException("field [" + splitField + "] is not a string, cannot split"); + } + Object[] strings = ((String) val).split(separator, preserveTrailing ? -1 : 0); + hit.setDocumentField(targetField, new DocumentField(targetField, Arrays.asList(strings))); + } + if (hit.hasSource()) { + BytesReference sourceRef = hit.getSourceRef(); + Tuple> typeAndSourceMap = XContentHelper.convertToMap( + sourceRef, + false, + (MediaType) null + ); + + Map sourceAsMap = typeAndSourceMap.v2(); + if (sourceAsMap.containsKey(splitField)) { + Object val = sourceAsMap.get(splitField); + if (val instanceof String) { + Object[] strings = ((String) val).split(separator, preserveTrailing ? -1 : 0); + sourceAsMap.put(targetField, Arrays.asList(strings)); + } + XContentBuilder builder = XContentBuilder.builder(typeAndSourceMap.v1().xContent()); + builder.map(sourceAsMap); + hit.sourceRef(BytesReference.bytes(builder)); + } + } + } + return response; + } + + static class Factory implements Processor.Factory { + + @Override + public SplitResponseProcessor create( + Map> processorFactories, + String tag, + String description, + boolean ignoreFailure, + Map config, + PipelineContext pipelineContext + ) { + String splitField = ConfigurationUtils.readStringProperty(TYPE, tag, config, SPLIT_FIELD); + String separator = ConfigurationUtils.readStringProperty(TYPE, tag, config, SEPARATOR); + boolean preserveTrailing = ConfigurationUtils.readBooleanProperty(TYPE, tag, config, PRESERVE_TRAILING, false); + String targetField = ConfigurationUtils.readStringProperty(TYPE, tag, config, TARGET_FIELD, splitField); + return new SplitResponseProcessor(tag, description, ignoreFailure, splitField, separator, preserveTrailing, targetField); + } + } +} diff --git a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePluginTests.java b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePluginTests.java index 519468ebe17ff..d4f9ae2490a10 100644 --- a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePluginTests.java +++ b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SearchPipelineCommonModulePluginTests.java @@ -82,7 +82,7 @@ public void testAllowlistNotSpecified() throws IOException { try (SearchPipelineCommonModulePlugin plugin = new SearchPipelineCommonModulePlugin()) { assertEquals(Set.of("oversample", "filter_query", "script"), plugin.getRequestProcessors(createParameters(settings)).keySet()); assertEquals( - Set.of("rename_field", "truncate_hits", "collapse"), + Set.of("rename_field", "truncate_hits", "collapse", "split"), plugin.getResponseProcessors(createParameters(settings)).keySet() ); assertEquals(Set.of(), plugin.getSearchPhaseResultsProcessors(createParameters(settings)).keySet()); diff --git a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SplitResponseProcessorTests.java b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SplitResponseProcessorTests.java new file mode 100644 index 0000000000000..fcbc8ccf43cff --- /dev/null +++ b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/SplitResponseProcessorTests.java @@ -0,0 +1,213 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a.java + * compatible open source license. + */ + +package org.opensearch.search.pipeline.common; + +import org.apache.lucene.search.TotalHits; +import org.opensearch.OpenSearchParseException; +import org.opensearch.action.search.SearchRequest; +import org.opensearch.action.search.SearchResponse; +import org.opensearch.action.search.SearchResponseSections; +import org.opensearch.common.document.DocumentField; +import org.opensearch.core.common.bytes.BytesArray; +import org.opensearch.index.query.QueryBuilder; +import org.opensearch.index.query.TermQueryBuilder; +import org.opensearch.ingest.RandomDocumentPicks; +import org.opensearch.search.SearchHit; +import org.opensearch.search.SearchHits; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class SplitResponseProcessorTests extends OpenSearchTestCase { + + private static final String NO_TRAILING = "one,two,three"; + private static final String TRAILING = "alpha,beta,gamma,"; + private static final String REGEX_DELIM = "one1two2three"; + + private SearchRequest createDummyRequest() { + QueryBuilder query = new TermQueryBuilder("field", "value"); + SearchSourceBuilder source = new SearchSourceBuilder().query(query); + return new SearchRequest().source(source); + } + + private SearchResponse createTestResponse() { + SearchHit[] hits = new SearchHit[2]; + + // one response with source + Map csvMap = new HashMap<>(); + csvMap.put("csv", new DocumentField("csv", List.of(NO_TRAILING))); + hits[0] = new SearchHit(0, "doc 1", csvMap, Collections.emptyMap()); + hits[0].sourceRef(new BytesArray("{ \"csv\" : \"" + NO_TRAILING + "\" }")); + hits[0].score(1f); + + // one without source + csvMap = new HashMap<>(); + csvMap.put("csv", new DocumentField("csv", List.of(TRAILING))); + hits[1] = new SearchHit(1, "doc 2", csvMap, Collections.emptyMap()); + hits[1].score(2f); + + SearchHits searchHits = new SearchHits(hits, new TotalHits(2, TotalHits.Relation.EQUAL_TO), 2); + SearchResponseSections searchResponseSections = new SearchResponseSections(searchHits, null, null, false, false, null, 0); + return new SearchResponse(searchResponseSections, null, 1, 1, 0, 10, null, null); + } + + private SearchResponse createTestResponseRegex() { + SearchHit[] hits = new SearchHit[1]; + + Map dsvMap = new HashMap<>(); + dsvMap.put("dsv", new DocumentField("dsv", List.of(REGEX_DELIM))); + hits[0] = new SearchHit(0, "doc 1", dsvMap, Collections.emptyMap()); + hits[0].sourceRef(new BytesArray("{ \"dsv\" : \"" + REGEX_DELIM + "\" }")); + hits[0].score(1f); + + SearchHits searchHits = new SearchHits(hits, new TotalHits(1, TotalHits.Relation.EQUAL_TO), 1); + SearchResponseSections searchResponseSections = new SearchResponseSections(searchHits, null, null, false, false, null, 0); + return new SearchResponse(searchResponseSections, null, 1, 1, 0, 10, null, null); + } + + private SearchResponse createTestResponseNullField() { + SearchHit[] hits = new SearchHit[1]; + + Map map = new HashMap<>(); + map.put("csv", null); + hits[0] = new SearchHit(0, "doc 1", map, Collections.emptyMap()); + hits[0].sourceRef(new BytesArray("{ \"csv\" : null }")); + hits[0].score(1f); + + SearchHits searchHits = new SearchHits(hits, new TotalHits(1, TotalHits.Relation.EQUAL_TO), 1); + SearchResponseSections searchResponseSections = new SearchResponseSections(searchHits, null, null, false, false, null, 0); + return new SearchResponse(searchResponseSections, null, 1, 1, 0, 10, null, null); + } + + private SearchResponse createTestResponseEmptyList() { + SearchHit[] hits = new SearchHit[1]; + + Map map = new HashMap<>(); + map.put("empty", new DocumentField("empty", List.of())); + hits[0] = new SearchHit(0, "doc 1", map, Collections.emptyMap()); + hits[0].sourceRef(new BytesArray("{ \"empty\" : [] }")); + hits[0].score(1f); + + SearchHits searchHits = new SearchHits(hits, new TotalHits(1, TotalHits.Relation.EQUAL_TO), 1); + SearchResponseSections searchResponseSections = new SearchResponseSections(searchHits, null, null, false, false, null, 0); + return new SearchResponse(searchResponseSections, null, 1, 1, 0, 10, null, null); + } + + private SearchResponse createTestResponseNotString() { + SearchHit[] hits = new SearchHit[1]; + + Map piMap = new HashMap<>(); + piMap.put("maps", new DocumentField("maps", List.of(Map.of("foo", "I'm the Map!")))); + hits[0] = new SearchHit(0, "doc 1", piMap, Collections.emptyMap()); + hits[0].sourceRef(new BytesArray("{ \"maps\" : [{ \"foo\" : \"I'm the Map!\"}]] }")); + hits[0].score(1f); + + SearchHits searchHits = new SearchHits(hits, new TotalHits(1, TotalHits.Relation.EQUAL_TO), 1); + SearchResponseSections searchResponseSections = new SearchResponseSections(searchHits, null, null, false, false, null, 0); + return new SearchResponse(searchResponseSections, null, 1, 1, 0, 10, null, null); + } + + public void testSplitResponse() throws Exception { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "csv", ",", false, "split"); + SearchResponse response = createTestResponse(); + SearchResponse splitResponse = splitResponseProcessor.processResponse(request, response); + + assertEquals(response.getHits(), splitResponse.getHits()); + + assertEquals(NO_TRAILING, splitResponse.getHits().getHits()[0].field("csv").getValue()); + assertEquals(List.of("one", "two", "three"), splitResponse.getHits().getHits()[0].field("split").getValues()); + Map map = splitResponse.getHits().getHits()[0].getSourceAsMap(); + assertNotNull(map); + assertEquals(List.of("one", "two", "three"), map.get("split")); + + assertEquals(TRAILING, splitResponse.getHits().getHits()[1].field("csv").getValue()); + assertEquals(List.of("alpha", "beta", "gamma"), splitResponse.getHits().getHits()[1].field("split").getValues()); + assertNull(splitResponse.getHits().getHits()[1].getSourceAsMap()); + } + + public void testSplitResponseRegex() throws Exception { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "dsv", "\\d", false, "split"); + SearchResponse response = createTestResponseRegex(); + SearchResponse splitResponse = splitResponseProcessor.processResponse(request, response); + + assertEquals(response.getHits(), splitResponse.getHits()); + + assertEquals(REGEX_DELIM, splitResponse.getHits().getHits()[0].field("dsv").getValue()); + assertEquals(List.of("one", "two", "three"), splitResponse.getHits().getHits()[0].field("split").getValues()); + Map map = splitResponse.getHits().getHits()[0].getSourceAsMap(); + assertNotNull(map); + assertEquals(List.of("one", "two", "three"), map.get("split")); + } + + public void testSplitResponseSameField() throws Exception { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "csv", ",", true, null); + SearchResponse response = createTestResponse(); + SearchResponse splitResponse = splitResponseProcessor.processResponse(request, response); + + assertEquals(response.getHits(), splitResponse.getHits()); + assertEquals(List.of("one", "two", "three"), splitResponse.getHits().getHits()[0].field("csv").getValues()); + assertEquals(List.of("alpha", "beta", "gamma", ""), splitResponse.getHits().getHits()[1].field("csv").getValues()); + } + + public void testSplitResponseEmptyList() { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "empty", ",", false, null); + assertThrows(IllegalArgumentException.class, () -> splitResponseProcessor.processResponse(request, createTestResponseEmptyList())); + } + + public void testNullField() { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "csv", ",", false, null); + + assertThrows(IllegalArgumentException.class, () -> splitResponseProcessor.processResponse(request, createTestResponseNullField())); + } + + public void testNotStringField() { + SearchRequest request = createDummyRequest(); + + SplitResponseProcessor splitResponseProcessor = new SplitResponseProcessor(null, null, false, "maps", ",", false, null); + + assertThrows(IllegalArgumentException.class, () -> splitResponseProcessor.processResponse(request, createTestResponseNotString())); + } + + public void testFactory() { + String splitField = RandomDocumentPicks.randomFieldName(random()); + String targetField = RandomDocumentPicks.randomFieldName(random()); + Map config = new HashMap<>(); + config.put("field", splitField); + config.put("separator", ","); + config.put("preserve_trailing", true); + config.put("target_field", targetField); + + SplitResponseProcessor.Factory factory = new SplitResponseProcessor.Factory(); + SplitResponseProcessor processor = factory.create(Collections.emptyMap(), null, null, false, config, null); + assertEquals("split", processor.getType()); + assertEquals(splitField, processor.getSplitField()); + assertEquals(",", processor.getSeparator()); + assertTrue(processor.isPreserveTrailing()); + assertEquals(targetField, processor.getTargetField()); + + expectThrows( + OpenSearchParseException.class, + () -> factory.create(Collections.emptyMap(), null, null, false, Collections.emptyMap(), null) + ); + } +}