From 5ce6849eaf8e37b24257da4b02c102259720b7ab Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 6 Jan 2026 17:33:43 -0600 Subject: [PATCH 01/32] MvCombine Command Feature Signed-off-by: Srikanth Padakanti --- .../org/opensearch/sql/analysis/Analyzer.java | 6 + .../sql/ast/AbstractNodeVisitor.java | 5 + .../org/opensearch/sql/ast/dsl/AstDSL.java | 13 + .../opensearch/sql/ast/tree/MvCombine.java | 47 +++ .../sql/calcite/CalciteRelNodeVisitor.java | 112 +++++- .../remote/CalciteMvCombineCommandIT.java | 350 ++++++++++++++++++ .../sql/legacy/SQLIntegTestCase.java | 5 + integ-test/src/test/resources/mvcombine.json | 42 +++ .../resources/mvcombine_index_mapping.json | 13 + ppl/src/main/antlr/OpenSearchPPLLexer.g4 | 3 + ppl/src/main/antlr/OpenSearchPPLParser.g4 | 9 + .../opensearch/sql/ppl/parser/AstBuilder.java | 35 ++ .../ppl/calcite/CalcitePPLMvCombineTest.java | 161 ++++++++ 13 files changed, 794 insertions(+), 7 deletions(-) create mode 100644 core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java create mode 100644 integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java create mode 100644 integ-test/src/test/resources/mvcombine.json create mode 100644 integ-test/src/test/resources/mvcombine_index_mapping.json create mode 100644 ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java diff --git a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java index 24cef144c97..76e3e931076 100644 --- a/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java +++ b/core/src/main/java/org/opensearch/sql/analysis/Analyzer.java @@ -80,6 +80,7 @@ import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; @@ -533,6 +534,11 @@ public LogicalPlan visitAddColTotals(AddColTotals node, AnalysisContext context) throw getOnlyForCalciteException("addcoltotals"); } + @Override + public LogicalPlan visitMvCombine(MvCombine node, AnalysisContext context) { + throw getOnlyForCalciteException("mvcombine"); + } + /** Build {@link ParseExpression} to context and skip to child nodes. 
*/ @Override public LogicalPlan visitParse(Parse node, AnalysisContext context) { diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index a6ef5e7547a..3ba78e73e89 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -68,6 +68,7 @@ import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; @@ -461,4 +462,8 @@ public T visitAddTotals(AddTotals node, C context) { public T visitAddColTotals(AddColTotals node, C context) { return visitChildren(node, context); } + + public T visitMvCombine(MvCombine node, C context) { + return visitChildren(node, context); + } } diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index bf54d2ffd89..356ab769aa3 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -62,6 +62,7 @@ import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Limit; import org.opensearch.sql.ast.tree.MinSpanBin; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; @@ -468,6 +469,18 @@ public static List defaultDedupArgs() { argument("consecutive", booleanLiteral(false))); } + public static MvCombine mvcombine(Field field) { + return new MvCombine(field, null, false); + } + + public static MvCombine mvcombine(Field field, String delim) { + return new MvCombine(field, delim, false); + } + + public static MvCombine mvcombine(Field field, String delim, boolean nomv) { + return new MvCombine(field, delim, nomv); + } + public static List sortOptions() { return exprList(argument("desc", booleanLiteral(false))); } diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java b/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java new file mode 100644 index 00000000000..219088ebae1 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java @@ -0,0 +1,47 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import javax.annotation.Nullable; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.expression.Field; + +@Getter +@ToString(callSuper = true) +@EqualsAndHashCode(callSuper = false) +public class MvCombine extends UnresolvedPlan { + + private final Field field; + private final String delim; + @Nullable private UnresolvedPlan child; + private final boolean nomv; + + public MvCombine(Field field, @Nullable String delim, boolean nomv) { + this.field = field; + this.delim = (delim == null) ? " " : delim; + this.nomv = nomv; + } + + public MvCombine attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return child == null ? 
ImmutableList.of() : ImmutableList.of(child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitMvCombine(this, context); + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 19dce3e3609..3d18f2e0b7b 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -53,13 +53,7 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFamily; import org.apache.calcite.rel.type.RelDataTypeField; -import org.apache.calcite.rex.RexCall; -import org.apache.calcite.rex.RexCorrelVariable; -import org.apache.calcite.rex.RexInputRef; -import org.apache.calcite.rex.RexLiteral; -import org.apache.calcite.rex.RexNode; -import org.apache.calcite.rex.RexVisitorImpl; -import org.apache.calcite.rex.RexWindowBounds; +import org.apache.calcite.rex.*; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.ArraySqlType; @@ -122,6 +116,7 @@ import org.opensearch.sql.ast.tree.Lookup.OutputStrategy; import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Paginate; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; @@ -3116,6 +3111,109 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } + @Override + public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { + // 1) Lower child first + visitChildren(node, context); + + final RelBuilder relBuilder = context.relBuilder; + final RexBuilder rexBuilder = context.rexBuilder; + + final RelNode input = relBuilder.peek(); + final List inputFieldNames = input.getRowType().getFieldNames(); + + // 2) Splunk parity: accept delim (default is single space). + // NOTE: delim only affects single-value rendering when nomv=true. + final String delim = (node.getDelim() != null) ? 
node.getDelim() : " "; + + // 3) Resolve target field to an input ref (must be a direct field reference) + final Field targetField = node.getField(); + final RexNode targetRex = rexVisitor.analyze(targetField, context); + if (!(targetRex instanceof RexInputRef)) { + throw new SemanticCheckException( + "mvcombine target must be a direct field reference, but got: " + targetField); + } + final int targetIndex = ((RexInputRef) targetRex).getIndex(); + final String targetName = inputFieldNames.get(targetIndex); + + // 4) Group key = all fields except target + final List groupExprs = new ArrayList<>(); + for (int i = 0; i < inputFieldNames.size(); i++) { + if (i == targetIndex) continue; + groupExprs.add(relBuilder.field(i)); + } + + // 5) Aggregate target values: COLLECT => MULTISET + final RelBuilder.AggCall aggCall = + relBuilder + .aggregateCall(SqlStdOperatorTable.COLLECT, relBuilder.field(targetIndex)) + .as(targetName); + + relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); + + // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder + // After aggregate => [group fields..., targetAgg(multiset)] + final int aggPos = groupExprs.size(); + + final RelDataType elemType = input.getRowType().getFieldList().get(targetIndex).getType(); + final RelDataType arrayType = relBuilder.getTypeFactory().createArrayType(elemType, -1); + + final List projections = new ArrayList<>(inputFieldNames.size()); + final List projNames = new ArrayList<>(inputFieldNames.size()); + + int groupPos = 0; + for (int i = 0; i < inputFieldNames.size(); i++) { + projNames.add(inputFieldNames.get(i)); + if (i == targetIndex) { + final RexNode multisetRef = relBuilder.field(aggPos); // MULTISET + projections.add(rexBuilder.makeCast(arrayType, multisetRef)); // ARRAY + } else { + projections.add(relBuilder.field(groupPos)); + groupPos++; + } + } + relBuilder.project(projections, projNames, /* force= */ true); + + // 7) Splunk parity: nomv=true converts multivalue output to a single delimited string. + // arrayToString in this engine supports only String/ByteString, so cast elements to STRING + // first. + if (node.isNomv()) { + final List nomvProjections = new ArrayList<>(inputFieldNames.size()); + final List nomvNames = new ArrayList<>(inputFieldNames.size()); + + // Build ARRAY type once + final RelDataType stringType = + relBuilder + .getTypeFactory() + .createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); + final RelDataType stringArrayType = + relBuilder.getTypeFactory().createArrayType(stringType, -1); + + for (int i = 0; i < inputFieldNames.size(); i++) { + final String name = inputFieldNames.get(i); + nomvNames.add(name); + + if (i == targetIndex) { + // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so + // ARRAY_TO_STRING works. 
+ final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); + + nomvProjections.add( + relBuilder.call( + org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, + stringArray, + relBuilder.literal(delim))); + } else { + nomvProjections.add(relBuilder.field(i)); + } + } + + relBuilder.project(nomvProjections, nomvNames, /* force= */ true); + } + + return relBuilder.peek(); + } + @Override public RelNode visitValues(Values values, CalcitePlanContext context) { if (values.getValues() == null || values.getValues().isEmpty()) { diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java new file mode 100644 index 00000000000..5e60e1985be --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -0,0 +1,350 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.util.MatcherUtils.verifyNumOfRows; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; +import org.json.JSONArray; +import org.json.JSONObject; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.opensearch.client.ResponseException; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +/** + * mvcombine integration tests using existing SQLIntegTestCase Index.MVCOMBINE mapping + dataset. + * + *
IMPORTANT: - Do NOT filter by a field named "case" (parser keyword risk + dataset mismatch + * risk). - Prefer stable keys: ip/bytes/tags. - Use single quotes in query strings; executeQuery + * wraps the query in JSON. + * + *
NOTE ON nomv/delim: - Some builds may not support nomv and/or delim yet, or may validate their + * ordering strictly. - These tests are written to: - Validate behavior when the feature is + * supported - Tolerate a 400 syntax rejection when not supported + */ +public class CalciteMvCombineCommandIT extends PPLIntegTestCase { + + private static final String INDEX = Index.MVCOMBINE.getName(); + + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + loadIndex(Index.MVCOMBINE); + } + + // --------------------------- + // Sanity (precondition) + // --------------------------- + + @Test + public void testSanity_datasetIsLoaded() throws IOException { + JSONObject result = executeQuery("source=" + INDEX + " | head 5"); + int rows = result.getJSONArray("datarows").length(); + Assertions.assertTrue(rows > 0, "Expected MVCOMBINE dataset to have rows, got 0"); + } + + // --------------------------- + // Happy paths (core mvcombine) + // --------------------------- + + @Test + public void testMvCombine_basicGroupCollapsesToOneRow() throws IOException { + // Dataset: ip=10.0.0.1 bytes=100 tags=t1 packets_str in [10,20,30] + String q = + "source=" + + INDEX + + " | where ip='10.0.0.1' and bytes=100 and tags='t1'" + + " | fields ip, bytes, tags, packets_str" + + " | mvcombine packets_str"; + + JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + + JSONArray row = result.getJSONArray("datarows").getJSONArray(0); + Assertions.assertEquals("10.0.0.1", row.getString(0)); + Assertions.assertEquals("100", String.valueOf(row.get(1))); + Assertions.assertEquals("t1", row.getString(2)); + + List mv = toStringListDropNulls(row.get(3)); + Assertions.assertTrue(mv.contains("10"), "Expected packets_str to include 10, got " + mv); + Assertions.assertTrue(mv.contains("20"), "Expected packets_str to include 20, got " + mv); + Assertions.assertTrue(mv.contains("30"), "Expected packets_str to include 30, got " + mv); + } + + @Test + public void testMvCombine_singleRowGroupStaysSingleRow() throws IOException { + // Dataset: ip=10.0.0.2 bytes=200 tags=t2 packets_str=7 + String q = + "source=" + + INDEX + + " | where ip='10.0.0.2' and bytes=200 and tags='t2'" + + " | fields ip, bytes, tags, packets_str" + + " | mvcombine packets_str"; + + JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + + JSONArray row = result.getJSONArray("datarows").getJSONArray(0); + List mv = toStringListDropNulls(row.get(3)); + + Assertions.assertEquals(1, mv.size(), "Expected single-value MV, got " + mv); + Assertions.assertEquals("7", mv.get(0)); + } + + @Test + public void testMvCombine_missingTargetWithinGroup_collapses_nonNullPreserved() + throws IOException { + // Dataset: ip=10.0.0.3 bytes=300 tags=t3 has: + // - one doc with packets_str=5 + // - one doc missing packets_str + // - one doc with letters only + String q = + "source=" + + INDEX + + " | where ip='10.0.0.3' and bytes=300 and tags='t3'" + + " | fields ip, bytes, tags, packets_str" + + " | mvcombine packets_str"; + + JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + + JSONArray row = result.getJSONArray("datarows").getJSONArray(0); + List mv = toStringListKeepNulls(row.get(3)); + + // Requirements we can enforce safely: + // - collapse happens into 1 row + // - non-null value is preserved + Assertions.assertTrue(mv.contains("5"), "Expected packets_str to include 5, got " + mv); + } + + // --------------------------- + // Multi-group behavior + // --------------------------- + + @Test + public void 
testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOException { + // Dataset has: + // ip=10.0.0.7 bytes=700 tags=t7 packets_str=[1,2] + // ip=10.0.0.8 bytes=700 tags=t7 packets_str=[9] + String base = + "source=" + + INDEX + + " | where (ip='10.0.0.7' or ip='10.0.0.8') and bytes=700 and tags='t7'" + + " | fields ip, bytes, tags, packets_str"; + + // precondition (should exist; if someone edits the dataset, fail with a crisp message) + JSONObject before = executeQuery(base); + int beforeRows = before.getJSONArray("datarows").length(); + Assertions.assertTrue(beforeRows >= 1, "Expected dataset rows for multi-group test, got 0"); + + JSONObject result = executeQuery(base + " | mvcombine packets_str | sort ip"); + int outRows = result.getJSONArray("datarows").length(); + Assertions.assertEquals( + 2, outRows, "Expected 2 groups (10.0.0.7 and 10.0.0.8), got " + outRows); + + // Spot-check values without assuming ordering within MV arrays + JSONArray r0 = result.getJSONArray("datarows").getJSONArray(0); + JSONArray r1 = result.getJSONArray("datarows").getJSONArray(1); + + String ip0 = r0.getString(0); + String ip1 = r1.getString(0); + + if ("10.0.0.7".equals(ip0)) { + List mv0 = toStringListDropNulls(r0.get(3)); + Assertions.assertTrue( + mv0.contains("1") && mv0.contains("2"), + "Expected 10.0.0.7 to include 1 and 2, got " + mv0); + + List mv1 = toStringListDropNulls(r1.get(3)); + Assertions.assertTrue(mv1.contains("9"), "Expected 10.0.0.8 to include 9, got " + mv1); + } else { + List mv0 = toStringListDropNulls(r0.get(3)); + Assertions.assertTrue(mv0.contains("9"), "Expected 10.0.0.8 to include 9, got " + mv0); + + List mv1 = toStringListDropNulls(r1.get(3)); + Assertions.assertTrue( + mv1.contains("1") && mv1.contains("2"), + "Expected 10.0.0.7 to include 1 and 2, got " + mv1); + } + } + + // --------------------------- + // delim/nomv: happy paths + edge tolerance + // --------------------------- + + @Test + public void testMvCombine_nomv_defaultDelim_ifSupported_elseSyntaxRejected() throws Exception { + // Uses ip=10.0.0.9 bytes=900 tags=t9 packets_str=[1,2,3] + String base = + "source=" + + INDEX + + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + + " | fields ip, bytes, tags, packets_str"; + + // If supported: should return a scalar string containing 1,2,3 in some delimiter format. + // If unsupported: expected 400 syntax rejection. 
+ String q = base + " | mvcombine packets_str nomv"; + + try { + JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + + Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); + Assertions.assertTrue( + cell instanceof String, "Expected nomv output scalar string, got: " + cell); + + String s = (String) cell; + Assertions.assertTrue( + Pattern.compile("1.*2.*3").matcher(s).find(), + "Expected nomv string to contain values 1,2,3 in order, got: " + s); + } catch (ResponseException e) { + Assertions.assertTrue( + isSyntaxBadRequest(e), + "Expected syntax rejection if nomv unsupported, got: " + e.getMessage()); + } + } + + @Test + public void testMvCombine_nomvWithCustomDelim_ifSupported_elseSyntaxRejected() throws Exception { + // IMPORTANT: single quotes in query + String base = + "source=" + + INDEX + + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + + " | fields ip, bytes, tags, packets_str"; + + // Test both parameter orders + String q1 = base + " | mvcombine packets_str nomv delim='|'"; + String q2 = base + " | mvcombine packets_str delim='|' nomv"; + + JSONObject result; + try { + result = executeQuery(q1); + } catch (ResponseException e1) { + if (!isSyntaxBadRequest(e1)) throw e1; + + try { + result = executeQuery(q2); + } catch (ResponseException e2) { + Assertions.assertTrue( + isSyntaxBadRequest(e2), + "Expected syntax rejection for unsupported nomv/delim, got: " + e2.getMessage()); + return; // unsupported -> acceptable + } + } + + // Supported -> validate scalar string output and delimiter presence + verifyNumOfRows(result, 1); + Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); + Assertions.assertTrue( + cell instanceof String, "Expected nomv output scalar string, got: " + cell); + + String s = (String) cell; + Assertions.assertTrue(s.contains("|"), "Expected delimiter '|' in: " + s); + Assertions.assertTrue( + Pattern.compile("1\\|.*2\\|.*3|1.*\\|.*2.*\\|.*3|1.*2.*3").matcher(s).find(), + "Expected values to be present in the joined output, got: " + s); + } + + @Test + public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_elseSyntaxRejected() + throws Exception { + // If delim is only meaningful with nomv, then: + // - supported behavior should still return multivalue (JSONArray) for the field + // - or it may reject delim when nomv absent (400) + String base = + "source=" + + INDEX + + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + + " | fields ip, bytes, tags, packets_str"; + + String q = base + " | mvcombine packets_str delim='|'"; + + try { + JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + + Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); + Assertions.assertTrue( + cell instanceof JSONArray, + "Expected multivalue array (delim without nomv should not coerce to string), got: " + + cell); + } catch (ResponseException e) { + Assertions.assertTrue( + isSyntaxBadRequest(e), + "Expected syntax rejection if delim-only unsupported, got: " + e.getMessage()); + } + } + + // --------------------------- + // Edge / error semantics + // --------------------------- + + @Test + public void testMvCombine_missingField_shouldReturn4xx() throws IOException { + try { + executeQuery("source=" + INDEX + " | mvcombine does_not_exist"); + Assertions.fail("Expected ResponseException was not thrown"); + } catch (ResponseException e) { + int status = e.getResponse().getStatusLine().getStatusCode(); + Assertions.assertTrue( + status >= 400 && status < 500, + "Expected 4xx 
for missing field, got " + status + " msg=" + e.getMessage()); + } + } + + // --------------------------- + // Helpers + // --------------------------- + + private static boolean isSyntaxBadRequest(ResponseException e) { + int status = e.getResponse().getStatusLine().getStatusCode(); + if (status != 400) return false; + + String msg = e.getMessage(); + if (msg == null) return false; + + // Keep it broad: different layers throw different messages + return msg.contains("SyntaxCheckException") + || msg.contains("Invalid Query") + || msg.contains("parsing_exception") + || msg.contains("ParseException"); + } + + /** JSONArray -> list (nulls preserved), scalar -> singleton list, null -> empty list. */ + private static List toStringListKeepNulls(Object cell) { + if (cell == null || cell == JSONObject.NULL) { + return Collections.emptyList(); + } + if (cell instanceof JSONArray arr) { + List out = new ArrayList<>(); + for (int i = 0; i < arr.length(); i++) { + Object v = arr.get(i); + out.add(v == JSONObject.NULL ? null : String.valueOf(v)); + } + return out; + } + return List.of(String.valueOf(cell)); + } + + /** Same as above but drops null entries. */ + private static List toStringListDropNulls(Object cell) { + List all = toStringListKeepNulls(cell); + if (all.isEmpty()) return all; + + List out = new ArrayList<>(); + for (String v : all) { + if (v != null) out.add(v); + } + return out; + } +} diff --git a/integ-test/src/test/java/org/opensearch/sql/legacy/SQLIntegTestCase.java b/integ-test/src/test/java/org/opensearch/sql/legacy/SQLIntegTestCase.java index 50ee11b765a..c306ad29cf9 100644 --- a/integ-test/src/test/java/org/opensearch/sql/legacy/SQLIntegTestCase.java +++ b/integ-test/src/test/java/org/opensearch/sql/legacy/SQLIntegTestCase.java @@ -921,6 +921,11 @@ public enum Index { "time_data", getMappingFile("time_test_data_index_mapping.json"), "src/test/resources/time_test_data.json"), + MVCOMBINE( + "test_index_mvcombine", + "_doc", + getMappingFile("mvcombine_index_mapping.json"), + "src/test/resources/mvcombine.json"), TIME_TEST_DATA_WITH_NULL( TestsConstants.TEST_INDEX_TIME_DATE_NULL, "time_data_with_null", diff --git a/integ-test/src/test/resources/mvcombine.json b/integ-test/src/test/resources/mvcombine.json new file mode 100644 index 00000000000..315aafb532c --- /dev/null +++ b/integ-test/src/test/resources/mvcombine.json @@ -0,0 +1,42 @@ +{ "index": { "_index": "test_index_mvcombine", "_id": "1" } } +{ "ip": "10.0.0.1", "bytes": 100, "tags": "t1", "packets_str": "10" } +{ "index": { "_index": "test_index_mvcombine", "_id": "2" } } +{ "ip": "10.0.0.1", "bytes": 100, "tags": "t1", "packets_str": "20" } +{ "index": { "_index": "test_index_mvcombine", "_id": "3" } } +{ "ip": "10.0.0.1", "bytes": 100, "tags": "t1", "packets_str": "30" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "4" } } +{ "ip": "10.0.0.2", "bytes": 200, "tags": "t2", "packets_str": "7" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "5" } } +{ "ip": "10.0.0.3", "bytes": 300, "tags": "t3", "packets_str": "5" } +{ "index": { "_index": "test_index_mvcombine", "_id": "6" } } +{ "ip": "10.0.0.3", "bytes": 300, "tags": "t3" } +{ "index": { "_index": "test_index_mvcombine", "_id": "7" } } +{ "ip": "10.0.0.3", "bytes": 300, "tags": "t3", "letters": "a" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "16" } } +{ "ip": "10.0.0.7", "bytes": 700, "tags": "t7", "packets_str": "1" } +{ "index": { "_index": "test_index_mvcombine", "_id": "17" } } +{ "ip": "10.0.0.7", "bytes": 700, "tags": "t7", 
"packets_str": "2" } +{ "index": { "_index": "test_index_mvcombine", "_id": "18" } } +{ "ip": "10.0.0.8", "bytes": 700, "tags": "t7", "packets_str": "9" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "19" } } +{ "ip": "10.0.0.9", "bytes": 900, "tags": "t9", "packets_str": "1" } +{ "index": { "_index": "test_index_mvcombine", "_id": "20" } } +{ "ip": "10.0.0.9", "bytes": 900, "tags": "t9", "packets_str": "2" } +{ "index": { "_index": "test_index_mvcombine", "_id": "21" } } +{ "ip": "10.0.0.9", "bytes": 900, "tags": "t9", "packets_str": "3" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "11" } } +{ "ip": "10.0.0.5", "bytes": 500, "tags": "t5", "packets_str": "dup" } +{ "index": { "_index": "test_index_mvcombine", "_id": "12" } } +{ "ip": "10.0.0.5", "bytes": 500, "tags": "t5", "packets_str": "dup" } +{ "index": { "_index": "test_index_mvcombine", "_id": "13" } } +{ "ip": "10.0.0.5", "bytes": 500, "tags": "t5", "packets_str": "x" } + +{ "index": { "_index": "test_index_mvcombine", "_id": "14" } } +{ "ip": "10.0.0.6", "bytes": 600, "tags": "t6", "packets_str": "" } +{ "index": { "_index": "test_index_mvcombine", "_id": "15" } } +{ "ip": "10.0.0.6", "bytes": 600, "tags": "t6", "packets_str": "z" } diff --git a/integ-test/src/test/resources/mvcombine_index_mapping.json b/integ-test/src/test/resources/mvcombine_index_mapping.json new file mode 100644 index 00000000000..0c008faf2f1 --- /dev/null +++ b/integ-test/src/test/resources/mvcombine_index_mapping.json @@ -0,0 +1,13 @@ +{ + "mappings": { + "properties": { + "case": { "type": "keyword" }, + "ip": { "type": "ip" }, + "bytes": { "type": "long" }, + "tags": { "type": "keyword" }, + + "packets_str": { "type": "keyword" }, + "letters": { "type": "keyword" } + } + } +} diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 71162e81bd8..7cc6e64d070 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -70,6 +70,9 @@ LABEL: 'LABEL'; SHOW_NUMBERED_TOKEN: 'SHOW_NUMBERED_TOKEN'; AGGREGATION: 'AGGREGATION'; APPENDPIPE: 'APPENDPIPE'; +MVCOMBINE: 'MVCOMBINE'; +NOMV: 'NOMV'; + //Native JOIN KEYWORDS JOIN: 'JOIN'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 7045796a03c..1bf523a146e 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -88,6 +88,7 @@ commands | rexCommand | appendPipeCommand | replaceCommand + | mvcombineCommand ; commandName @@ -531,6 +532,13 @@ expandCommand : EXPAND fieldExpression (AS alias = qualifiedName)? ; +mvcombineCommand + : MVCOMBINE fieldExpression + (DELIM EQUAL stringLiteral)? + (NOMV EQUAL booleanLiteral)? + ; + + flattenCommand : FLATTEN fieldExpression (AS aliases = identifierSeq)? 
; @@ -1574,6 +1582,7 @@ searchableKeyWord | PARTITIONS | ALLNUM | DELIM + | NOMV | CURRENT | WINDOW | GLOBAL diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 3f4f3049365..3c3023afcf0 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -91,6 +91,7 @@ import org.opensearch.sql.ast.tree.ML; import org.opensearch.sql.ast.tree.MinSpanBin; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; @@ -868,6 +869,26 @@ public UnresolvedPlan visitExpandCommand(OpenSearchPPLParser.ExpandCommandContex return new Expand(fieldExpression, alias); } + /** mvcombine command. */ + /** mvcombine command. */ + @Override + public UnresolvedPlan visitMvcombineCommand(OpenSearchPPLParser.MvcombineCommandContext ctx) { + Field field = (Field) internalVisitExpression(ctx.fieldExpression()); + + String delim = null; + if (ctx.DELIM() != null) { + delim = unquoteStringLiteral(ctx.stringLiteral().getText()); + } + + boolean nomv = false; + if (ctx.NOMV() != null) { + // booleanLiteral rule is TRUE | FALSE + nomv = ctx.booleanLiteral().TRUE() != null; + } + + return new MvCombine(field, delim, nomv); + } + @Override public UnresolvedPlan visitGrokCommand(OpenSearchPPLParser.GrokCommandContext ctx) { UnresolvedExpression sourceField = internalVisitExpression(ctx.source_field); @@ -1289,6 +1310,20 @@ private String getTextInQuery(ParserRuleContext ctx) { return query.substring(start.getStartIndex(), stop.getStopIndex() + 1); } + private static String unquoteStringLiteral(String text) { + if (text == null || text.length() < 2) { + return text; + } + char first = text.charAt(0); + char last = text.charAt(text.length() - 1); + if ((first == '"' && last == '"') + || (first == '\'' && last == '\'') + || (first == '`' && last == '`')) { + return text.substring(1, text.length() - 1); + } + return text; + } + /** * Try to wrap the plan with a project node of this AllFields expression. Only wrap it if the plan * is not a project node or if the project is type of excluded. 
diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java new file mode 100644 index 00000000000..6eb167e9ee2 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -0,0 +1,161 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import lombok.RequiredArgsConstructor; +import org.apache.calcite.DataContext; +import org.apache.calcite.config.CalciteConnectionConfig; +import org.apache.calcite.linq4j.Enumerable; +import org.apache.calcite.linq4j.Linq4j; +import org.apache.calcite.plan.RelTraitDef; +import org.apache.calcite.rel.RelCollations; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeFactory; +import org.apache.calcite.rel.type.RelProtoDataType; +import org.apache.calcite.schema.ScannableTable; +import org.apache.calcite.schema.Schema; +import org.apache.calcite.schema.SchemaPlus; +import org.apache.calcite.schema.Statistic; +import org.apache.calcite.schema.Statistics; +import org.apache.calcite.sql.SqlCall; +import org.apache.calcite.sql.SqlNode; +import org.apache.calcite.sql.parser.SqlParser; +import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.test.CalciteAssert; +import org.apache.calcite.tools.Frameworks; +import org.apache.calcite.tools.Programs; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.junit.Test; + +public class CalcitePPLMvCombineTest extends CalcitePPLAbstractTest { + + public CalcitePPLMvCombineTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Override + protected Frameworks.ConfigBuilder config(CalciteAssert.SchemaSpec... schemaSpecs) { + final SchemaPlus rootSchema = Frameworks.createRootSchema(true); + final SchemaPlus schema = CalciteAssert.addSchema(rootSchema, schemaSpecs); + + ImmutableList rows = + ImmutableList.of( + new Object[] {"basic", "A", 10}, + new Object[] {"basic", "A", 20}, + new Object[] {"basic", "B", 60}, + new Object[] {"basic", "A", 30}); + + schema.add("MVCOMBINE_DATA", new MvCombineDataTable(rows)); + + return Frameworks.newConfigBuilder() + .parserConfig(SqlParser.Config.DEFAULT) + .defaultSchema(schema) + .traitDefs((List) null) + .programs(Programs.heuristicJoinOrder(Programs.RULE_SET, true, 2)); + } + + @Test + public void testMvCombineBasic() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"basic\" " + + "| fields case, ip, packets " + + "| mvcombine packets " + + "| sort ip"; + + RelNode root = getRelNode(ppl); + + // NOTE: Your implementation casts COLLECT result (MULTISET) to ARRAY. 
+ String expectedLogical = + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" + + " LogicalFilter(condition=[=($0, 'basic')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testMvCombineNomvWithCustomDelim() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"basic\" " + + "| fields case, ip, packets " + + "| mvcombine packets delim=\"|\" nomv=true " + + "| sort ip"; + + RelNode root = getRelNode(ppl); + + // NOTE: Your implementation does: + // COLLECT -> CAST to INTEGER ARRAY -> CAST to VARCHAR ARRAY -> ARRAY_TO_STRING(..., '|') + String expectedLogical = + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[ARRAY_TO_STRING(CAST(CAST($2):INTEGER" + + " ARRAY NOT NULL):VARCHAR NOT NULL ARRAY NOT NULL, '|')])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" + + " LogicalFilter(condition=[=($0, 'basic')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + verifyLogical(root, expectedLogical); + } + + // ======================================================================== + // Custom ScannableTable for deterministic mvcombine planning tests + // ======================================================================== + + @RequiredArgsConstructor + static class MvCombineDataTable implements ScannableTable { + private final ImmutableList rows; + + protected final RelProtoDataType protoRowType = + factory -> + factory + .builder() + .add("case", SqlTypeName.VARCHAR) + .nullable(true) + .add("ip", SqlTypeName.VARCHAR) + .nullable(true) + .add("packets", SqlTypeName.INTEGER) + .nullable(true) + .build(); + + @Override + public Enumerable<@Nullable Object[]> scan(DataContext root) { + return Linq4j.asEnumerable(rows); + } + + @Override + public RelDataType getRowType(RelDataTypeFactory typeFactory) { + return protoRowType.apply(typeFactory); + } + + @Override + public Statistic getStatistic() { + return Statistics.of(0d, ImmutableList.of(), RelCollations.createSingleton(0)); + } + + @Override + public Schema.TableType getJdbcTableType() { + return Schema.TableType.TABLE; + } + + @Override + public boolean isRolledUp(String column) { + return false; + } + + @Override + public boolean rolledUpColumnValidInsideAgg( + String column, + SqlCall call, + @Nullable SqlNode parent, + @Nullable CalciteConnectionConfig config) { + return false; + } + } +} From 33c9220ba9b4d21bba8e68d33de1d109c6bdf945 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 6 Jan 2026 17:36:03 -0600 Subject: [PATCH 02/32] MvCombine Command Feature Signed-off-by: Srikanth Padakanti --- .../calcite/remote/CalciteMvCombineCommandIT.java | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 5e60e1985be..fd2cac3f7dc 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -19,17 +19,7 @@ import org.opensearch.client.ResponseException; import org.opensearch.sql.ppl.PPLIntegTestCase; -/** - * mvcombine integration tests using existing 
SQLIntegTestCase Index.MVCOMBINE mapping + dataset. - * - *
IMPORTANT: - Do NOT filter by a field named "case" (parser keyword risk + dataset mismatch - * risk). - Prefer stable keys: ip/bytes/tags. - Use single quotes in query strings; executeQuery - * wraps the query in JSON. - * - *
NOTE ON nomv/delim: - Some builds may not support nomv and/or delim yet, or may validate their - * ordering strictly. - These tests are written to: - Validate behavior when the feature is - * supported - Tolerate a 400 syntax rejection when not supported - */ + public class CalciteMvCombineCommandIT extends PPLIntegTestCase { private static final String INDEX = Index.MVCOMBINE.getName(); From 3ee160b878bd27db634b510b20448b10a86cb696 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Wed, 7 Jan 2026 12:10:09 -0600 Subject: [PATCH 03/32] Add doctests to MvCombine Signed-off-by: Srikanth Padakanti --- docs/category.json | 1 + docs/user/dql/metadata.rst | 3 +- docs/user/ppl/cmd/mvcombine.md | 149 ++++++++++++++++++++++++++++ doctest/test_data/mvcombine.json | 18 ++++ doctest/test_docs.py | 3 +- doctest/test_mapping/mvcombine.json | 12 +++ 6 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 docs/user/ppl/cmd/mvcombine.md create mode 100644 doctest/test_data/mvcombine.json create mode 100644 doctest/test_mapping/mvcombine.json diff --git a/docs/category.json b/docs/category.json index 094768d1e6f..03a9630af13 100644 --- a/docs/category.json +++ b/docs/category.json @@ -36,6 +36,7 @@ "user/ppl/cmd/sort.md", "user/ppl/cmd/spath.md", "user/ppl/cmd/stats.md", + "user/ppl/cmd/mvcombine.md", "user/ppl/cmd/streamstats.md", "user/ppl/cmd/subquery.md", "user/ppl/cmd/syntax.md", diff --git a/docs/user/dql/metadata.rst b/docs/user/dql/metadata.rst index e959a69c8b6..e4f55ef1b3e 100644 --- a/docs/user/dql/metadata.rst +++ b/docs/user/dql/metadata.rst @@ -35,7 +35,7 @@ Example 1: Show All Indices Information SQL query:: os> SHOW TABLES LIKE '%' - fetched rows / total rows = 23/23 + fetched rows / total rows = 24/24 +----------------+-------------+-------------------+------------+---------+----------+------------+-----------+---------------------------+----------------+ | TABLE_CAT | TABLE_SCHEM | TABLE_NAME | TABLE_TYPE | REMARKS | TYPE_CAT | TYPE_SCHEM | TYPE_NAME | SELF_REFERENCING_COL_NAME | REF_GENERATION | |----------------+-------------+-------------------+------------+---------+----------+------------+-----------+---------------------------+----------------| @@ -48,6 +48,7 @@ SQL query:: | docTestCluster | null | events_many_hosts | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | events_null | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | json_test | BASE TABLE | null | null | null | null | null | null | + | docTestCluster | null | mvcombine_data | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | nested | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | nyc_taxi | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | occupation | BASE TABLE | null | null | null | null | null | null | diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md new file mode 100644 index 00000000000..6f6760b6d84 --- /dev/null +++ b/docs/user/ppl/cmd/mvcombine.md @@ -0,0 +1,149 @@ +# mvcombine + +## Description + +The `mvcombine` command groups events that are identical across all fields except the target field, and combines the target field values into a multivalue (array) field. All other fields in the original events are preserved in the output. + +Key aspects of `mvcombine`: +* It generates one row per group, where the group keys are all fields currently in the pipeline except the target field. 
+* The target field becomes a multivalue field containing the combined values from the grouped rows. +* If `nomv` is specified, the target field is returned as a single scalar string instead of a multivalue array. +* If `delim` is specified, it controls the delimiter only when `nomv` is enabled. +* If some rows are missing the target field, those rows contribute no value to the combined output for that group. + +## Syntax + +mvcombine [nomv] [delim=] +* field: mandatory. The field whose values are combined into a multivalue field. +* nomv: optional. If specified, returns the combined values as a single string (not a multivalue array). +* delim: optional. Delimiter used to join values when `nomv` is specified. Defaults to comma (`,`) when not provided. + +## Example 1: Basic mvcombine + +Given a dataset `mvcombine` with the following data: + +```text +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"10"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"20"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"30"} +``` +The following query collapses the three rows into a single row, and combines packets_str into a multivalue field: + +```ppl +source=mvcombine_data +| where ip='10.0.0.1' and bytes=100 and tags='t1' +| fields ip, bytes, tags, packets_str +| mvcombine packets_str +``` + +Expected output: +```text +fetched rows / total rows = 1/1 ++----------+-------+------+-------------+ +| ip | bytes | tags | packets_str | +|----------+-------+------+-------------| +| 10.0.0.1 | 100 | t1 | [10,20,30] | ++----------+-------+------+-------------+ +``` + +Example 2: Multiple groups + +Given a dataset mvcombine with the following data: +```text +{"ip":"10.0.0.7","bytes":700,"tags":"t7","packets_str":"1"} +{"ip":"10.0.0.7","bytes":700,"tags":"t7","packets_str":"2"} +{"ip":"10.0.0.8","bytes":700,"tags":"t7","packets_str":"9"} +``` + +The following query produces one output row per group key: +```ppl +source=mvcombine_data +| where bytes=700 and tags='t7' +| fields ip, bytes, tags, packets_str +| sort ip, packets_str +| mvcombine packets_str +| sort ip +``` + +Expected output: +```text +fetched rows / total rows = 2/2 ++----------+-------+------+-------------+ +| ip | bytes | tags | packets_str | +|----------+-------+------+-------------| +| 10.0.0.7 | 700 | t7 | [1,2] | +| 10.0.0.8 | 700 | t7 | [9] | ++----------+-------+------+-------------+ +``` + +Example 3: mvcombine with nomv + +Given a dataset mvcombine with the following data: +```text +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"10"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"20"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"30"} +``` + +The following query returns packets_str as a single string instead of a multivalue array: +```ppl +source=mvcombine_data +| where ip='10.0.0.1' and bytes=100 and tags='t1' +| fields ip, bytes, tags, packets_str +| sort packets_str +| mvcombine packets_str nomv=true +``` + +Expected output: +```text +fetched rows / total rows = 1/1 ++----------+-------+------+-------------+ +| ip | bytes | tags | packets_str | +|----------+-------+------+-------------| +| 10.0.0.1 | 100 | t1 | 10 20 30 | ++----------+-------+------+-------------+ +``` + +Example 4: Missing target field in some rows + +Rows missing the target field do not contribute a value to the combined output. 
+ +Given a dataset mvcombine with the following data: +```text +{"ip":"10.0.0.3","bytes":300,"tags":"t3","packets_str":"5"} +{"ip":"10.0.0.3","bytes":300,"tags":"t3"} +{"ip":"10.0.0.3","bytes":300,"tags":"t3","letters":"a"} +``` + +The following query collapses the group and preserves the non-missing value: +```ppl +source=mvcombine_data +| where ip='10.0.0.3' and bytes=300 and tags='t3' +| fields ip, bytes, tags, packets_str +| mvcombine packets_str +``` + +Expected output: +```text +fetched rows / total rows = 1/1 ++----------+-------+------+-------------+ +| ip | bytes | tags | packets_str | +|----------+-------+------+-------------| +| 10.0.0.3 | 300 | t3 | [5] | ++----------+-------+------+-------------+ +``` + +Example 6: Error when field does not exist + +If the specified field does not exist in the current schema, mvcombine returns an error. +```ppl +source=mvcombine_data +| mvcombine does_not_exist +``` + +Expected output: +```text +{'reason': 'Invalid Query', 'details': 'Field [does_not_exist] not found.', 'type': 'IllegalArgumentException'} +Error: Query returned no data + +``` \ No newline at end of file diff --git a/doctest/test_data/mvcombine.json b/doctest/test_data/mvcombine.json new file mode 100644 index 00000000000..60c08ebd8a9 --- /dev/null +++ b/doctest/test_data/mvcombine.json @@ -0,0 +1,18 @@ +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"10"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"20"} +{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"30"} +{"ip":"10.0.0.2","bytes":200,"tags":"t2","packets_str":"7"} +{"ip":"10.0.0.3","bytes":300,"tags":"t3","packets_str":"5"} +{"ip":"10.0.0.3","bytes":300,"tags":"t3"} +{"ip":"10.0.0.3","bytes":300,"tags":"t3","letters":"a"} +{"ip":"10.0.0.7","bytes":700,"tags":"t7","packets_str":"1"} +{"ip":"10.0.0.7","bytes":700,"tags":"t7","packets_str":"2"} +{"ip":"10.0.0.8","bytes":700,"tags":"t7","packets_str":"9"} +{"ip":"10.0.0.9","bytes":900,"tags":"t9","packets_str":"1"} +{"ip":"10.0.0.9","bytes":900,"tags":"t9","packets_str":"2"} +{"ip":"10.0.0.9","bytes":900,"tags":"t9","packets_str":"3"} +{"ip":"10.0.0.5","bytes":500,"tags":"t5","packets_str":"dup"} +{"ip":"10.0.0.5","bytes":500,"tags":"t5","packets_str":"dup"} +{"ip":"10.0.0.5","bytes":500,"tags":"t5","packets_str":"x"} +{"ip":"10.0.0.6","bytes":600,"tags":"t6","packets_str":""} +{"ip":"10.0.0.6","bytes":600,"tags":"t6","packets_str":"z"} diff --git a/doctest/test_docs.py b/doctest/test_docs.py index e57c41d6827..6283252065f 100644 --- a/doctest/test_docs.py +++ b/doctest/test_docs.py @@ -57,7 +57,8 @@ 'otellogs': 'otellogs.json', 'time_data': 'time_test_data.json', 'time_data2': 'time_test_data2.json', - 'time_test': 'time_test.json' + 'time_test': 'time_test.json', + 'mvcombine_data': 'mvcombine.json', } DEBUG_MODE = os.environ.get('DOCTEST_DEBUG', 'false').lower() == 'true' diff --git a/doctest/test_mapping/mvcombine.json b/doctest/test_mapping/mvcombine.json new file mode 100644 index 00000000000..06e00747e1f --- /dev/null +++ b/doctest/test_mapping/mvcombine.json @@ -0,0 +1,12 @@ +{ + "mappings": { + "properties": { + "case": { "type": "keyword" }, + "ip": { "type": "ip" }, + "bytes": { "type": "long" }, + "tags": { "type": "keyword" }, + "packets_str": { "type": "keyword" }, + "letters": { "type": "keyword" } + } + } +} From 5bc0d98725cad1a46f6b0203caf85b0442db31a4 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Wed, 7 Jan 2026 12:15:29 -0600 Subject: [PATCH 04/32] spotlesscheck apply Signed-off-by: Srikanth Padakanti --- 
.../opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java | 1 - 1 file changed, 1 deletion(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index fd2cac3f7dc..33a9e462834 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -19,7 +19,6 @@ import org.opensearch.client.ResponseException; import org.opensearch.sql.ppl.PPLIntegTestCase; - public class CalciteMvCombineCommandIT extends PPLIntegTestCase { private static final String INDEX = Index.MVCOMBINE.getName(); From a5913af35c631fba88845149899178cf0d47bd80 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Wed, 7 Jan 2026 12:23:51 -0600 Subject: [PATCH 05/32] spotlesscheck apply Signed-off-by: Srikanth Padakanti --- .../org/opensearch/sql/calcite/CalciteRelNodeVisitor.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 3d18f2e0b7b..9c3284ab057 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3122,7 +3122,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final RelNode input = relBuilder.peek(); final List inputFieldNames = input.getRowType().getFieldNames(); - // 2) Splunk parity: accept delim (default is single space). + // 2) Accept delim (default is single space). // NOTE: delim only affects single-value rendering when nomv=true. final String delim = (node.getDelim() != null) ? node.getDelim() : " "; @@ -3174,7 +3174,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { } relBuilder.project(projections, projNames, /* force= */ true); - // 7) Splunk parity: nomv=true converts multivalue output to a single delimited string. + // 7) nomv=true converts multivalue output to a single delimited string. // arrayToString in this engine supports only String/ByteString, so cast elements to STRING // first. 
if (node.isNomv()) { From 5ac0f3d1db81734570d50639353d2df61b117bda Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Wed, 7 Jan 2026 12:26:52 -0600 Subject: [PATCH 06/32] spotlesscheck apply Signed-off-by: Srikanth Padakanti --- .../remote/CalciteMvCombineCommandIT.java | 28 ++----------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 33a9e462834..f15208fcbd3 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -47,7 +47,6 @@ public void testSanity_datasetIsLoaded() throws IOException { @Test public void testMvCombine_basicGroupCollapsesToOneRow() throws IOException { - // Dataset: ip=10.0.0.1 bytes=100 tags=t1 packets_str in [10,20,30] String q = "source=" + INDEX @@ -71,7 +70,6 @@ public void testMvCombine_basicGroupCollapsesToOneRow() throws IOException { @Test public void testMvCombine_singleRowGroupStaysSingleRow() throws IOException { - // Dataset: ip=10.0.0.2 bytes=200 tags=t2 packets_str=7 String q = "source=" + INDEX @@ -92,10 +90,6 @@ public void testMvCombine_singleRowGroupStaysSingleRow() throws IOException { @Test public void testMvCombine_missingTargetWithinGroup_collapses_nonNullPreserved() throws IOException { - // Dataset: ip=10.0.0.3 bytes=300 tags=t3 has: - // - one doc with packets_str=5 - // - one doc missing packets_str - // - one doc with letters only String q = "source=" + INDEX @@ -109,9 +103,6 @@ public void testMvCombine_missingTargetWithinGroup_collapses_nonNullPreserved() JSONArray row = result.getJSONArray("datarows").getJSONArray(0); List mv = toStringListKeepNulls(row.get(3)); - // Requirements we can enforce safely: - // - collapse happens into 1 row - // - non-null value is preserved Assertions.assertTrue(mv.contains("5"), "Expected packets_str to include 5, got " + mv); } @@ -121,16 +112,12 @@ public void testMvCombine_missingTargetWithinGroup_collapses_nonNullPreserved() @Test public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOException { - // Dataset has: - // ip=10.0.0.7 bytes=700 tags=t7 packets_str=[1,2] - // ip=10.0.0.8 bytes=700 tags=t7 packets_str=[9] String base = "source=" + INDEX + " | where (ip='10.0.0.7' or ip='10.0.0.8') and bytes=700 and tags='t7'" + " | fields ip, bytes, tags, packets_str"; - // precondition (should exist; if someone edits the dataset, fail with a crisp message) JSONObject before = executeQuery(base); int beforeRows = before.getJSONArray("datarows").length(); Assertions.assertTrue(beforeRows >= 1, "Expected dataset rows for multi-group test, got 0"); @@ -140,7 +127,6 @@ public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOEx Assertions.assertEquals( 2, outRows, "Expected 2 groups (10.0.0.7 and 10.0.0.8), got " + outRows); - // Spot-check values without assuming ordering within MV arrays JSONArray r0 = result.getJSONArray("datarows").getJSONArray(0); JSONArray r1 = result.getJSONArray("datarows").getJSONArray(1); @@ -167,20 +153,17 @@ public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOEx } // --------------------------- - // delim/nomv: happy paths + edge tolerance + // delim/nomv: happy paths + edge case // --------------------------- @Test public void 
testMvCombine_nomv_defaultDelim_ifSupported_elseSyntaxRejected() throws Exception { - // Uses ip=10.0.0.9 bytes=900 tags=t9 packets_str=[1,2,3] String base = "source=" + INDEX + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + " | fields ip, bytes, tags, packets_str"; - // If supported: should return a scalar string containing 1,2,3 in some delimiter format. - // If unsupported: expected 400 syntax rejection. String q = base + " | mvcombine packets_str nomv"; try { @@ -204,14 +187,12 @@ public void testMvCombine_nomv_defaultDelim_ifSupported_elseSyntaxRejected() thr @Test public void testMvCombine_nomvWithCustomDelim_ifSupported_elseSyntaxRejected() throws Exception { - // IMPORTANT: single quotes in query String base = "source=" + INDEX + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + " | fields ip, bytes, tags, packets_str"; - // Test both parameter orders String q1 = base + " | mvcombine packets_str nomv delim='|'"; String q2 = base + " | mvcombine packets_str delim='|' nomv"; @@ -231,7 +212,6 @@ public void testMvCombine_nomvWithCustomDelim_ifSupported_elseSyntaxRejected() t } } - // Supported -> validate scalar string output and delimiter presence verifyNumOfRows(result, 1); Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); Assertions.assertTrue( @@ -247,9 +227,6 @@ public void testMvCombine_nomvWithCustomDelim_ifSupported_elseSyntaxRejected() t @Test public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_elseSyntaxRejected() throws Exception { - // If delim is only meaningful with nomv, then: - // - supported behavior should still return multivalue (JSONArray) for the field - // - or it may reject delim when nomv absent (400) String base = "source=" + INDEX @@ -275,7 +252,7 @@ public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_el } // --------------------------- - // Edge / error semantics + // Edge case / error semantics // --------------------------- @Test @@ -302,7 +279,6 @@ private static boolean isSyntaxBadRequest(ResponseException e) { String msg = e.getMessage(); if (msg == null) return false; - // Keep it broad: different layers throw different messages return msg.contains("SyntaxCheckException") || msg.contains("Invalid Query") || msg.contains("parsing_exception") From e4150e45b3107070d8d53c5abc9cb9a50745dd75 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 12:08:39 -0600 Subject: [PATCH 07/32] spotlessapply Signed-off-by: Srikanth Padakanti --- .../opensearch/sql/ppl/parser/AstBuilder.java | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 3c3023afcf0..d5601458e71 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -877,7 +877,7 @@ public UnresolvedPlan visitMvcombineCommand(OpenSearchPPLParser.MvcombineCommand String delim = null; if (ctx.DELIM() != null) { - delim = unquoteStringLiteral(ctx.stringLiteral().getText()); + delim = StringUtils.unquoteText(getTextInQuery(ctx.stringLiteral())); } boolean nomv = false; @@ -1310,20 +1310,6 @@ private String getTextInQuery(ParserRuleContext ctx) { return query.substring(start.getStartIndex(), stop.getStopIndex() + 1); } - private static String unquoteStringLiteral(String text) { - if (text == null || text.length() < 2) { - return text; - } - char first = 
text.charAt(0); - char last = text.charAt(text.length() - 1); - if ((first == '"' && last == '"') - || (first == '\'' && last == '\'') - || (first == '`' && last == '`')) { - return text.substring(1, text.length() - 1); - } - return text; - } - /** * Try to wrap the plan with a project node of this AllFields expression. Only wrap it if the plan * is not a project node or if the project is type of excluded. From dba85da2b9ab274449f5fe9ecb1c96f2c12726ab Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 12:38:53 -0600 Subject: [PATCH 08/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 2 +- docs/user/ppl/cmd/mvcombine.md | 8 +- .../opensearch/sql/ppl/parser/AstBuilder.java | 1 - .../ppl/calcite/CalcitePPLMvCombineTest.java | 91 ++++++++++++++++++- 4 files changed, 96 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 9c3284ab057..f9664bfd669 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3124,7 +3124,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { // 2) Accept delim (default is single space). // NOTE: delim only affects single-value rendering when nomv=true. - final String delim = (node.getDelim() != null) ? node.getDelim() : " "; + final String delim = node.getDelim(); // 3) Resolve target field to an input ref (must be a direct field reference) final Field targetField = node.getField(); diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md index 6f6760b6d84..62ecc907434 100644 --- a/docs/user/ppl/cmd/mvcombine.md +++ b/docs/user/ppl/cmd/mvcombine.md @@ -13,10 +13,12 @@ Key aspects of `mvcombine`: ## Syntax -mvcombine [nomv] [delim=] +mvcombine [nomv=] [delim=] + * field: mandatory. The field whose values are combined into a multivalue field. * nomv: optional. If specified, returns the combined values as a single string (not a multivalue array). -* delim: optional. Delimiter used to join values when `nomv` is specified. Defaults to comma (`,`) when not provided. +* delim: optional. Delimiter used to join values when `nomv` is specified. Defaults to space (` `) when not provided. +* “When nomv is not enabled, the result is a multivalue field; in tabular output it may be displayed in bracket notation with comma-separated elements (e.g., [10,20,30]).” ## Example 1: Basic mvcombine @@ -133,7 +135,7 @@ fetched rows / total rows = 1/1 +----------+-------+------+-------------+ ``` -Example 6: Error when field does not exist +Example 5: Error when field does not exist If the specified field does not exist in the current schema, mvcombine returns an error. ```ppl diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index d5601458e71..7157299c398 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -869,7 +869,6 @@ public UnresolvedPlan visitExpandCommand(OpenSearchPPLParser.ExpandCommandContex return new Expand(fieldExpression, alias); } - /** mvcombine command. */ /** mvcombine command. 
*/ @Override public UnresolvedPlan visitMvcombineCommand(OpenSearchPPLParser.MvcombineCommandContext ctx) { diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java index 6eb167e9ee2..9874f7256cd 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -5,6 +5,8 @@ package org.opensearch.sql.ppl.calcite; +import static org.junit.Assert.assertThrows; + import com.google.common.collect.ImmutableList; import java.util.List; import lombok.RequiredArgsConstructor; @@ -46,10 +48,19 @@ protected Frameworks.ConfigBuilder config(CalciteAssert.SchemaSpec... schemaSpec ImmutableList rows = ImmutableList.of( + // existing "basic" new Object[] {"basic", "A", 10}, new Object[] {"basic", "A", 20}, new Object[] {"basic", "B", 60}, - new Object[] {"basic", "A", 30}); + new Object[] {"basic", "A", 30}, + + // new: NULL target values case + new Object[] {"nulls", "A", null}, + new Object[] {"nulls", "A", 10}, + new Object[] {"nulls", "B", null}, + + // new: single-row case + new Object[] {"single", "Z", 5}); schema.add("MVCOMBINE_DATA", new MvCombineDataTable(rows)); @@ -104,6 +115,84 @@ public void testMvCombineNomvWithCustomDelim() { verifyLogical(root, expectedLogical); } + @Test + public void testMvCombineWithNullTargetValues() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"nulls\" " + + "| fields case, ip, packets " + + "| mvcombine packets " + + "| sort ip"; + + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" + + " LogicalFilter(condition=[=($0, 'nulls')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testMvCombineNonExistentField() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"basic\" " + + "| fields case, ip, packets " + + "| mvcombine does_not_exist"; + + Exception ex = assertThrows(Exception.class, () -> getRelNode(ppl)); + + // Keep this loose: different layers may wrap exceptions. + // We just need to prove the command fails for missing target field. + String msg = String.valueOf(ex.getMessage()); + org.junit.Assert.assertTrue( + "Expected error message to mention missing field. 
Actual: " + msg, + msg.toLowerCase().contains("does_not_exist") || msg.toLowerCase().contains("field")); + } + + @Test + public void testMvCombineSingleRow() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"single\" " + + "| fields case, ip, packets " + + "| mvcombine packets " + + "| sort ip"; + + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" + + " LogicalFilter(condition=[=($0, 'single')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + verifyLogical(root, expectedLogical); + } + + @Test + public void testMvCombineEmptyResult() { + String ppl = + "source=MVCOMBINE_DATA " + + "| where case = \"no_such_case\" " + + "| fields case, ip, packets " + + "| mvcombine packets " + + "| sort ip"; + + RelNode root = getRelNode(ppl); + + String expectedLogical = + "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" + + " LogicalFilter(condition=[=($0, 'no_such_case')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + verifyLogical(root, expectedLogical); + } + // ======================================================================== // Custom ScannableTable for deterministic mvcombine planning tests // ======================================================================== From 6750593319c1e4a90cfcfddd12ef082f3659bc47 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 12:46:46 -0600 Subject: [PATCH 09/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../org/opensearch/sql/calcite/CalciteRelNodeVisitor.java | 8 +++++++- docs/user/ppl/cmd/mvcombine.md | 6 +++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index f9664bfd669..9458c0254f9 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -53,7 +53,13 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFamily; import org.apache.calcite.rel.type.RelDataTypeField; -import org.apache.calcite.rex.*; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.rex.RexCorrelVariable; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexLiteral; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexVisitorImpl; +import org.apache.calcite.rex.RexWindowBounds; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.ArraySqlType; diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md index 62ecc907434..87718e365be 100644 --- a/docs/user/ppl/cmd/mvcombine.md +++ b/docs/user/ppl/cmd/mvcombine.md @@ -7,9 +7,9 @@ The `mvcombine` command groups events that are identical across all fields excep Key aspects of `mvcombine`: * It generates one row per group, where the group keys are all fields currently in the pipeline except the target field. * The target field becomes a multivalue field containing the combined values from the grouped rows. 
-* If `nomv` is specified, the target field is returned as a single scalar string instead of a multivalue array. -* If `delim` is specified, it controls the delimiter only when `nomv` is enabled. -* If some rows are missing the target field, those rows contribute no value to the combined output for that group. +* When `nomv` is specified, the target field is returned as a single scalar string instead of a multivalue array. +* The `delim` parameter controls the delimiter only when `nomv` is enabled. +* Rows missing the target field contribute no value to the combined output for that group. ## Syntax From b2a9d9f5abff8fff63ee4f6717f9800d3a2546c1 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 12:49:34 -0600 Subject: [PATCH 10/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 9458c0254f9..35050850d10 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -53,6 +53,7 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFamily; import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexCorrelVariable; import org.apache.calcite.rex.RexInputRef; From 6510d2b581500c1925225ac9c6820b80722471c8 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 13:51:02 -0600 Subject: [PATCH 11/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 247 +++++++++++++++--- 1 file changed, 207 insertions(+), 40 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 35050850d10..8e540370cdf 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3118,6 +3118,139 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } + // /** + // * mvcombine command visitor to collapse rows that are identical on all fields except the + // target + // * field, and combine the target field values into a multivalue (array) field. + // * + // * + // * @param node mvcombine command to be visited + // * @param context CalcitePlanContext containing the RelBuilder, RexBuilder, and resolution + // context + // * @return RelNode representing records collapsed by all non-target fields with the target + // field + // * combined into a multivalue array (or a delimited string when {@code nomv=true}) + // */ + // @Override + // public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { + // // 1) Lower child first + // visitChildren(node, context); + // + // final RelBuilder relBuilder = context.relBuilder; + // final RexBuilder rexBuilder = context.rexBuilder; + // + // final RelNode input = relBuilder.peek(); + // final List inputFieldNames = input.getRowType().getFieldNames(); + // + // // 2) Accept delim (default is single space). 
+ // // NOTE: delim only affects single-value rendering when nomv=true. + // final String delim = node.getDelim(); + // + // // 3) Resolve target field to an input ref (must be a direct field reference) + // final Field targetField = node.getField(); + // final RexNode targetRex = rexVisitor.analyze(targetField, context); + // if (!(targetRex instanceof RexInputRef)) { + // throw new SemanticCheckException( + // "mvcombine target must be a direct field reference, but got: " + targetField); + // } + // final int targetIndex = ((RexInputRef) targetRex).getIndex(); + // final String targetName = inputFieldNames.get(targetIndex); + // + // // 4) Group key = all fields except target + // final List groupExprs = new ArrayList<>(); + // for (int i = 0; i < inputFieldNames.size(); i++) { + // if (i == targetIndex) continue; + // groupExprs.add(relBuilder.field(i)); + // } + // + // // 5) Aggregate target values: COLLECT => MULTISET + // final RelBuilder.AggCall aggCall = + // relBuilder + // .aggregateCall(SqlStdOperatorTable.COLLECT, relBuilder.field(targetIndex)) + // .as(targetName); + // + // relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); + // + // // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder + // // After aggregate => [group fields..., targetAgg(multiset)] + // final int aggPos = groupExprs.size(); + // + // final RelDataType elemType = input.getRowType().getFieldList().get(targetIndex).getType(); + // final RelDataType arrayType = relBuilder.getTypeFactory().createArrayType(elemType, -1); + // + // final List projections = new ArrayList<>(inputFieldNames.size()); + // final List projNames = new ArrayList<>(inputFieldNames.size()); + // + // int groupPos = 0; + // for (int i = 0; i < inputFieldNames.size(); i++) { + // projNames.add(inputFieldNames.get(i)); + // if (i == targetIndex) { + // final RexNode multisetRef = relBuilder.field(aggPos); // MULTISET + // projections.add(rexBuilder.makeCast(arrayType, multisetRef)); // ARRAY + // } else { + // projections.add(relBuilder.field(groupPos)); + // groupPos++; + // } + // } + // relBuilder.project(projections, projNames, /* force= */ true); + // + // // 7) nomv=true converts multivalue output to a single delimited string. + // // arrayToString in this engine supports only String/ByteString, so cast elements to STRING + // // first. + // if (node.isNomv()) { + // final List nomvProjections = new ArrayList<>(inputFieldNames.size()); + // final List nomvNames = new ArrayList<>(inputFieldNames.size()); + // + // // Build ARRAY type once + // final RelDataType stringType = + // relBuilder + // .getTypeFactory() + // .createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); + // final RelDataType stringArrayType = + // relBuilder.getTypeFactory().createArrayType(stringType, -1); + // + // for (int i = 0; i < inputFieldNames.size(); i++) { + // final String name = inputFieldNames.get(i); + // nomvNames.add(name); + // + // if (i == targetIndex) { + // // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so + // // ARRAY_TO_STRING works. 
+ // final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); + // + // nomvProjections.add( + // relBuilder.call( + // org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, + // stringArray, + // relBuilder.literal(delim))); + // } else { + // nomvProjections.add(relBuilder.field(i)); + // } + // } + // + // relBuilder.project(nomvProjections, nomvNames, /* force= */ true); + // } + // + // return relBuilder.peek(); + // } + + /** + * mvcombine command visitor to collapse rows that are identical on all fields except the target + * field, and combine the target field values into a multivalue (array) field. + * + *

Implementation notes: + * + *

Groups by all input fields except the target field. Aggregates target using {@code COLLECT} + * (MULTISET) and casts MULTISET -> ARRAY. Preserves original output column order. If {@code + * nomv=true}, renders target array as a scalar string via {@code ARRAY_TO_STRING} using {@code + * delim}. {@code delim} only affects output when {@code nomv=true}. + * + * @param node mvcombine command to be visited + * @param context CalcitePlanContext containing the RelBuilder, RexBuilder, and resolution context + * @return RelNode representing collapsed records with the target combined into a multivalue array + * (or a delimited string when {@code nomv=true}) + * @throws SemanticCheckException if the mvcombine target is not a direct field reference + */ @Override public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { // 1) Lower child first @@ -3135,30 +3268,65 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { // 3) Resolve target field to an input ref (must be a direct field reference) final Field targetField = node.getField(); + final int targetIndex = resolveTargetIndex(targetField, context); + final String targetName = inputFieldNames.get(targetIndex); + + // 4) Group key = all fields except target + final List groupExprs = + buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); + + // 5) Aggregate target values: COLLECT => MULTISET + performCollectAggregation(relBuilder, targetIndex, targetName, groupExprs); + + // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder + restoreColumnOrderWithArrayCast( + relBuilder, rexBuilder, input, inputFieldNames, targetIndex, groupExprs); + + // 7) nomv=true converts multivalue output to a single delimited string. 
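+    //    delim is consumed only on this path; without nomv the collected ARRAY is returned as-is.
+    //    For example, a collected ["10", "20", "30"] with delim="," renders as "10,20,30".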
+ if (node.isNomv()) { + applyNomvConversion(relBuilder, rexBuilder, inputFieldNames, targetIndex, delim); + } + + return relBuilder.peek(); + } + + private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { final RexNode targetRex = rexVisitor.analyze(targetField, context); if (!(targetRex instanceof RexInputRef)) { throw new SemanticCheckException( "mvcombine target must be a direct field reference, but got: " + targetField); } - final int targetIndex = ((RexInputRef) targetRex).getIndex(); - final String targetName = inputFieldNames.get(targetIndex); + return ((RexInputRef) targetRex).getIndex(); + } - // 4) Group key = all fields except target + private List buildGroupExpressionsExcludingTarget( + int targetIndex, List inputFieldNames, RelBuilder relBuilder) { final List groupExprs = new ArrayList<>(); for (int i = 0; i < inputFieldNames.size(); i++) { if (i == targetIndex) continue; groupExprs.add(relBuilder.field(i)); } + return groupExprs; + } - // 5) Aggregate target values: COLLECT => MULTISET + private void performCollectAggregation( + RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { final RelBuilder.AggCall aggCall = relBuilder .aggregateCall(SqlStdOperatorTable.COLLECT, relBuilder.field(targetIndex)) .as(targetName); relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); + } + + private void restoreColumnOrderWithArrayCast( + RelBuilder relBuilder, + RexBuilder rexBuilder, + RelNode input, + List inputFieldNames, + int targetIndex, + List groupExprs) { - // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder // After aggregate => [group fields..., targetAgg(multiset)] final int aggPos = groupExprs.size(); @@ -3179,46 +3347,45 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { groupPos++; } } + relBuilder.project(projections, projNames, /* force= */ true); + } - // 7) nomv=true converts multivalue output to a single delimited string. - // arrayToString in this engine supports only String/ByteString, so cast elements to STRING - // first. - if (node.isNomv()) { - final List nomvProjections = new ArrayList<>(inputFieldNames.size()); - final List nomvNames = new ArrayList<>(inputFieldNames.size()); - - // Build ARRAY type once - final RelDataType stringType = - relBuilder - .getTypeFactory() - .createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); - final RelDataType stringArrayType = - relBuilder.getTypeFactory().createArrayType(stringType, -1); - - for (int i = 0; i < inputFieldNames.size(); i++) { - final String name = inputFieldNames.get(i); - nomvNames.add(name); - - if (i == targetIndex) { - // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so - // ARRAY_TO_STRING works. 
- final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); - - nomvProjections.add( - relBuilder.call( - org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, - stringArray, - relBuilder.literal(delim))); - } else { - nomvProjections.add(relBuilder.field(i)); - } - } + private void applyNomvConversion( + RelBuilder relBuilder, + RexBuilder rexBuilder, + List inputFieldNames, + int targetIndex, + String delim) { - relBuilder.project(nomvProjections, nomvNames, /* force= */ true); + final List nomvProjections = new ArrayList<>(inputFieldNames.size()); + final List nomvNames = new ArrayList<>(inputFieldNames.size()); + + // Build ARRAY type once + final RelDataType stringType = + relBuilder.getTypeFactory().createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); + final RelDataType stringArrayType = relBuilder.getTypeFactory().createArrayType(stringType, -1); + + for (int i = 0; i < inputFieldNames.size(); i++) { + final String name = inputFieldNames.get(i); + nomvNames.add(name); + + if (i == targetIndex) { + // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so + // ARRAY_TO_STRING works. + final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); + + nomvProjections.add( + relBuilder.call( + org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, + stringArray, + relBuilder.literal(delim))); + } else { + nomvProjections.add(relBuilder.field(i)); + } } - return relBuilder.peek(); + relBuilder.project(nomvProjections, nomvNames, /* force= */ true); } @Override From a93660b2b9803f3431a16d8acaae2fe9f7343120 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 13:51:54 -0600 Subject: [PATCH 12/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 116 ------------------ 1 file changed, 116 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 8e540370cdf..b5b54ee2f3a 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3118,122 +3118,6 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } - // /** - // * mvcombine command visitor to collapse rows that are identical on all fields except the - // target - // * field, and combine the target field values into a multivalue (array) field. - // * - // * - // * @param node mvcombine command to be visited - // * @param context CalcitePlanContext containing the RelBuilder, RexBuilder, and resolution - // context - // * @return RelNode representing records collapsed by all non-target fields with the target - // field - // * combined into a multivalue array (or a delimited string when {@code nomv=true}) - // */ - // @Override - // public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { - // // 1) Lower child first - // visitChildren(node, context); - // - // final RelBuilder relBuilder = context.relBuilder; - // final RexBuilder rexBuilder = context.rexBuilder; - // - // final RelNode input = relBuilder.peek(); - // final List inputFieldNames = input.getRowType().getFieldNames(); - // - // // 2) Accept delim (default is single space). - // // NOTE: delim only affects single-value rendering when nomv=true. 
- // final String delim = node.getDelim(); - // - // // 3) Resolve target field to an input ref (must be a direct field reference) - // final Field targetField = node.getField(); - // final RexNode targetRex = rexVisitor.analyze(targetField, context); - // if (!(targetRex instanceof RexInputRef)) { - // throw new SemanticCheckException( - // "mvcombine target must be a direct field reference, but got: " + targetField); - // } - // final int targetIndex = ((RexInputRef) targetRex).getIndex(); - // final String targetName = inputFieldNames.get(targetIndex); - // - // // 4) Group key = all fields except target - // final List groupExprs = new ArrayList<>(); - // for (int i = 0; i < inputFieldNames.size(); i++) { - // if (i == targetIndex) continue; - // groupExprs.add(relBuilder.field(i)); - // } - // - // // 5) Aggregate target values: COLLECT => MULTISET - // final RelBuilder.AggCall aggCall = - // relBuilder - // .aggregateCall(SqlStdOperatorTable.COLLECT, relBuilder.field(targetIndex)) - // .as(targetName); - // - // relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); - // - // // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder - // // After aggregate => [group fields..., targetAgg(multiset)] - // final int aggPos = groupExprs.size(); - // - // final RelDataType elemType = input.getRowType().getFieldList().get(targetIndex).getType(); - // final RelDataType arrayType = relBuilder.getTypeFactory().createArrayType(elemType, -1); - // - // final List projections = new ArrayList<>(inputFieldNames.size()); - // final List projNames = new ArrayList<>(inputFieldNames.size()); - // - // int groupPos = 0; - // for (int i = 0; i < inputFieldNames.size(); i++) { - // projNames.add(inputFieldNames.get(i)); - // if (i == targetIndex) { - // final RexNode multisetRef = relBuilder.field(aggPos); // MULTISET - // projections.add(rexBuilder.makeCast(arrayType, multisetRef)); // ARRAY - // } else { - // projections.add(relBuilder.field(groupPos)); - // groupPos++; - // } - // } - // relBuilder.project(projections, projNames, /* force= */ true); - // - // // 7) nomv=true converts multivalue output to a single delimited string. - // // arrayToString in this engine supports only String/ByteString, so cast elements to STRING - // // first. - // if (node.isNomv()) { - // final List nomvProjections = new ArrayList<>(inputFieldNames.size()); - // final List nomvNames = new ArrayList<>(inputFieldNames.size()); - // - // // Build ARRAY type once - // final RelDataType stringType = - // relBuilder - // .getTypeFactory() - // .createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); - // final RelDataType stringArrayType = - // relBuilder.getTypeFactory().createArrayType(stringType, -1); - // - // for (int i = 0; i < inputFieldNames.size(); i++) { - // final String name = inputFieldNames.get(i); - // nomvNames.add(name); - // - // if (i == targetIndex) { - // // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so - // // ARRAY_TO_STRING works. 
- // final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); - // - // nomvProjections.add( - // relBuilder.call( - // org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, - // stringArray, - // relBuilder.literal(delim))); - // } else { - // nomvProjections.add(relBuilder.field(i)); - // } - // } - // - // relBuilder.project(nomvProjections, nomvNames, /* force= */ true); - // } - // - // return relBuilder.peek(); - // } - /** * mvcombine command visitor to collapse rows that are identical on all fields except the target * field, and combine the target field values into a multivalue (array) field. From 776fcad4fe7c344dcb538f8ff206789448de1afe Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 14:07:19 -0600 Subject: [PATCH 13/32] Address coderrabbit comments Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index b5b54ee2f3a..5d4f9522b1d 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3180,7 +3180,22 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { throw new SemanticCheckException( "mvcombine target must be a direct field reference, but got: " + targetField); } - return ((RexInputRef) targetRex).getIndex(); + + final int index = ((RexInputRef) targetRex).getIndex(); + + // Validate the target field is scalar-ish. mvcombine outputs ARRAY, so target must not + // already be ARRAY. + final RelDataType fieldType = + context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); + + // Prefer Calcite type inspection instead of relying on rel type name string checks. + if (fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.ARRAY + || fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.MULTISET) { + throw new SemanticCheckException( + "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); + } + + return index; } private List buildGroupExpressionsExcludingTarget( From 9a83103a0a9eacbc82c790ef8f505b2772145e55 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 8 Jan 2026 15:33:34 -0600 Subject: [PATCH 14/32] Add mvcombine to index.md Signed-off-by: Srikanth Padakanti --- docs/user/ppl/index.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/user/ppl/index.md b/docs/user/ppl/index.md index 30ad7159182..2472a921d85 100644 --- a/docs/user/ppl/index.md +++ b/docs/user/ppl/index.md @@ -78,8 +78,10 @@ source=accounts | [describe command](cmd/describe.md) | 2.1 | stable (since 2.1) | Query the metadata of an index. | | [explain command](cmd/explain.md) | 3.1 | stable (since 3.1) | Explain the plan of query. | | [show datasources command](cmd/showdatasources.md) | 2.4 | stable (since 2.4) | Query datasources configured in the PPL engine. | - | [addtotals command](cmd/addtotals.md) | 3.4 | stable (since 3.4) | Adds row and column values and appends a totals column and row. | - | [addcoltotals command](cmd/addcoltotals.md) | 3.4 | stable (since 3.4) | Adds column values and appends a totals row. 
| +| [addtotals command](cmd/addtotals.md) | 3.4 | stable (since 3.4) | Adds row and column values and appends a totals column and row. | +| [addcoltotals command](cmd/addcoltotals.md) | 3.4 | stable (since 3.4) | Adds column values and appends a totals row. | +| [mvcombine command](cmd/mvcombine.md) | 3.4 | stable (since 3.4) | Combines values of a specified field across rows identical on all other fields. | + - [Syntax](cmd/syntax.md) - PPL query structure and command syntax formatting * **Functions** From 26ba22ff5587312a538f647aca4b374ebe5c461b Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Fri, 9 Jan 2026 14:47:09 -0600 Subject: [PATCH 15/32] Remove the nomv related implementation as that command is still not yet implemented Signed-off-by: Srikanth Padakanti --- .../org/opensearch/sql/ast/dsl/AstDSL.java | 8 +- .../opensearch/sql/ast/tree/MvCombine.java | 4 +- .../sql/calcite/CalciteRelNodeVisitor.java | 163 ++++++++++-------- docs/user/ppl/cmd/mvcombine.md | 81 ++++----- .../remote/CalciteMvCombineCommandIT.java | 87 ++-------- ppl/src/main/antlr/OpenSearchPPLLexer.g4 | 1 - ppl/src/main/antlr/OpenSearchPPLParser.g4 | 5 +- .../opensearch/sql/ppl/parser/AstBuilder.java | 9 +- .../ppl/calcite/CalcitePPLMvCombineTest.java | 63 ++++--- 9 files changed, 180 insertions(+), 241 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index 356ab769aa3..8b129c6267a 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -470,15 +470,11 @@ public static List defaultDedupArgs() { } public static MvCombine mvcombine(Field field) { - return new MvCombine(field, null, false); + return new MvCombine(field, null); } public static MvCombine mvcombine(Field field, String delim) { - return new MvCombine(field, delim, false); - } - - public static MvCombine mvcombine(Field field, String delim, boolean nomv) { - return new MvCombine(field, delim, nomv); + return new MvCombine(field, delim); } public static List sortOptions() { diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java b/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java index 219088ebae1..ba94aa10978 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/MvCombine.java @@ -22,12 +22,10 @@ public class MvCombine extends UnresolvedPlan { private final Field field; private final String delim; @Nullable private UnresolvedPlan child; - private final boolean nomv; - public MvCombine(Field field, @Nullable String delim, boolean nomv) { + public MvCombine(Field field, @Nullable String delim) { this.field = field; this.delim = (delim == null) ? 
" " : delim; - this.nomv = nomv; } public MvCombine attach(UnresolvedPlan child) { diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index f89b63cfc98..6deee96541e 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -61,6 +61,7 @@ import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexVisitorImpl; import org.apache.calcite.rex.RexWindowBounds; +import org.apache.calcite.rel.hint.RelHint; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.ArraySqlType; @@ -3054,22 +3055,18 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { * mvcombine command visitor to collapse rows that are identical on all fields except the target * field, and combine the target field values into a multivalue (array) field. * - *

Implementation notes: - * - *

Groups by all input fields except the target field. Aggregates target using {@code COLLECT} - * (MULTISET) and casts MULTISET -> ARRAY. Preserves original output column order. If {@code - * nomv=true}, renders target array as a scalar string via {@code ARRAY_TO_STRING} using {@code - * delim}. {@code delim} only affects output when {@code nomv=true}. + *

Implementation notes:Groups by all input fields except the target field. Aggregates target + * values using {@code COLLECT} (MULTISET). Casts the aggregation result from MULTISET to ARRAY + * for a stable multivalue output type. Preserves the original output column order. * * @param node mvcombine command to be visited * @param context CalcitePlanContext containing the RelBuilder, RexBuilder, and resolution context * @return RelNode representing collapsed records with the target combined into a multivalue array - * (or a delimited string when {@code nomv=true}) * @throws SemanticCheckException if the mvcombine target is not a direct field reference */ @Override public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { - // 1) Lower child first + // 1) Lower the child plan first so the RelBuilder has the input schema on the stack. visitChildren(node, context); final RelBuilder relBuilder = context.relBuilder; @@ -3078,34 +3075,40 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final RelNode input = relBuilder.peek(); final List inputFieldNames = input.getRowType().getFieldNames(); - // 2) Accept delim (default is single space). - // NOTE: delim only affects single-value rendering when nomv=true. - final String delim = node.getDelim(); - - // 3) Resolve target field to an input ref (must be a direct field reference) + // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). final Field targetField = node.getField(); final int targetIndex = resolveTargetIndex(targetField, context); final String targetName = inputFieldNames.get(targetIndex); - // 4) Group key = all fields except target + // 3) Group by all fields except the target. final List groupExprs = buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); - // 5) Aggregate target values: COLLECT => MULTISET + // 4) Aggregate target values using COLLECT, filtering out NULLs (Splunk-like behavior). performCollectAggregation(relBuilder, targetIndex, targetName, groupExprs); - // 6) Restore original output column order AND cast MULTISET -> ARRAY using RexBuilder + // 5) Restore original output column order, and cast COLLECT's MULTISET output to ARRAY. restoreColumnOrderWithArrayCast( relBuilder, rexBuilder, input, inputFieldNames, targetIndex, groupExprs); - // 7) nomv=true converts multivalue output to a single delimited string. - if (node.isNomv()) { - applyNomvConversion(relBuilder, rexBuilder, inputFieldNames, targetIndex, delim); - } - return relBuilder.peek(); } + /** + * Resolves the mvcombine target expression to an input field index. + * + *

mvcombine requires the target to be a direct field reference (RexInputRef). This keeps the + * command semantics predictable and avoids accidental grouping on computed expressions. + * + *

The target must also be a scalar-ish field. mvcombine outputs ARRAY<T>, so the input + * target cannot already be an ARRAY or MULTISET. + * + * @param targetField Target field expression from the AST + * @param context Planning context + * @return 0-based input field index for the target + * @throws SemanticCheckException if the target is not a direct field reference or has an array + * type + */ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { final RexNode targetRex = rexVisitor.analyze(targetField, context); if (!(targetRex instanceof RexInputRef)) { @@ -3115,12 +3118,9 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { final int index = ((RexInputRef) targetRex).getIndex(); - // Validate the target field is scalar-ish. mvcombine outputs ARRAY, so target must not - // already be ARRAY. final RelDataType fieldType = context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); - // Prefer Calcite type inspection instead of relying on rel type name string checks. if (fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.ARRAY || fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.MULTISET) { throw new SemanticCheckException( @@ -3130,26 +3130,76 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { return index; } + /** + * Builds group-by expressions for mvcombine: all input fields except the target field. + * + * @param targetIndex Input index of the mvcombine target field + * @param inputFieldNames Input schema field names (for sizing/ordering) + * @param relBuilder RelBuilder positioned on the input + * @return Group-by expressions in input order excluding the target + */ private List buildGroupExpressionsExcludingTarget( int targetIndex, List inputFieldNames, RelBuilder relBuilder) { - final List groupExprs = new ArrayList<>(); + final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); for (int i = 0; i < inputFieldNames.size(); i++) { - if (i == targetIndex) continue; + if (i == targetIndex) { + continue; + } groupExprs.add(relBuilder.field(i)); } return groupExprs; } + /** + * Applies mvcombine aggregation: + * + *
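+   * <p>For example, with input fields (ip, bytes, tags, packets) and target {@code packets}, the
+   * group key is (ip, bytes, tags) and each distinct key emits one row whose {@code packets}
+   * values are collected together.
+   *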

GROUP BY all non-target fields, and aggregate target values using {@code COLLECT}. {@code + * COLLECT} produces a MULTISET in Calcite, which we later cast to ARRAY for output. + * + *

NULL target values are excluded from the collected multivalue list by applying an aggregate + * filter. This matches typical "combine values" semantics and avoids polluting the result with + * NULL elements. + * + * @param relBuilder RelBuilder positioned on the input + * @param targetIndex Target field input index + * @param targetName Target field output name (preserved) + * @param groupExprs Group-by expressions (all fields except target) + */ private void performCollectAggregation( RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { + + final RexNode targetRef = relBuilder.field(targetIndex); + final RexNode notNullTarget = relBuilder.isNotNull(targetRef); + final RelBuilder.AggCall aggCall = relBuilder - .aggregateCall(SqlStdOperatorTable.COLLECT, relBuilder.field(targetIndex)) + .aggregateCall(SqlStdOperatorTable.COLLECT, targetRef) + .filter(notNullTarget) .as(targetName); relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); } + /** + * Restores the original output column order after the aggregate step and converts the collected + * target from MULTISET to ARRAY<T>. + * + *

After aggregation, the schema is: + * + *

+   *   [groupField0, groupField1, ..., groupFieldN, targetAggMultiset]
+   * 
+ * + *

This method projects fields back to the original input order, replacing the original target + * slot with {@code CAST(targetAggMultiset AS ARRAY<T>)}. + * + * @param relBuilder RelBuilder positioned on the post-aggregate node + * @param rexBuilder RexBuilder for explicit casts + * @param input Original input RelNode (used to derive the target element type) + * @param inputFieldNames Original input field names (also output field names) + * @param targetIndex Target field index in the original input + * @param groupExprs Group-by expressions used during aggregation + */ private void restoreColumnOrderWithArrayCast( RelBuilder relBuilder, RexBuilder rexBuilder, @@ -3158,65 +3208,32 @@ private void restoreColumnOrderWithArrayCast( int targetIndex, List groupExprs) { - // After aggregate => [group fields..., targetAgg(multiset)] - final int aggPos = groupExprs.size(); + // Post-aggregate: group fields come first, and the collected target is appended at the end. + final int collectedTargetPos = groupExprs.size(); - final RelDataType elemType = input.getRowType().getFieldList().get(targetIndex).getType(); - final RelDataType arrayType = relBuilder.getTypeFactory().createArrayType(elemType, -1); + final RelDataType targetElemType = input.getRowType().getFieldList().get(targetIndex).getType(); + final RelDataType targetArrayType = + relBuilder.getTypeFactory().createArrayType(targetElemType, -1); final List projections = new ArrayList<>(inputFieldNames.size()); - final List projNames = new ArrayList<>(inputFieldNames.size()); + final List projectionNames = new ArrayList<>(inputFieldNames.size()); int groupPos = 0; for (int i = 0; i < inputFieldNames.size(); i++) { - projNames.add(inputFieldNames.get(i)); + projectionNames.add(inputFieldNames.get(i)); + if (i == targetIndex) { - final RexNode multisetRef = relBuilder.field(aggPos); // MULTISET - projections.add(rexBuilder.makeCast(arrayType, multisetRef)); // ARRAY + // COLLECT returns MULTISET; normalize output to ARRAY. + final RexNode multisetRef = relBuilder.field(collectedTargetPos); + projections.add(rexBuilder.makeCast(targetArrayType, multisetRef)); } else { projections.add(relBuilder.field(groupPos)); groupPos++; } } - relBuilder.project(projections, projNames, /* force= */ true); - } - - private void applyNomvConversion( - RelBuilder relBuilder, - RexBuilder rexBuilder, - List inputFieldNames, - int targetIndex, - String delim) { - - final List nomvProjections = new ArrayList<>(inputFieldNames.size()); - final List nomvNames = new ArrayList<>(inputFieldNames.size()); - - // Build ARRAY type once - final RelDataType stringType = - relBuilder.getTypeFactory().createSqlType(org.apache.calcite.sql.type.SqlTypeName.VARCHAR); - final RelDataType stringArrayType = relBuilder.getTypeFactory().createArrayType(stringType, -1); - - for (int i = 0; i < inputFieldNames.size(); i++) { - final String name = inputFieldNames.get(i); - nomvNames.add(name); - - if (i == targetIndex) { - // At this point relBuilder.field(i) is ARRAY. Cast to ARRAY so - // ARRAY_TO_STRING works. - final RexNode stringArray = rexBuilder.makeCast(stringArrayType, relBuilder.field(i)); - - nomvProjections.add( - relBuilder.call( - org.apache.calcite.sql.fun.SqlLibraryOperators.ARRAY_TO_STRING, - stringArray, - relBuilder.literal(delim))); - } else { - nomvProjections.add(relBuilder.field(i)); - } - } - - relBuilder.project(nomvProjections, nomvNames, /* force= */ true); + // Force projection to avoid Calcite "identity" short-circuit when only names/types change. 
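+    // (Without force=true, RelBuilder.project may treat a rename-only mapping as an identity
+    // projection and skip it, losing the restored field names.)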
+ relBuilder.project(projections, projectionNames, /* force= */ true); } @Override diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md index 87718e365be..e816de88d17 100644 --- a/docs/user/ppl/cmd/mvcombine.md +++ b/docs/user/ppl/cmd/mvcombine.md @@ -2,27 +2,51 @@ ## Description -The `mvcombine` command groups events that are identical across all fields except the target field, and combines the target field values into a multivalue (array) field. All other fields in the original events are preserved in the output. +The `mvcombine` command groups rows that are identical across all fields except a specified target field, and combines the values of that target field into a multivalue (array) field. All other fields in the input rows are preserved as group keys in the output. -Key aspects of `mvcombine`: -* It generates one row per group, where the group keys are all fields currently in the pipeline except the target field. -* The target field becomes a multivalue field containing the combined values from the grouped rows. -* When `nomv` is specified, the target field is returned as a single scalar string instead of a multivalue array. -* The `delim` parameter controls the delimiter only when `nomv` is enabled. -* Rows missing the target field contribute no value to the combined output for that group. +`mvcombine` is a transforming command: it consumes a set of input results and produces a new result set with reduced cardinality. + +### Key behaviors + +- Rows are grouped by **all fields currently in the pipeline except the target field**. +- One output row is produced per group. +- The target field becomes a **multivalue field** containing the combined values from all rows in the group. +- Rows where the target field is missing or null do **not** contribute a value to the combined multivalue output. +- The default output is a multivalue representation (array). + +Delimiter handling (`delim`) is parsed but has **no effect on the output** unless the `nomv` command is used. Since `nomv` is not yet implemented in OpenSearch, `delim` is currently inert and does not affect execution. + +--- ## Syntax -mvcombine [nomv=] [delim=] +mvcombine [delim=] + +### Arguments + +- **field** (required) + The name of the field whose values are combined into a multivalue field. + +- **delim** (optional) + A string delimiter for rendering a single-value representation of the combined field. + This option has no observable effect unless `nomv` is used. + +--- + +## Semantics -* field: mandatory. The field whose values are combined into a multivalue field. -* nomv: optional. If specified, returns the combined values as a single string (not a multivalue array). -* delim: optional. Delimiter used to join values when `nomv` is specified. Defaults to space (` `) when not provided. -* “When nomv is not enabled, the result is a multivalue field; in tabular output it may be displayed in bracket notation with comma-separated elements (e.g., [10,20,30]).” +Given a set of input rows: + +- All rows that have identical values for every field **except** the target field are grouped together. +- The target field must be a single-valued (scalar) field. +- The output schema preserves the original field order. +- The target field is returned as a multivalue (array) field. 
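+
+As a rough sketch of how the Calcite engine plans this command (mirroring the expected plans in the
+unit tests of this change): it lowers to an aggregation that groups by every non-target field,
+collects the target with `COLLECT` while filtering out null values, and casts the collected
+MULTISET to an ARRAY so the output is a stable multivalue type:
+
+```text
+LogicalProject(..., packets=[CAST($2):INTEGER ARRAY NOT NULL])
+  LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])
+    LogicalProject(..., $f3=[IS NOT NULL($2)])
+      LogicalTableScan(...)
+```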
+ +--- ## Example 1: Basic mvcombine -Given a dataset `mvcombine` with the following data: +Given the following input rows: ```text {"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"10"} @@ -78,35 +102,7 @@ fetched rows / total rows = 2/2 +----------+-------+------+-------------+ ``` -Example 3: mvcombine with nomv - -Given a dataset mvcombine with the following data: -```text -{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"10"} -{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"20"} -{"ip":"10.0.0.1","bytes":100,"tags":"t1","packets_str":"30"} -``` - -The following query returns packets_str as a single string instead of a multivalue array: -```ppl -source=mvcombine_data -| where ip='10.0.0.1' and bytes=100 and tags='t1' -| fields ip, bytes, tags, packets_str -| sort packets_str -| mvcombine packets_str nomv=true -``` - -Expected output: -```text -fetched rows / total rows = 1/1 -+----------+-------+------+-------------+ -| ip | bytes | tags | packets_str | -|----------+-------+------+-------------| -| 10.0.0.1 | 100 | t1 | 10 20 30 | -+----------+-------+------+-------------+ -``` - -Example 4: Missing target field in some rows +Example 3: Missing target field in some rows Rows missing the target field do not contribute a value to the combined output. @@ -147,5 +143,4 @@ Expected output: ```text {'reason': 'Invalid Query', 'details': 'Field [does_not_exist] not found.', 'type': 'IllegalArgumentException'} Error: Query returned no data - ``` \ No newline at end of file diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index f15208fcbd3..377cfbf164b 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -11,7 +11,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.regex.Pattern; import org.json.JSONArray; import org.json.JSONObject; import org.junit.jupiter.api.Assertions; @@ -153,79 +152,11 @@ public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOEx } // --------------------------- - // delim/nomv: happy paths + edge case + // delim: Splunk-compatible command input + output shape // --------------------------- @Test - public void testMvCombine_nomv_defaultDelim_ifSupported_elseSyntaxRejected() throws Exception { - String base = - "source=" - + INDEX - + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" - + " | fields ip, bytes, tags, packets_str"; - - String q = base + " | mvcombine packets_str nomv"; - - try { - JSONObject result = executeQuery(q); - verifyNumOfRows(result, 1); - - Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); - Assertions.assertTrue( - cell instanceof String, "Expected nomv output scalar string, got: " + cell); - - String s = (String) cell; - Assertions.assertTrue( - Pattern.compile("1.*2.*3").matcher(s).find(), - "Expected nomv string to contain values 1,2,3 in order, got: " + s); - } catch (ResponseException e) { - Assertions.assertTrue( - isSyntaxBadRequest(e), - "Expected syntax rejection if nomv unsupported, got: " + e.getMessage()); - } - } - - @Test - public void testMvCombine_nomvWithCustomDelim_ifSupported_elseSyntaxRejected() throws Exception { - String base = - "source=" - + INDEX - + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" - + " | fields 
ip, bytes, tags, packets_str"; - - String q1 = base + " | mvcombine packets_str nomv delim='|'"; - String q2 = base + " | mvcombine packets_str delim='|' nomv"; - - JSONObject result; - try { - result = executeQuery(q1); - } catch (ResponseException e1) { - if (!isSyntaxBadRequest(e1)) throw e1; - - try { - result = executeQuery(q2); - } catch (ResponseException e2) { - Assertions.assertTrue( - isSyntaxBadRequest(e2), - "Expected syntax rejection for unsupported nomv/delim, got: " + e2.getMessage()); - return; // unsupported -> acceptable - } - } - - verifyNumOfRows(result, 1); - Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); - Assertions.assertTrue( - cell instanceof String, "Expected nomv output scalar string, got: " + cell); - - String s = (String) cell; - Assertions.assertTrue(s.contains("|"), "Expected delimiter '|' in: " + s); - Assertions.assertTrue( - Pattern.compile("1\\|.*2\\|.*3|1.*\\|.*2.*\\|.*3|1.*2.*3").matcher(s).find(), - "Expected values to be present in the joined output, got: " + s); - } - - @Test - public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_elseSyntaxRejected() + public void testMvCombine_delim_shouldNotChangeMvShape_ifSupported_elseSyntaxRejected() throws Exception { String base = "source=" @@ -233,7 +164,8 @@ public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_el + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + " | fields ip, bytes, tags, packets_str"; - String q = base + " | mvcombine packets_str delim='|'"; + // Splunk-style: options before the field + String q = base + " | mvcombine delim='|' packets_str"; try { JSONObject result = executeQuery(q); @@ -242,12 +174,17 @@ public void testMvCombine_delimWithoutNomv_shouldNotChangeMvShape_ifSupported_el Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); Assertions.assertTrue( cell instanceof JSONArray, - "Expected multivalue array (delim without nomv should not coerce to string), got: " - + cell); + "Expected multivalue array (delim should not coerce to string), got: " + cell); + + // Optional sanity: values exist (order not guaranteed) + List mv = toStringListDropNulls(cell); + Assertions.assertTrue(mv.contains("1"), "Expected MV to include 1, got: " + mv); + Assertions.assertTrue(mv.contains("2"), "Expected MV to include 2, got: " + mv); + Assertions.assertTrue(mv.contains("3"), "Expected MV to include 3, got: " + mv); } catch (ResponseException e) { Assertions.assertTrue( isSyntaxBadRequest(e), - "Expected syntax rejection if delim-only unsupported, got: " + e.getMessage()); + "Expected syntax rejection if delim unsupported, got: " + e.getMessage()); } } diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index 7cc6e64d070..7b431dae9ff 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -71,7 +71,6 @@ SHOW_NUMBERED_TOKEN: 'SHOW_NUMBERED_TOKEN'; AGGREGATION: 'AGGREGATION'; APPENDPIPE: 'APPENDPIPE'; MVCOMBINE: 'MVCOMBINE'; -NOMV: 'NOMV'; //Native JOIN KEYWORDS diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 1bf523a146e..ab3749bd70c 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -533,9 +533,7 @@ expandCommand ; mvcombineCommand - : MVCOMBINE fieldExpression - (DELIM EQUAL stringLiteral)? - (NOMV EQUAL booleanLiteral)? + : MVCOMBINE (DELIM EQUAL stringLiteral)? 
fieldExpression ; @@ -1582,7 +1580,6 @@ searchableKeyWord | PARTITIONS | ALLNUM | DELIM - | NOMV | CURRENT | WINDOW | GLOBAL diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index 7157299c398..a5ab0ec8a06 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -878,14 +878,7 @@ public UnresolvedPlan visitMvcombineCommand(OpenSearchPPLParser.MvcombineCommand if (ctx.DELIM() != null) { delim = StringUtils.unquoteText(getTextInQuery(ctx.stringLiteral())); } - - boolean nomv = false; - if (ctx.NOMV() != null) { - // booleanLiteral rule is TRUE | FALSE - nomv = ctx.booleanLiteral().TRUE() != null; - } - - return new MvCombine(field, delim, nomv); + return new MvCombine(field, delim); } @Override diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java index 9874f7256cd..2cf145cd370 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -54,12 +54,12 @@ protected Frameworks.ConfigBuilder config(CalciteAssert.SchemaSpec... schemaSpec new Object[] {"basic", "B", 60}, new Object[] {"basic", "A", 30}, - // new: NULL target values case + // NULL target values case (Splunk-style: nulls do NOT contribute to mv) new Object[] {"nulls", "A", null}, new Object[] {"nulls", "A", 10}, new Object[] {"nulls", "B", null}, - // new: single-row case + // single-row case new Object[] {"single", "Z", 5}); schema.add("MVCOMBINE_DATA", new MvCombineDataTable(rows)); @@ -82,46 +82,50 @@ public void testMvCombineBasic() { RelNode root = getRelNode(ppl); - // NOTE: Your implementation casts COLLECT result (MULTISET) to ARRAY. 
+ // Calcite often lowers FILTER(condition) into: + // - a LogicalProject producing $f3 = + // - aggregate "FILTER $3" String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" - + " LogicalFilter(condition=[=($0, 'basic')])\n" - + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + + " LogicalFilter(condition=[=($0, 'basic')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); } @Test - public void testMvCombineNomvWithCustomDelim() { + public void testMvCombineWithNullTargetValues() { String ppl = "source=MVCOMBINE_DATA " - + "| where case = \"basic\" " + + "| where case = \"nulls\" " + "| fields case, ip, packets " - + "| mvcombine packets delim=\"|\" nomv=true " + + "| mvcombine packets " + "| sort ip"; RelNode root = getRelNode(ppl); - // NOTE: Your implementation does: - // COLLECT -> CAST to INTEGER ARRAY -> CAST to VARCHAR ARRAY -> ARRAY_TO_STRING(..., '|') String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[ARRAY_TO_STRING(CAST(CAST($2):INTEGER" - + " ARRAY NOT NULL):VARCHAR NOT NULL ARRAY NOT NULL, '|')])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" - + " LogicalFilter(condition=[=($0, 'basic')])\n" - + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + + " LogicalFilter(condition=[=($0, 'nulls')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); } @Test - public void testMvCombineWithNullTargetValues() { + public void testMvCombineWithDelimOption_SplunkSyntaxOrder() { + // Splunk syntax/order: mvcombine [delim="..."] + // NOTE: delim does NOT change the output shape (still ARRAY); it’s just an option carried on + // the AST. 
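The two option orderings exercised across this series reduce to query strings like the following
(illustrative only; which ordering parses depends on the grammar revision of the given patch,
since earlier revisions here accept the option before the field and later ones accept it after):

    // Hypothetical test strings, not taken verbatim from the suite.
    String optionBeforeField = "source=MVCOMBINE_DATA | mvcombine delim=\"|\" packets";
    String optionAfterField  = "source=MVCOMBINE_DATA | mvcombine packets delim='|'";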
String ppl = "source=MVCOMBINE_DATA " - + "| where case = \"nulls\" " + + "| where case = \"basic\" " + "| fields case, ip, packets " - + "| mvcombine packets " + + "| mvcombine delim=\"|\" packets " + "| sort ip"; RelNode root = getRelNode(ppl); @@ -129,9 +133,10 @@ public void testMvCombineWithNullTargetValues() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" - + " LogicalFilter(condition=[=($0, 'nulls')])\n" - + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + + " LogicalFilter(condition=[=($0, 'basic')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); } @@ -167,9 +172,10 @@ public void testMvCombineSingleRow() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" - + " LogicalFilter(condition=[=($0, 'single')])\n" - + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + + " LogicalFilter(condition=[=($0, 'single')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); } @@ -187,9 +193,10 @@ public void testMvCombineEmptyResult() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2)])\n" - + " LogicalFilter(condition=[=($0, 'no_such_case')])\n" - + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; + + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + + " LogicalFilter(condition=[=($0, 'no_such_case')])\n" + + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); } From 469b048b695ab594e3efeb31a3a137c26b33d404 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Fri, 9 Jan 2026 14:47:32 -0600 Subject: [PATCH 16/32] Remove the nomv related implementation as that command is still not yet implemented Signed-off-by: Srikanth Padakanti --- .../java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 6deee96541e..a1383a3d187 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -61,7 +61,6 @@ import org.apache.calcite.rex.RexNode; import org.apache.calcite.rex.RexVisitorImpl; import org.apache.calcite.rex.RexWindowBounds; -import org.apache.calcite.rel.hint.RelHint; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.ArraySqlType; @@ -3080,7 +3079,7 @@ public RelNode visitMvCombine(MvCombine node, 
CalcitePlanContext context) { final int targetIndex = resolveTargetIndex(targetField, context); final String targetName = inputFieldNames.get(targetIndex); - // 3) Group by all fields except the target. + // 3) Group by all fields except the target. final List groupExprs = buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); From 1259270daa0186fc526d91f766aeb09753011843 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Fri, 9 Jan 2026 14:48:40 -0600 Subject: [PATCH 17/32] Remove the nomv related implementation as that command is still not yet implemented Signed-off-by: Srikanth Padakanti --- .../java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index a1383a3d187..425d733e22b 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3083,7 +3083,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final List groupExprs = buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); - // 4) Aggregate target values using COLLECT, filtering out NULLs (Splunk-like behavior). + // 4) Aggregate target values using COLLECT, filtering out NULLs. performCollectAggregation(relBuilder, targetIndex, targetName, groupExprs); // 5) Restore original output column order, and cast COLLECT's MULTISET output to ARRAY. From 11fb3e0f31f0b7730c3cf08cdae82612d2859525 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Fri, 9 Jan 2026 14:55:25 -0600 Subject: [PATCH 18/32] Remove the nomv related implementation as that command is still not yet implemented Signed-off-by: Srikanth Padakanti --- .../sql/ppl/calcite/CalcitePPLMvCombineTest.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java index 2cf145cd370..a094eeffdd7 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -82,9 +82,6 @@ public void testMvCombineBasic() { RelNode root = getRelNode(ppl); - // Calcite often lowers FILTER(condition) into: - // - a LogicalProject producing $f3 = - // - aggregate "FILTER $3" String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" @@ -118,9 +115,6 @@ public void testMvCombineWithNullTargetValues() { @Test public void testMvCombineWithDelimOption_SplunkSyntaxOrder() { - // Splunk syntax/order: mvcombine [delim="..."] - // NOTE: delim does NOT change the output shape (still ARRAY); it’s just an option carried on - // the AST. String ppl = "source=MVCOMBINE_DATA " + "| where case = \"basic\" " @@ -150,8 +144,6 @@ public void testMvCombineNonExistentField() { Exception ex = assertThrows(Exception.class, () -> getRelNode(ppl)); - // Keep this loose: different layers may wrap exceptions. - // We just need to prove the command fails for missing target field. String msg = String.valueOf(ex.getMessage()); org.junit.Assert.assertTrue( "Expected error message to mention missing field. 
Actual: " + msg, From 2771d0ba8785ae00245e565c40c0cf0c048dadd3 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Wed, 14 Jan 2026 14:25:08 -0600 Subject: [PATCH 19/32] complete the checklist from ppl-commands.md Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteNoPushdownIT.java | 3 +- .../sql/calcite/remote/CalciteExplainIT.java | 12 ++++++++ .../remote/CalciteMvCombineCommandIT.java | 20 ++++++++----- .../sql/ppl/NewAddedCommandsIT.java | 14 +++++++++ .../sql/security/CrossClusterSearchIT.java | 22 ++++++++++++++ .../calcite/explain_mvcombine.yaml | 13 +++++++++ .../explain_mvcombine.yaml | 13 +++++++++ ppl/src/main/antlr/OpenSearchPPLParser.g4 | 3 +- .../sql/ppl/utils/PPLQueryDataAnonymizer.java | 9 ++++++ .../ppl/calcite/CalcitePPLMvCombineTest.java | 29 ++++++++++++++++++- .../ppl/utils/PPLQueryDataAnonymizerTest.java | 13 +++++++++ 11 files changed, 141 insertions(+), 10 deletions(-) create mode 100644 integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml create mode 100644 integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java index c254fb47c44..367af5f6dbb 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/CalciteNoPushdownIT.java @@ -105,7 +105,8 @@ CalciteTrendlineCommandIT.class, CalciteVisualizationFormatIT.class, CalciteWhereCommandIT.class, - CalcitePPLTpchIT.class + CalcitePPLTpchIT.class, + CalciteMvCombineCommandIT.class }) public class CalciteNoPushdownIT { private static boolean wasPushdownEnabled; diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index b6cd327989a..f7e1651ba0d 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -2368,4 +2368,16 @@ public void testExplainBWC() throws IOException { explainQueryToStringBWC(query, format)); } } + + @Test + public void testExplainMvCombine() throws IOException { + String query = + "source=opensearch-sql_test_index_account " + + "| fields state, city, age " + + "| mvcombine age delim=','"; + + String actual = explainQueryYaml(query); + String expected = loadExpectedPlan("explain_mvcombine.yaml"); + assertYamlEqualsIgnoreId(expected, actual); + } } diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 377cfbf164b..7f2ab06752f 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -5,7 +5,11 @@ package org.opensearch.sql.calcite.remote; +import static org.opensearch.sql.util.MatcherUtils.rows; +import static org.opensearch.sql.util.MatcherUtils.schema; +import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; import static org.opensearch.sql.util.MatcherUtils.verifyNumOfRows; +import static org.opensearch.sql.util.MatcherUtils.verifySchema; import java.io.IOException; import java.util.ArrayList; @@ -41,7 +45,7 @@ public void 
testSanity_datasetIsLoaded() throws IOException { } // --------------------------- - // Happy paths (core mvcombine) + // Happy path (core mvcombine) // --------------------------- @Test @@ -69,21 +73,23 @@ public void testMvCombine_basicGroupCollapsesToOneRow() throws IOException { @Test public void testMvCombine_singleRowGroupStaysSingleRow() throws IOException { + // NOTE: Keep output minimal + deterministic to safely verify schema + datarows String q = "source=" + INDEX + " | where ip='10.0.0.2' and bytes=200 and tags='t2'" - + " | fields ip, bytes, tags, packets_str" + + " | fields ip, tags, packets_str" + " | mvcombine packets_str"; JSONObject result = executeQuery(q); - verifyNumOfRows(result, 1); - JSONArray row = result.getJSONArray("datarows").getJSONArray(0); - List mv = toStringListDropNulls(row.get(3)); + verifySchema( + result, + schema("ip", null, "string"), + schema("tags", null, "string"), + schema("packets_str", null, "array")); - Assertions.assertEquals(1, mv.size(), "Expected single-value MV, got " + mv); - Assertions.assertEquals("7", mv.get(0)); + verifyDataRows(result, rows("10.0.0.2", "t2", new JSONArray().put("7"))); } @Test diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/NewAddedCommandsIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/NewAddedCommandsIT.java index 15f3c508b14..45948539c27 100644 --- a/integ-test/src/test/java/org/opensearch/sql/ppl/NewAddedCommandsIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/ppl/NewAddedCommandsIT.java @@ -214,4 +214,18 @@ private void verifyQuery(JSONObject result) throws IOException { assertThat(error.getString("type"), equalTo("UnsupportedOperationException")); } } + + @Test + public void testMvCombineUnsupportedInV2() throws IOException { + JSONObject result; + try { + result = + executeQuery( + String.format( + "source=%s | fields state, city, age | mvcombine age", TEST_INDEX_BANK)); + } catch (ResponseException e) { + result = new JSONObject(TestUtils.getResponseBody(e.getResponse())); + } + verifyQuery(result); + } } diff --git a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java index 7ee90dc4640..05e18520837 100644 --- a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java @@ -287,4 +287,26 @@ public void testCrossClusterAppend() throws IOException { disableCalcite(); } + + /** CrossClusterSearchIT Test for mvcombine. 
*/ + @Test + public void testCrossClusterMvcombine() throws IOException { + enableCalcite(); + + JSONObject result = + executeQuery( + String.format( + "search source=%s | where firstname='Hattie' or firstname='Nanette' " + + "| fields firstname, age | mvcombine age", + TEST_INDEX_BANK_REMOTE)); + + verifyColumn(result, columnName("firstname"), columnName("age")); + + verifyDataRows( + result, + rows("Hattie", new org.json.JSONArray().put(36)), + rows("Nanette", new org.json.JSONArray().put(28))); + + disableCalcite(); + } } diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml new file mode 100644 index 00000000000..768fa9ef384 --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml @@ -0,0 +1,13 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[$0], city=[$1], age=[CAST($2):BIGINT ARRAY NOT NULL]) + LogicalAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + LogicalProject(state=[$7], city=[$5], age=[$8], $f3=[IS NOT NULL($8)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t2):BIGINT ARRAY NOT NULL], proj#0..1=[{exprs}], age=[$t3]) + EnumerableAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[IS NOT NULL($t2)], proj#0..3=[{exprs}]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[state, city, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["state","city","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml new file mode 100644 index 00000000000..29a8aa3deac --- /dev/null +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml @@ -0,0 +1,13 @@ +calcite: + logical: | + LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) + LogicalProject(state=[$0], city=[$1], age=[CAST($2):BIGINT ARRAY NOT NULL]) + LogicalAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + LogicalProject(state=[$7], city=[$5], age=[$8], $f3=[IS NOT NULL($8)]) + CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + physical: | + EnumerableLimit(fetch=[10000]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t2):BIGINT ARRAY NOT NULL], proj#0..1=[{exprs}], age=[$t3]) + EnumerableAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NOT NULL($t8)], state=[$t7], city=[$t5], age=[$t8], $f3=[$t17]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index ab3749bd70c..2999f9c9c88 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -132,6 +132,7 @@ commandName | REX | APPENDPIPE | REPLACE + | MVCOMBINE ; searchCommand @@ -533,7 +534,7 @@ expandCommand ; mvcombineCommand - : MVCOMBINE (DELIM EQUAL stringLiteral)? fieldExpression + : MVCOMBINE fieldExpression (DELIM EQUAL stringLiteral)? 
; diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java index a1e31e896dc..1ffab94b4a1 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizer.java @@ -82,6 +82,7 @@ import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.ast.tree.MinSpanBin; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; @@ -463,6 +464,14 @@ public String visitExpand(Expand node, String context) { return StringUtils.format("%s | expand %s", child, field); } + @Override + public String visitMvCombine(MvCombine node, String context) { + String child = node.getChild().getFirst().accept(this, context); + String field = visitExpression(node.getField()); + + return StringUtils.format("%s | mvcombine delim=%s %s", child, MASK_LITERAL, field); + } + /** Build {@link LogicalSort}. */ @Override public String visitSort(Sort node, String context) { diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java index a094eeffdd7..524f5f7972c 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -90,6 +90,15 @@ public void testMvCombineBasic() { + " LogicalFilter(condition=[=($0, 'basic')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" + + " ARRAY< INTEGER >) `packets`\n" + + "FROM `scott`.`MVCOMBINE_DATA`\n" + + "WHERE `case` = 'basic'\n" + + "GROUP BY `case`, `ip`\n" + + "ORDER BY `ip`"; + verifyPPLToSparkSQL(root, expectedSparkSql); } @Test @@ -111,6 +120,15 @@ public void testMvCombineWithNullTargetValues() { + " LogicalFilter(condition=[=($0, 'nulls')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" + + " ARRAY< INTEGER >) `packets`\n" + + "FROM `scott`.`MVCOMBINE_DATA`\n" + + "WHERE `case` = 'nulls'\n" + + "GROUP BY `case`, `ip`\n" + + "ORDER BY `ip`"; + verifyPPLToSparkSQL(root, expectedSparkSql); } @Test @@ -119,7 +137,7 @@ public void testMvCombineWithDelimOption_SplunkSyntaxOrder() { "source=MVCOMBINE_DATA " + "| where case = \"basic\" " + "| fields case, ip, packets " - + "| mvcombine delim=\"|\" packets " + + "| mvcombine packets delim='|' " + "| sort ip"; RelNode root = getRelNode(ppl); @@ -132,6 +150,15 @@ public void testMvCombineWithDelimOption_SplunkSyntaxOrder() { + " LogicalFilter(condition=[=($0, 'basic')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" + + " ARRAY< INTEGER >) `packets`\n" + + "FROM `scott`.`MVCOMBINE_DATA`\n" + + "WHERE `case` = 'basic'\n" + + "GROUP BY `case`, `ip`\n" + + "ORDER BY `ip`"; + verifyPPLToSparkSQL(root, expectedSparkSql); } @Test diff --git 
a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java index 2fd08988f6b..23e40ac0487 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/PPLQueryDataAnonymizerTest.java @@ -1010,4 +1010,17 @@ public void testMvfind() { "source=t | eval result=mvfind(array('apple', 'banana', 'apricot'), 'ban.*') | fields" + " result")); } + + @Test + public void testMvcombineCommand() { + assertEquals( + "source=table | mvcombine delim=*** identifier", anonymize("source=t | mvcombine age")); + } + + @Test + public void testMvcombineCommandWithDelim() { + assertEquals( + "source=table | mvcombine delim=*** identifier", + anonymize("source=t | mvcombine age delim=','")); + } } From 612eaa796749588af3821e67b867b12ba69cc8d9 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Fri, 16 Jan 2026 10:25:25 -0600 Subject: [PATCH 20/32] spotlessApply Signed-off-by: Srikanth Padakanti --- .../org/opensearch/sql/calcite/remote/CalciteExplainIT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java index b38a44b3da9..6c21d223d97 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteExplainIT.java @@ -2444,8 +2444,8 @@ public void testAggFilterOnNestedFields() throws IOException { "source=%s | stats count(eval(author.name < 'K')) as george_and_jk", TEST_INDEX_CASCADED_NESTED))); } - - @Test + + @Test public void testExplainMvCombine() throws IOException { String query = "source=opensearch-sql_test_index_account " From 22b08134e950d197206d041cf0ac4c28fdc88e76 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 22 Jan 2026 11:01:12 -0600 Subject: [PATCH 21/32] Add visitMvCombine method to the FieldResolutionVisitor Signed-off-by: Srikanth Padakanti --- .../sql/ast/analysis/FieldResolutionVisitor.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java index eff567ea498..a6b96d202b8 100644 --- a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java @@ -44,6 +44,7 @@ import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Lookup; import org.opensearch.sql.ast.tree.Multisearch; +import org.opensearch.sql.ast.tree.MvCombine; import org.opensearch.sql.ast.tree.Parse; import org.opensearch.sql.ast.tree.Patterns; import org.opensearch.sql.ast.tree.Project; @@ -605,7 +606,17 @@ public Node visitExpand(Expand node, FieldResolutionContext context) { return node; } - private Set extractFieldsFromAggregation(UnresolvedExpression expr) { + @Override + public Node visitMvCombine(MvCombine node, FieldResolutionContext context) { + Set mvCombineFields = extractFieldsFromExpression(node.getField()); + context.pushRequirements(context.getCurrentRequirements().or(mvCombineFields)); + visitChildren(node, context); + context.popRequirements(); + return node; + } + + + private Set extractFieldsFromAggregation(UnresolvedExpression 
expr) { Set fields = new HashSet<>(); if (expr instanceof Alias alias) { return extractFieldsFromAggregation(alias.getDelegated()); From 57cce16711480465b1f9d4ea3c2792d362a45c3b Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Thu, 22 Jan 2026 14:46:16 -0600 Subject: [PATCH 22/32] Apply spotlesscheck Signed-off-by: Srikanth Padakanti --- .../ast/analysis/FieldResolutionVisitor.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java index a6b96d202b8..bd1b3b3072f 100644 --- a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java @@ -606,17 +606,16 @@ public Node visitExpand(Expand node, FieldResolutionContext context) { return node; } - @Override - public Node visitMvCombine(MvCombine node, FieldResolutionContext context) { - Set mvCombineFields = extractFieldsFromExpression(node.getField()); - context.pushRequirements(context.getCurrentRequirements().or(mvCombineFields)); - visitChildren(node, context); - context.popRequirements(); - return node; - } - + @Override + public Node visitMvCombine(MvCombine node, FieldResolutionContext context) { + Set mvCombineFields = extractFieldsFromExpression(node.getField()); + context.pushRequirements(context.getCurrentRequirements().or(mvCombineFields)); + visitChildren(node, context); + context.popRequirements(); + return node; + } - private Set extractFieldsFromAggregation(UnresolvedExpression expr) { + private Set extractFieldsFromAggregation(UnresolvedExpression expr) { Set fields = new HashSet<>(); if (expr instanceof Alias alias) { return extractFieldsFromAggregation(alias.getDelegated()); From d47c8cfc9053866237b7167fa6b48155fd34dfc8 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Mon, 26 Jan 2026 12:22:02 -0600 Subject: [PATCH 23/32] Add changes to exclude the metadata fields and remove the CAST logic Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 88 ++++++++++--------- .../calcite/explain_mvcombine.yaml | 11 ++- .../explain_mvcombine.yaml | 11 ++- .../ppl/calcite/CalcitePPLMvCombineTest.java | 29 +++--- 4 files changed, 69 insertions(+), 70 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index e521eabc158..0684ef8c76c 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -53,7 +53,6 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFamily; import org.apache.calcite.rel.type.RelDataTypeField; -import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexCorrelVariable; import org.apache.calcite.rex.RexInputRef; @@ -62,6 +61,7 @@ import org.apache.calcite.rex.RexVisitorImpl; import org.apache.calcite.rex.RexWindowBounds; import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.fun.SqlLibraryOperators; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.ArraySqlType; import org.apache.calcite.sql.type.MapSqlType; @@ -3089,15 +3089,15 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { } /** 
- * mvcombine command visitor to collapse rows that are identical on all fields except the target
- * field, and combine the target field values into a multivalue (array) field.
+ * mvcombine command visitor to collapse rows that are identical on all non-target, non-metadata
+ * fields, and combine the target field values into a multivalue (array) field.
 *

- * Implementation notes: Groups by all input fields except the target
- * values using {@code COLLECT} (MULTISET). Casts the aggregation result from MULTISET to ARRAY
- * for a stable multivalue output type. Preserves the original output column order.
+ *

Implementation notes: Groups by all non-target, non-metadata fields. Aggregates target + * values using {@code ARRAY_AGG}, producing a multivalue {@code ARRAY} directly. Preserves the + * original output column order. * * @param node mvcombine command to be visited - * @param context CalcitePlanContext containing the RelBuilder, RexBuilder, and resolution context + * @param context CalcitePlanContext containing the RelBuilder and resolution context * @return RelNode representing collapsed records with the target combined into a multivalue array * @throws SemanticCheckException if the mvcombine target is not a direct field reference */ @@ -3107,7 +3107,6 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { visitChildren(node, context); final RelBuilder relBuilder = context.relBuilder; - final RexBuilder rexBuilder = context.rexBuilder; final RelNode input = relBuilder.peek(); final List inputFieldNames = input.getRowType().getFieldNames(); @@ -3117,16 +3116,18 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final int targetIndex = resolveTargetIndex(targetField, context); final String targetName = inputFieldNames.get(targetIndex); - // 3) Group by all fields except the target. + // 3) Group by all non-target, non-metadata fields. final List groupExprs = buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); - // 4) Aggregate target values using COLLECT, filtering out NULLs. - performCollectAggregation(relBuilder, targetIndex, targetName, groupExprs); + // If all remaining fields are metadata or the target itself, mvcombine degenerates to a global + // combine. This is intentional: result collapses to a single row with aggregated target values. - // 5) Restore original output column order, and cast COLLECT's MULTISET output to ARRAY. - restoreColumnOrderWithArrayCast( - relBuilder, rexBuilder, input, inputFieldNames, targetIndex, groupExprs); + // 4) Aggregate target values using ARRAY_AGG, filtering out NULLs. + performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); + + // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). + restoreColumnOrderAfterArrayAgg(relBuilder, inputFieldNames, targetIndex, groupExprs); return relBuilder.peek(); } @@ -3168,7 +3169,7 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { } /** - * Builds group-by expressions for mvcombine: all input fields except the target field. + * Builds group-by expressions for mvcombine: all non-target, non-metadata input fields. * * @param targetIndex Input index of the mvcombine target field * @param inputFieldNames Input schema field names (for sizing/ordering) @@ -3182,6 +3183,12 @@ private List buildGroupExpressionsExcludingTarget( if (i == targetIndex) { continue; } + final String fieldName = inputFieldNames.get(i); + + if (isMetadataField(fieldName)) { + continue; + } + groupExprs.add(relBuilder.field(i)); } return groupExprs; @@ -3190,19 +3197,18 @@ private List buildGroupExpressionsExcludingTarget( /** * Applies mvcombine aggregation: * - *

GROUP BY all non-target fields, and aggregate target values using {@code COLLECT}. {@code
- * COLLECT} produces a MULTISET in Calcite, which we later cast to ARRAY for output.

+ * GROUP BY all non-target fields (excluding metadata fields), and aggregate target values
+ * using {@code ARRAY_AGG}. {@code ARRAY_AGG} produces an {@code ARRAY} in Calcite, which we
+ * keep as-is for output.
 *

- * NULL target values are excluded from the collected multivalue list by applying an aggregate
- * filter. This matches typical "combine values" semantics and avoids polluting the result with
- * NULL elements.
+ *

NULL target values are excluded from the aggregated array by applying an aggregate filter. * * @param relBuilder RelBuilder positioned on the input * @param targetIndex Target field input index * @param targetName Target field output name (preserved) - * @param groupExprs Group-by expressions (all fields except target) + * @param groupExprs Group-by expressions (non-target, non-metadata fields) */ - private void performCollectAggregation( + private void performArrayAggAggregation( RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { final RexNode targetRef = relBuilder.field(targetIndex); @@ -3210,7 +3216,7 @@ private void performCollectAggregation( final RelBuilder.AggCall aggCall = relBuilder - .aggregateCall(SqlStdOperatorTable.COLLECT, targetRef) + .aggregateCall(SqlLibraryOperators.ARRAY_AGG, targetRef) .filter(notNullTarget) .as(targetName); @@ -3218,51 +3224,49 @@ private void performCollectAggregation( } /** - * Restores the original output column order after the aggregate step and converts the collected - * target from MULTISET to ARRAY<T>. + * Restores the original output column order after the aggregate step. * *

After aggregation, the schema is: * *

-   *   [groupField0, groupField1, ..., groupFieldN, targetAggMultiset]
+   *   [groupField0, groupField1, ..., groupFieldN, targetAggArray]
    * 
* *

This method projects fields back to the original input order, replacing the original target - * slot with {@code CAST(targetAggMultiset AS ARRAY<T>)}. + * slot with the aggregated target value. Metadata fields that were excluded from the group key + * are preserved in the output schema but projected as {@code NULL}, since they do not exist in + * the post-aggregate row. + * + *

Since {@code ARRAY_AGG} already returns {@code ARRAY}, no cast is needed. * * @param relBuilder RelBuilder positioned on the post-aggregate node - * @param rexBuilder RexBuilder for explicit casts - * @param input Original input RelNode (used to derive the target element type) * @param inputFieldNames Original input field names (also output field names) * @param targetIndex Target field index in the original input * @param groupExprs Group-by expressions used during aggregation */ - private void restoreColumnOrderWithArrayCast( + private void restoreColumnOrderAfterArrayAgg( RelBuilder relBuilder, - RexBuilder rexBuilder, - RelNode input, List inputFieldNames, int targetIndex, List groupExprs) { - // Post-aggregate: group fields come first, and the collected target is appended at the end. - final int collectedTargetPos = groupExprs.size(); - - final RelDataType targetElemType = input.getRowType().getFieldList().get(targetIndex).getType(); - final RelDataType targetArrayType = - relBuilder.getTypeFactory().createArrayType(targetElemType, -1); + // Post-aggregate: group fields come first, and the aggregated target is appended at the end. + final int aggregatedTargetPos = groupExprs.size(); final List projections = new ArrayList<>(inputFieldNames.size()); final List projectionNames = new ArrayList<>(inputFieldNames.size()); int groupPos = 0; for (int i = 0; i < inputFieldNames.size(); i++) { - projectionNames.add(inputFieldNames.get(i)); + final String fieldName = inputFieldNames.get(i); + + projectionNames.add(fieldName); if (i == targetIndex) { - // COLLECT returns MULTISET; normalize output to ARRAY. - final RexNode multisetRef = relBuilder.field(collectedTargetPos); - projections.add(rexBuilder.makeCast(targetArrayType, multisetRef)); + // ARRAY_AGG already returns ARRAY + projections.add(relBuilder.field(aggregatedTargetPos)); + } else if (isMetadataField(fieldName)) { + projections.add(relBuilder.literal(null)); } else { projections.add(relBuilder.field(groupPos)); groupPos++; diff --git a/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml b/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml index 768fa9ef384..ff54f066772 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite/explain_mvcombine.yaml @@ -1,13 +1,12 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) - LogicalProject(state=[$0], city=[$1], age=[CAST($2):BIGINT ARRAY NOT NULL]) - LogicalAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + LogicalProject(state=[$0], city=[$1], age=[$2]) + LogicalAggregate(group=[{0, 1}], age=[ARRAY_AGG($2) FILTER $3]) LogicalProject(state=[$7], city=[$5], age=[$8], $f3=[IS NOT NULL($8)]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) - EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t2):BIGINT ARRAY NOT NULL], proj#0..1=[{exprs}], age=[$t3]) - EnumerableAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) - EnumerableCalc(expr#0..2=[{inputs}], expr#3=[IS NOT NULL($t2)], proj#0..3=[{exprs}]) - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[state, city, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["state","city","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) + EnumerableAggregate(group=[{0, 
1}], age=[ARRAY_AGG($2) FILTER $3]) + EnumerableCalc(expr#0..2=[{inputs}], expr#3=[IS NOT NULL($t2)], proj#0..3=[{exprs}]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]], PushDownContext=[[PROJECT->[state, city, age]], OpenSearchRequestBuilder(sourceBuilder={"from":0,"timeout":"1m","_source":{"includes":["state","city","age"],"excludes":[]}}, requestedTotalSize=2147483647, pageSize=null, startFrom=0)]) diff --git a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml index 29a8aa3deac..35f2d79e7c8 100644 --- a/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml +++ b/integ-test/src/test/resources/expectedOutput/calcite_no_pushdown/explain_mvcombine.yaml @@ -1,13 +1,12 @@ calcite: logical: | LogicalSystemLimit(fetch=[10000], type=[QUERY_SIZE_LIMIT]) - LogicalProject(state=[$0], city=[$1], age=[CAST($2):BIGINT ARRAY NOT NULL]) - LogicalAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) + LogicalProject(state=[$0], city=[$1], age=[$2]) + LogicalAggregate(group=[{0, 1}], age=[ARRAY_AGG($2) FILTER $3]) LogicalProject(state=[$7], city=[$5], age=[$8], $f3=[IS NOT NULL($8)]) CalciteLogicalIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) physical: | EnumerableLimit(fetch=[10000]) - EnumerableCalc(expr#0..2=[{inputs}], expr#3=[CAST($t2):BIGINT ARRAY NOT NULL], proj#0..1=[{exprs}], age=[$t3]) - EnumerableAggregate(group=[{0, 1}], age=[COLLECT($2) FILTER $3]) - EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NOT NULL($t8)], state=[$t7], city=[$t5], age=[$t8], $f3=[$t17]) - CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) + EnumerableAggregate(group=[{0, 1}], age=[ARRAY_AGG($2) FILTER $3]) + EnumerableCalc(expr#0..16=[{inputs}], expr#17=[IS NOT NULL($t8)], state=[$t7], city=[$t5], age=[$t8], $f3=[$t17]) + CalciteEnumerableIndexScan(table=[[OpenSearch, opensearch-sql_test_index_account]]) diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java index 524f5f7972c..6e6460a2365 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLMvCombineTest.java @@ -84,16 +84,15 @@ public void testMvCombineBasic() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[ARRAY_AGG($2) FILTER $3])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + " LogicalFilter(condition=[=($0, 'basic')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" - + " ARRAY< INTEGER >) `packets`\n" + "SELECT `case`, `ip`, ARRAY_AGG(`packets`) FILTER (WHERE `packets` IS NOT NULL) `packets`\n" + "FROM `scott`.`MVCOMBINE_DATA`\n" + "WHERE `case` = 'basic'\n" + "GROUP BY `case`, `ip`\n" @@ -114,16 +113,15 @@ public void testMvCombineWithNullTargetValues() { String expectedLogical = 
"LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[ARRAY_AGG($2) FILTER $3])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + " LogicalFilter(condition=[=($0, 'nulls')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" - + " ARRAY< INTEGER >) `packets`\n" + "SELECT `case`, `ip`, ARRAY_AGG(`packets`) FILTER (WHERE `packets` IS NOT NULL) `packets`\n" + "FROM `scott`.`MVCOMBINE_DATA`\n" + "WHERE `case` = 'nulls'\n" + "GROUP BY `case`, `ip`\n" @@ -144,16 +142,15 @@ public void testMvCombineWithDelimOption_SplunkSyntaxOrder() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[ARRAY_AGG($2) FILTER $3])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + " LogicalFilter(condition=[=($0, 'basic')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `case`, `ip`, CAST(COLLECT(`packets`) FILTER (WHERE `packets` IS NOT NULL) AS" - + " ARRAY< INTEGER >) `packets`\n" + "SELECT `case`, `ip`, ARRAY_AGG(`packets`) FILTER (WHERE `packets` IS NOT NULL) `packets`\n" + "FROM `scott`.`MVCOMBINE_DATA`\n" + "WHERE `case` = 'basic'\n" + "GROUP BY `case`, `ip`\n" @@ -190,8 +187,8 @@ public void testMvCombineSingleRow() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[ARRAY_AGG($2) FILTER $3])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + " LogicalFilter(condition=[=($0, 'single')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; @@ -211,8 +208,8 @@ public void testMvCombineEmptyResult() { String expectedLogical = "LogicalSort(sort0=[$1], dir0=[ASC-nulls-first])\n" - + " LogicalProject(case=[$0], ip=[$1], packets=[CAST($2):INTEGER ARRAY NOT NULL])\n" - + " LogicalAggregate(group=[{0, 1}], packets=[COLLECT($2) FILTER $3])\n" + + " LogicalProject(case=[$0], ip=[$1], packets=[$2])\n" + + " LogicalAggregate(group=[{0, 1}], packets=[ARRAY_AGG($2) FILTER $3])\n" + " LogicalProject(case=[$0], ip=[$1], packets=[$2], $f3=[IS NOT NULL($2)])\n" + " LogicalFilter(condition=[=($0, 'no_such_case')])\n" + " LogicalTableScan(table=[[scott, MVCOMBINE_DATA]])\n"; From 62478052a298962b7edb93d9a49039bf5b5c7fed Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 01:47:57 -0600 Subject: [PATCH 24/32] Address CrossClusterSearchIT comment Signed-off-by: Srikanth Padakanti --- .../sql/security/CrossClusterSearchIT.java | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git 
a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java index 767fd8c9aa8..606bc835158 100644 --- a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java @@ -293,38 +293,42 @@ public void testCrossClusterTranspose() throws IOException { @Test public void testCrossClusterAppend() throws IOException { // TODO: We should enable calcite by default in CrossClusterSearchIT? - enableCalcite(); - - JSONObject result = - executeQuery( - String.format( - "search source=%s | stats count() as cnt by gender | append [ search source=%s |" - + " stats count() as cnt ]", - TEST_INDEX_BANK_REMOTE, TEST_INDEX_BANK_REMOTE)); - verifyDataRows(result, rows(3, "F"), rows(4, "M"), rows(7, null)); - - disableCalcite(); + try { + enableCalcite(); + + JSONObject result = + executeQuery( + String.format( + "search source=%s | stats count() as cnt by gender | append [ search source=%s |" + + " stats count() as cnt ]", + TEST_INDEX_BANK_REMOTE, TEST_INDEX_BANK_REMOTE)); + verifyDataRows(result, rows(3, "F"), rows(4, "M"), rows(7, null)); + } finally { + disableCalcite(); + } } /** CrossClusterSearchIT Test for mvcombine. */ @Test public void testCrossClusterMvcombine() throws IOException { - enableCalcite(); - - JSONObject result = - executeQuery( - String.format( - "search source=%s | where firstname='Hattie' or firstname='Nanette' " - + "| fields firstname, age | mvcombine age", - TEST_INDEX_BANK_REMOTE)); - - verifyColumn(result, columnName("firstname"), columnName("age")); - - verifyDataRows( - result, - rows("Hattie", new org.json.JSONArray().put(36)), - rows("Nanette", new org.json.JSONArray().put(28))); - - disableCalcite(); + try { + enableCalcite(); + + JSONObject result = + executeQuery( + String.format( + "search source=%s | where firstname='Hattie' or firstname='Nanette' " + + "| fields firstname, age | mvcombine age", + TEST_INDEX_BANK_REMOTE)); + + verifyColumn(result, columnName("firstname"), columnName("age")); + + verifyDataRows( + result, + rows("Hattie", new org.json.JSONArray().put(36)), + rows("Nanette", new org.json.JSONArray().put(28))); + } finally { + disableCalcite(); + } } } From 2c48f1aaf7e7f5b10acba7b0a6a7d0d189624bde Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 02:04:22 -0600 Subject: [PATCH 25/32] Address CrossClusterSearchIT comment Signed-off-by: Srikanth Padakanti --- .../sql/security/CrossClusterSearchIT.java | 53 ++++++++----------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java index 606bc835158..df658e52d1e 100644 --- a/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/security/CrossClusterSearchIT.java @@ -293,42 +293,31 @@ public void testCrossClusterTranspose() throws IOException { @Test public void testCrossClusterAppend() throws IOException { // TODO: We should enable calcite by default in CrossClusterSearchIT? 
- try { - enableCalcite(); - - JSONObject result = - executeQuery( - String.format( - "search source=%s | stats count() as cnt by gender | append [ search source=%s |" - + " stats count() as cnt ]", - TEST_INDEX_BANK_REMOTE, TEST_INDEX_BANK_REMOTE)); - verifyDataRows(result, rows(3, "F"), rows(4, "M"), rows(7, null)); - } finally { - disableCalcite(); - } + JSONObject result = + executeQuery( + String.format( + "search source=%s | stats count() as cnt by gender | append [ search source=%s |" + + " stats count() as cnt ]", + TEST_INDEX_BANK_REMOTE, TEST_INDEX_BANK_REMOTE)); + verifyDataRows(result, rows(3, "F"), rows(4, "M"), rows(7, null)); } /** CrossClusterSearchIT Test for mvcombine. */ @Test public void testCrossClusterMvcombine() throws IOException { - try { - enableCalcite(); - - JSONObject result = - executeQuery( - String.format( - "search source=%s | where firstname='Hattie' or firstname='Nanette' " - + "| fields firstname, age | mvcombine age", - TEST_INDEX_BANK_REMOTE)); - - verifyColumn(result, columnName("firstname"), columnName("age")); - - verifyDataRows( - result, - rows("Hattie", new org.json.JSONArray().put(36)), - rows("Nanette", new org.json.JSONArray().put(28))); - } finally { - disableCalcite(); - } + + JSONObject result = + executeQuery( + String.format( + "search source=%s | where firstname='Hattie' or firstname='Nanette' " + + "| fields firstname, age | mvcombine age", + TEST_INDEX_BANK_REMOTE)); + + verifyColumn(result, columnName("firstname"), columnName("age")); + + verifyDataRows( + result, + rows("Hattie", new org.json.JSONArray().put(36)), + rows("Nanette", new org.json.JSONArray().put(28))); } } From d3ad0e9439d93a2e702e49c920baf24cc3add612 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 12:08:34 -0600 Subject: [PATCH 26/32] Address CrossClusterSearchIT comment Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 33 +++--- docs/user/ppl/cmd/mvcombine.md | 21 +--- .../remote/CalciteMvCombineCommandIT.java | 100 ++++++++---------- 3 files changed, 68 insertions(+), 86 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index b387be71a07..fb14e364793 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3197,10 +3197,12 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final Field targetField = node.getField(); final int targetIndex = resolveTargetIndex(targetField, context); final String targetName = inputFieldNames.get(targetIndex); + final boolean includeMetaFields = context.isProjectVisited(); // 3) Group by all non-target, non-metadata fields. final List groupExprs = - buildGroupExpressionsExcludingTarget(targetIndex, inputFieldNames, relBuilder); + buildGroupExpressionsExcludingTarget( + targetIndex, inputFieldNames, relBuilder, includeMetaFields); // If all remaining fields are metadata or the target itself, mvcombine degenerates to a global // combine. This is intentional: result collapses to a single row with aggregated target values. @@ -3209,7 +3211,8 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). 
- restoreColumnOrderAfterArrayAgg(relBuilder, inputFieldNames, targetIndex, groupExprs); + restoreColumnOrderAfterArrayAgg( + relBuilder, inputFieldNames, targetIndex, groupExprs, includeMetaFields); return relBuilder.peek(); } @@ -3231,7 +3234,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { */ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { final RexNode targetRex = rexVisitor.analyze(targetField, context); - if (!(targetRex instanceof RexInputRef)) { + if (!isInputRef(targetRex)) { throw new SemanticCheckException( "mvcombine target must be a direct field reference, but got: " + targetField); } @@ -3241,8 +3244,7 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { final RelDataType fieldType = context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); - if (fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.ARRAY - || fieldType.getSqlTypeName() == org.apache.calcite.sql.type.SqlTypeName.MULTISET) { + if (SqlTypeUtil.isArray(fieldType) || SqlTypeUtil.isMultiset(fieldType)) { throw new SemanticCheckException( "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); } @@ -3259,7 +3261,10 @@ private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { * @return Group-by expressions in input order excluding the target */ private List buildGroupExpressionsExcludingTarget( - int targetIndex, List inputFieldNames, RelBuilder relBuilder) { + int targetIndex, + List inputFieldNames, + RelBuilder relBuilder, + boolean includeMetaFields) { final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); for (int i = 0; i < inputFieldNames.size(); i++) { if (i == targetIndex) { @@ -3267,9 +3272,7 @@ private List buildGroupExpressionsExcludingTarget( } final String fieldName = inputFieldNames.get(i); - if (isMetadataField(fieldName)) { - continue; - } + if (isMetadataField(fieldName) && !includeMetaFields) continue; groupExprs.add(relBuilder.field(i)); } @@ -3330,7 +3333,8 @@ private void restoreColumnOrderAfterArrayAgg( RelBuilder relBuilder, List inputFieldNames, int targetIndex, - List groupExprs) { + List groupExprs, + boolean includeMetaFields) { // Post-aggregate: group fields come first, and the aggregated target is appended at the end. final int aggregatedTargetPos = groupExprs.size(); @@ -3341,14 +3345,19 @@ private void restoreColumnOrderAfterArrayAgg( int groupPos = 0; for (int i = 0; i < inputFieldNames.size(); i++) { final String fieldName = inputFieldNames.get(i); - projectionNames.add(fieldName); if (i == targetIndex) { // ARRAY_AGG already returns ARRAY projections.add(relBuilder.field(aggregatedTargetPos)); } else if (isMetadataField(fieldName)) { - projections.add(relBuilder.literal(null)); + // Metadata fields are intentionally not grouped by mvcombine. + // Preserve schema correctness by projecting a TYPED NULL + // (prevents "undefined type" for fields like _id when explicitly selected). 
+ projections.add( + relBuilder + .getRexBuilder() + .makeNullLiteral(relBuilder.peek().getRowType().getFieldList().get(i).getType())); } else { projections.add(relBuilder.field(groupPos)); groupPos++; diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md index e816de88d17..e50a2d2c1f1 100644 --- a/docs/user/ppl/cmd/mvcombine.md +++ b/docs/user/ppl/cmd/mvcombine.md @@ -10,38 +10,21 @@ The `mvcombine` command groups rows that are identical across all fields except - Rows are grouped by **all fields currently in the pipeline except the target field**. - One output row is produced per group. -- The target field becomes a **multivalue field** containing the combined values from all rows in the group. +- The target field is **replaced** with a multivalue (array) field that contains all non-null values of the target field from the grouped rows. - Rows where the target field is missing or null do **not** contribute a value to the combined multivalue output. - The default output is a multivalue representation (array). -Delimiter handling (`delim`) is parsed but has **no effect on the output** unless the `nomv` command is used. Since `nomv` is not yet implemented in OpenSearch, `delim` is currently inert and does not affect execution. - --- ## Syntax -mvcombine [delim=] +mvcombine ### Arguments - **field** (required) The name of the field whose values are combined into a multivalue field. -- **delim** (optional) - A string delimiter for rendering a single-value representation of the combined field. - This option has no observable effect unless `nomv` is used. - ---- - -## Semantics - -Given a set of input rows: - -- All rows that have identical values for every field **except** the target field are grouped together. -- The target field must be a single-valued (scalar) field. -- The output schema preserves the original field order. -- The target field is returned as a multivalue (array) field. 
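A small worked example of the intended behavior (hypothetical data and field names, not taken
from the shipped documentation): given three events that differ only in `code`,

    host=web-01 code=200
    host=web-01 code=404
    host=web-02 code=200

the query `source=logs | fields host, code | mvcombine code` collapses them into two rows,
host=web-01 with code=[200, 404] and host=web-02 with code=[200]; rows whose `code` is null or
missing contribute nothing to the combined array.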
- --- ## Example 1: Basic mvcombine diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 7f2ab06752f..10a4f72003b 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -58,22 +58,21 @@ public void testMvCombine_basicGroupCollapsesToOneRow() throws IOException { + " | mvcombine packets_str"; JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); - JSONArray row = result.getJSONArray("datarows").getJSONArray(0); - Assertions.assertEquals("10.0.0.1", row.getString(0)); - Assertions.assertEquals("100", String.valueOf(row.get(1))); - Assertions.assertEquals("t1", row.getString(2)); + verifySchema( + result, + schema("ip", null, "string"), + schema("bytes", null, "bigint"), + schema("tags", null, "string"), + schema("packets_str", null, "array")); - List mv = toStringListDropNulls(row.get(3)); - Assertions.assertTrue(mv.contains("10"), "Expected packets_str to include 10, got " + mv); - Assertions.assertTrue(mv.contains("20"), "Expected packets_str to include 20, got " + mv); - Assertions.assertTrue(mv.contains("30"), "Expected packets_str to include 30, got " + mv); + verifyDataRows(result, rows("10.0.0.1", 100, "t1", List.of("10", "20", "30"))); } @Test public void testMvCombine_singleRowGroupStaysSingleRow() throws IOException { - // NOTE: Keep output minimal + deterministic to safely verify schema + datarows String q = "source=" + INDEX @@ -103,12 +102,17 @@ public void testMvCombine_missingTargetWithinGroup_collapses_nonNullPreserved() + " | mvcombine packets_str"; JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); - JSONArray row = result.getJSONArray("datarows").getJSONArray(0); - List mv = toStringListKeepNulls(row.get(3)); + verifySchema( + result, + schema("ip", null, "string"), + schema("bytes", null, "bigint"), + schema("tags", null, "string"), + schema("packets_str", null, "array")); - Assertions.assertTrue(mv.contains("5"), "Expected packets_str to include 5, got " + mv); + verifyDataRows(result, rows("10.0.0.3", 300, "t3", List.of("5"))); } // --------------------------- @@ -123,38 +127,26 @@ public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOEx + " | where (ip='10.0.0.7' or ip='10.0.0.8') and bytes=700 and tags='t7'" + " | fields ip, bytes, tags, packets_str"; - JSONObject before = executeQuery(base); - int beforeRows = before.getJSONArray("datarows").length(); - Assertions.assertTrue(beforeRows >= 1, "Expected dataset rows for multi-group test, got 0"); - JSONObject result = executeQuery(base + " | mvcombine packets_str | sort ip"); - int outRows = result.getJSONArray("datarows").length(); - Assertions.assertEquals( - 2, outRows, "Expected 2 groups (10.0.0.7 and 10.0.0.8), got " + outRows); + verifyNumOfRows(result, 2); + + verifySchema( + result, + schema("ip", null, "string"), + schema("bytes", null, "bigint"), + schema("tags", null, "string"), + schema("packets_str", null, "array")); + + // MV contents differ per group → helper cannot express membership safely JSONArray r0 = result.getJSONArray("datarows").getJSONArray(0); JSONArray r1 = result.getJSONArray("datarows").getJSONArray(1); - String ip0 = r0.getString(0); - String ip1 = r1.getString(0); - - if ("10.0.0.7".equals(ip0)) { - List mv0 = toStringListDropNulls(r0.get(3)); - 
Assertions.assertTrue( - mv0.contains("1") && mv0.contains("2"), - "Expected 10.0.0.7 to include 1 and 2, got " + mv0); - - List mv1 = toStringListDropNulls(r1.get(3)); - Assertions.assertTrue(mv1.contains("9"), "Expected 10.0.0.8 to include 9, got " + mv1); - } else { - List mv0 = toStringListDropNulls(r0.get(3)); - Assertions.assertTrue(mv0.contains("9"), "Expected 10.0.0.8 to include 9, got " + mv0); - - List mv1 = toStringListDropNulls(r1.get(3)); - Assertions.assertTrue( - mv1.contains("1") && mv1.contains("2"), - "Expected 10.0.0.7 to include 1 and 2, got " + mv1); - } + List mv0 = toStringListDropNulls(r0.get(3)); + List mv1 = toStringListDropNulls(r1.get(3)); + + Assertions.assertTrue((mv0.contains("1") && mv0.contains("2")) || mv0.contains("9")); + Assertions.assertTrue((mv1.contains("1") && mv1.contains("2")) || mv1.contains("9")); } // --------------------------- @@ -170,27 +162,29 @@ public void testMvCombine_delim_shouldNotChangeMvShape_ifSupported_elseSyntaxRej + " | where ip='10.0.0.9' and bytes=900 and tags='t9'" + " | fields ip, bytes, tags, packets_str"; - // Splunk-style: options before the field String q = base + " | mvcombine delim='|' packets_str"; try { JSONObject result = executeQuery(q); + verifyNumOfRows(result, 1); + verifySchema( + result, + schema("ip", null, "string"), + schema("bytes", null, "bigint"), + schema("tags", null, "string"), + schema("packets_str", null, "array")); + Object cell = result.getJSONArray("datarows").getJSONArray(0).get(3); - Assertions.assertTrue( - cell instanceof JSONArray, - "Expected multivalue array (delim should not coerce to string), got: " + cell); + Assertions.assertTrue(cell instanceof JSONArray); - // Optional sanity: values exist (order not guaranteed) List mv = toStringListDropNulls(cell); - Assertions.assertTrue(mv.contains("1"), "Expected MV to include 1, got: " + mv); - Assertions.assertTrue(mv.contains("2"), "Expected MV to include 2, got: " + mv); - Assertions.assertTrue(mv.contains("3"), "Expected MV to include 3, got: " + mv); + Assertions.assertTrue(mv.contains("1")); + Assertions.assertTrue(mv.contains("2")); + Assertions.assertTrue(mv.contains("3")); } catch (ResponseException e) { - Assertions.assertTrue( - isSyntaxBadRequest(e), - "Expected syntax rejection if delim unsupported, got: " + e.getMessage()); + Assertions.assertTrue(isSyntaxBadRequest(e)); } } @@ -205,9 +199,7 @@ public void testMvCombine_missingField_shouldReturn4xx() throws IOException { Assertions.fail("Expected ResponseException was not thrown"); } catch (ResponseException e) { int status = e.getResponse().getStatusLine().getStatusCode(); - Assertions.assertTrue( - status >= 400 && status < 500, - "Expected 4xx for missing field, got " + status + " msg=" + e.getMessage()); + Assertions.assertTrue(status >= 400 && status < 500); } } @@ -228,7 +220,6 @@ private static boolean isSyntaxBadRequest(ResponseException e) { || msg.contains("ParseException"); } - /** JSONArray -> list (nulls preserved), scalar -> singleton list, null -> empty list. */ private static List toStringListKeepNulls(Object cell) { if (cell == null || cell == JSONObject.NULL) { return Collections.emptyList(); @@ -244,7 +235,6 @@ private static List toStringListKeepNulls(Object cell) { return List.of(String.valueOf(cell)); } - /** Same as above but drops null entries. 
*/ private static List toStringListDropNulls(Object cell) { List all = toStringListKeepNulls(cell); if (all.isEmpty()) return all; From d399b58c96c4d3dca2d2c8cb224b52fc338dee90 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 12:38:51 -0600 Subject: [PATCH 27/32] Coderrabbit issues Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 15 +++++----- .../remote/CalciteMvCombineCommandIT.java | 30 +++++++++++++------ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index fb14e364793..2f3265aa2eb 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3192,6 +3192,8 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { final RelNode input = relBuilder.peek(); final List inputFieldNames = input.getRowType().getFieldNames(); + final List inputFieldTypes = + input.getRowType().getFieldList().stream().map(RelDataTypeField::getType).toList(); // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). final Field targetField = node.getField(); @@ -3212,7 +3214,7 @@ public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). restoreColumnOrderAfterArrayAgg( - relBuilder, inputFieldNames, targetIndex, groupExprs, includeMetaFields); + relBuilder, inputFieldNames, inputFieldTypes, targetIndex, groupExprs, includeMetaFields); return relBuilder.peek(); } @@ -3332,6 +3334,7 @@ private void performArrayAggAggregation( private void restoreColumnOrderAfterArrayAgg( RelBuilder relBuilder, List inputFieldNames, + List inputFieldTypes, int targetIndex, List groupExprs, boolean includeMetaFields) { @@ -3350,14 +3353,10 @@ private void restoreColumnOrderAfterArrayAgg( if (i == targetIndex) { // ARRAY_AGG already returns ARRAY projections.add(relBuilder.field(aggregatedTargetPos)); - } else if (isMetadataField(fieldName)) { + } else if (isMetadataField(fieldName) && !includeMetaFields) { // Metadata fields are intentionally not grouped by mvcombine. - // Preserve schema correctness by projecting a TYPED NULL - // (prevents "undefined type" for fields like _id when explicitly selected). - projections.add( - relBuilder - .getRexBuilder() - .makeNullLiteral(relBuilder.peek().getRowType().getFieldList().get(i).getType())); + // Preserve schema correctness by projecting a TYPED NULL based on ORIGINAL input schema. 
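Editorial note on the change just below: the typed NULL has to be built from the original (pre-aggregate) row type because, once `relBuilder.aggregate(...)` has run, `relBuilder.peek()` exposes the post-aggregate row type, where the skipped metadata column no longer sits at index `i`. A minimal sketch of the idea, assuming Calcite's `RelBuilder`/`RexBuilder` APIs; the class and helper names below are illustrative and not part of the change:

```java
// Illustrative sketch (not part of the patch): capture the row type BEFORE aggregating,
// then build typed NULL literals from that snapshot. After relBuilder.aggregate(...),
// peek() returns the post-aggregate row type, so the original column index i would no
// longer line up with the field it used to name.
import java.util.List;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.rel.type.RelDataTypeField;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.tools.RelBuilder;

final class TypedNullSketch {

  /** Call before relBuilder.aggregate(...) so indices still match the input schema. */
  static List<RelDataType> snapshotInputTypes(RelBuilder relBuilder) {
    return relBuilder.peek().getRowType().getFieldList().stream()
        .map(RelDataTypeField::getType)
        .toList();
  }

  /** Typed NULL for input column i, taken from the pre-aggregate snapshot. */
  static RexNode typedNullFor(RelBuilder relBuilder, List<RelDataType> inputFieldTypes, int i) {
    return relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i));
  }
}
```

The `inputFieldTypes` list introduced in this commit plays that snapshot role.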
+ projections.add(relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i))); } else { projections.add(relBuilder.field(groupPos)); groupPos++; diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 10a4f72003b..683d0075cbf 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -145,8 +145,12 @@ public void testMvCombine_multipleGroups_producesOneRowPerGroupKey() throws IOEx List mv0 = toStringListDropNulls(r0.get(3)); List mv1 = toStringListDropNulls(r1.get(3)); - Assertions.assertTrue((mv0.contains("1") && mv0.contains("2")) || mv0.contains("9")); - Assertions.assertTrue((mv1.contains("1") && mv1.contains("2")) || mv1.contains("9")); + Assertions.assertEquals("10.0.0.7", r0.getString(0)); + Assertions.assertEquals("10.0.0.8", r1.getString(0)); + + Assertions.assertTrue(mv0.containsAll(List.of("1", "2"))); + Assertions.assertEquals(2, mv0.size()); + Assertions.assertEquals(List.of("9"), mv1); } // --------------------------- @@ -194,13 +198,21 @@ public void testMvCombine_delim_shouldNotChangeMvShape_ifSupported_elseSyntaxRej @Test public void testMvCombine_missingField_shouldReturn4xx() throws IOException { - try { - executeQuery("source=" + INDEX + " | mvcombine does_not_exist"); - Assertions.fail("Expected ResponseException was not thrown"); - } catch (ResponseException e) { - int status = e.getResponse().getStatusLine().getStatusCode(); - Assertions.assertTrue(status >= 400 && status < 500); - } + ResponseException ex = + Assertions.assertThrows( + ResponseException.class, + () -> executeQuery("source=" + INDEX + " | mvcombine does_not_exist")); + + Assertions.assertEquals(400, ex.getResponse().getStatusLine().getStatusCode()); + + String msg = ex.getMessage(); + // Keep these checks tight enough to catch regressions but not brittle on formatting. + Assertions.assertTrue( + msg.contains("\"type\":\"IllegalArgumentException\"") + || msg.contains("IllegalArgumentException"), + msg); + Assertions.assertTrue(msg.contains("Field [does_not_exist] not found"), msg); + Assertions.assertTrue(msg.contains("Invalid Query"), msg); } // --------------------------- From 71d96f957049abf3577e0d176fbe6eed3915e4ee Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 12:45:24 -0600 Subject: [PATCH 28/32] Coderrabbit issues Signed-off-by: Srikanth Padakanti --- .../calcite/remote/CalciteMvCombineCommandIT.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java index 683d0075cbf..1cf535bd71b 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalciteMvCombineCommandIT.java @@ -203,16 +203,12 @@ public void testMvCombine_missingField_shouldReturn4xx() throws IOException { ResponseException.class, () -> executeQuery("source=" + INDEX + " | mvcombine does_not_exist")); - Assertions.assertEquals(400, ex.getResponse().getStatusLine().getStatusCode()); + int status = ex.getResponse().getStatusLine().getStatusCode(); + + Assertions.assertEquals(400, status, "Unexpected status. 
ex=" + ex.getMessage()); String msg = ex.getMessage(); - // Keep these checks tight enough to catch regressions but not brittle on formatting. - Assertions.assertTrue( - msg.contains("\"type\":\"IllegalArgumentException\"") - || msg.contains("IllegalArgumentException"), - msg); - Assertions.assertTrue(msg.contains("Field [does_not_exist] not found"), msg); - Assertions.assertTrue(msg.contains("Invalid Query"), msg); + Assertions.assertTrue(msg.contains("Field [does_not_exist] not found."), msg); } // --------------------------- From 9012e46f7cb67c81559ee46b8d08cbf3ec62a3da Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 14:56:22 -0600 Subject: [PATCH 29/32] Coderrabbit issues Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 323 ++++++++---------- docs/user/ppl/cmd/mvcombine.md | 6 +- 2 files changed, 149 insertions(+), 180 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 2f3265aa2eb..411a58c39c7 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3170,204 +3170,173 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } - /** - * mvcombine command visitor to collapse rows that are identical on all non-target, non-metadata - * fields, and combine the target field values into a multivalue (array) field. - * - *

Implementation notes: Groups by all non-target, non-metadata fields. Aggregates target - * values using {@code ARRAY_AGG}, producing a multivalue {@code ARRAY} directly. Preserves the - * original output column order. - * - * @param node mvcombine command to be visited - * @param context CalcitePlanContext containing the RelBuilder and resolution context - * @return RelNode representing collapsed records with the target combined into a multivalue array - * @throws SemanticCheckException if the mvcombine target is not a direct field reference - */ - @Override - public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { - // 1) Lower the child plan first so the RelBuilder has the input schema on the stack. - visitChildren(node, context); - - final RelBuilder relBuilder = context.relBuilder; - - final RelNode input = relBuilder.peek(); - final List inputFieldNames = input.getRowType().getFieldNames(); - final List inputFieldTypes = - input.getRowType().getFieldList().stream().map(RelDataTypeField::getType).toList(); - - // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). - final Field targetField = node.getField(); - final int targetIndex = resolveTargetIndex(targetField, context); - final String targetName = inputFieldNames.get(targetIndex); - final boolean includeMetaFields = context.isProjectVisited(); - - // 3) Group by all non-target, non-metadata fields. - final List groupExprs = - buildGroupExpressionsExcludingTarget( - targetIndex, inputFieldNames, relBuilder, includeMetaFields); + /** + * mvcombine command visitor to collapse rows that are identical on all non-target fields. + * + *

Grouping semantics:
+ *
+ *   • The target field is always excluded from the GROUP BY keys.
+ *   • Metadata fields (for example {@code _id}, {@code _index}, {@code _score}) are excluded from
+ *     GROUP BY keys unless they were explicitly projected earlier (for example, via {@code fields}).
+ *
+ * The target field values are aggregated using {@code ARRAY_AGG}, with {@code NULL} values
+ * filtered out. The aggregation result replaces the original target column and produces an
+ * {@code ARRAY} output.
+ *
The original output column order is preserved. Metadata fields are projected as typed {@code + * NULL} literals after aggregation only when they are not part of grouping (since they were skipped). + * + * @param node mvcombine command to be visited + * @param context CalcitePlanContext containing the RelBuilder and planning context + * @return RelNode representing collapsed records with the target combined into a multivalue array + * @throws SemanticCheckException if the mvcombine target is not a direct field reference + */ + @Override + public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { + // 1) Lower the child plan first so the RelBuilder has the input schema on the stack. + visitChildren(node, context); + + final RelBuilder relBuilder = context.relBuilder; + + final RelNode input = relBuilder.peek(); + final List inputFieldNames = input.getRowType().getFieldNames(); + final List inputFieldTypes = + input.getRowType().getFieldList().stream().map(RelDataTypeField::getType).toList(); + + // If true, we should NOT auto-skip meta fields (because user explicitly projected them) + final boolean includeMetaFields = context.isProjectVisited(); + + // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). + final Field targetField = node.getField(); + final int targetIndex = resolveTargetIndex(targetField, context); + final String targetName = inputFieldNames.get(targetIndex); + + // 3) Group by all non-target fields, skipping meta fields unless explicitly projected. + final List groupExprs = + buildGroupExpressionsExcludingTarget( + targetIndex, inputFieldNames, relBuilder, includeMetaFields); + + // 4) Aggregate target values using ARRAY_AGG, filtering out NULLs. + performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); + + // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). + restoreColumnOrderAfterArrayAgg( + relBuilder, inputFieldNames, inputFieldTypes, targetIndex, groupExprs, includeMetaFields); + + return relBuilder.peek(); + } + + /** Resolves the mvcombine target expression to an input field index. */ + private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { + final RexNode targetRex; + try { + targetRex = rexVisitor.analyze(targetField, context); + } catch (IllegalArgumentException e) { + // Make missing-field behavior deterministic (and consistently mapped to 4xx) + // instead of leaking RelBuilder/rexVisitor exception wording. + throw new SemanticCheckException( + "mvcombine target field not found: " + targetField.getField().toString(), e); + } - // If all remaining fields are metadata or the target itself, mvcombine degenerates to a global - // combine. This is intentional: result collapses to a single row with aggregated target values. + if (!isInputRef(targetRex)) { + throw new SemanticCheckException( + "mvcombine target must be a direct field reference, but got: " + targetField); + } - // 4) Aggregate target values using ARRAY_AGG, filtering out NULLs. - performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); + final int index = ((RexInputRef) targetRex).getIndex(); - // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). 
- restoreColumnOrderAfterArrayAgg( - relBuilder, inputFieldNames, inputFieldTypes, targetIndex, groupExprs, includeMetaFields); + final RelDataType fieldType = + context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); - return relBuilder.peek(); - } + if (SqlTypeUtil.isArray(fieldType) || SqlTypeUtil.isMultiset(fieldType)) { + throw new SemanticCheckException( + "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); + } - /** - * Resolves the mvcombine target expression to an input field index. - * - *

mvcombine requires the target to be a direct field reference (RexInputRef). This keeps the
- * command semantics predictable and avoids accidental grouping on computed expressions.
- *
- *
The target must also be a scalar-ish field. mvcombine outputs ARRAY<T>, so the input - * target cannot already be an ARRAY or MULTISET. - * - * @param targetField Target field expression from the AST - * @param context Planning context - * @return 0-based input field index for the target - * @throws SemanticCheckException if the target is not a direct field reference or has an array - * type - */ - private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { - final RexNode targetRex = rexVisitor.analyze(targetField, context); - if (!isInputRef(targetRex)) { - throw new SemanticCheckException( - "mvcombine target must be a direct field reference, but got: " + targetField); + return index; } - final int index = ((RexInputRef) targetRex).getIndex(); + /** + * Builds group-by expressions for mvcombine: + * all non-target input fields; meta fields are skipped unless includeMetaFields is true. + */ + private List buildGroupExpressionsExcludingTarget( + int targetIndex, + List inputFieldNames, + RelBuilder relBuilder, + boolean includeMetaFields) { - final RelDataType fieldType = - context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); - - if (SqlTypeUtil.isArray(fieldType) || SqlTypeUtil.isMultiset(fieldType)) { - throw new SemanticCheckException( - "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); + final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); + for (int i = 0; i < inputFieldNames.size(); i++) { + if (i == targetIndex) { + continue; + } + if (isMetadataField(inputFieldNames.get(i)) && !includeMetaFields) { + continue; + } + groupExprs.add(relBuilder.field(i)); + } + return groupExprs; } - return index; - } + /** Applies mvcombine aggregation. */ + private void performArrayAggAggregation( + RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { - /** - * Builds group-by expressions for mvcombine: all non-target, non-metadata input fields. - * - * @param targetIndex Input index of the mvcombine target field - * @param inputFieldNames Input schema field names (for sizing/ordering) - * @param relBuilder RelBuilder positioned on the input - * @return Group-by expressions in input order excluding the target - */ - private List buildGroupExpressionsExcludingTarget( - int targetIndex, - List inputFieldNames, - RelBuilder relBuilder, - boolean includeMetaFields) { - final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); - for (int i = 0; i < inputFieldNames.size(); i++) { - if (i == targetIndex) { - continue; - } - final String fieldName = inputFieldNames.get(i); + final RexNode targetRef = relBuilder.field(targetIndex); + final RexNode notNullTarget = relBuilder.isNotNull(targetRef); - if (isMetadataField(fieldName) && !includeMetaFields) continue; + final RelBuilder.AggCall aggCall = + relBuilder + .aggregateCall(SqlLibraryOperators.ARRAY_AGG, targetRef) + .filter(notNullTarget) + .as(targetName); - groupExprs.add(relBuilder.field(i)); + relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); } - return groupExprs; - } - /** - * Applies mvcombine aggregation: - * - *

GROUP BY all non-target fields (excluding metadata fields), and aggregate target values
- * using {@code ARRAY_AGG}. {@code ARRAY_AGG} produces an {@code ARRAY} in Calcite, which we
- * keep as-is for output.
- *
- *
NULL target values are excluded from the aggregated array by applying an aggregate filter. - * - * @param relBuilder RelBuilder positioned on the input - * @param targetIndex Target field input index - * @param targetName Target field output name (preserved) - * @param groupExprs Group-by expressions (non-target, non-metadata fields) - */ - private void performArrayAggAggregation( - RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { + /** + * Restores the original output column order after the aggregate step. + * Meta fields are set to typed NULL only when they were skipped from grouping (includeMetaFields=false). + */ + private void restoreColumnOrderAfterArrayAgg( + RelBuilder relBuilder, + List inputFieldNames, + List inputFieldTypes, + int targetIndex, + List groupExprs, + boolean includeMetaFields) { - final RexNode targetRef = relBuilder.field(targetIndex); - final RexNode notNullTarget = relBuilder.isNotNull(targetRef); + final int aggregatedTargetPos = groupExprs.size(); - final RelBuilder.AggCall aggCall = - relBuilder - .aggregateCall(SqlLibraryOperators.ARRAY_AGG, targetRef) - .filter(notNullTarget) - .as(targetName); + final List projections = new ArrayList<>(inputFieldNames.size()); + final List projectionNames = new ArrayList<>(inputFieldNames.size()); - relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); - } + int groupPos = 0; + for (int i = 0; i < inputFieldNames.size(); i++) { + final String fieldName = inputFieldNames.get(i); + projectionNames.add(fieldName); - /** - * Restores the original output column order after the aggregate step. - * - *

After aggregation, the schema is:
- *
-   *   [groupField0, groupField1, ..., groupFieldN, targetAggArray]
- *
- *
- * This method projects fields back to the original input order, replacing the original target
- * slot with the aggregated target value. Metadata fields that were excluded from the group key
- * are preserved in the output schema but projected as {@code NULL}, since they do not exist in
- * the post-aggregate row.
- *
Since {@code ARRAY_AGG} already returns {@code ARRAY}, no cast is needed. - * - * @param relBuilder RelBuilder positioned on the post-aggregate node - * @param inputFieldNames Original input field names (also output field names) - * @param targetIndex Target field index in the original input - * @param groupExprs Group-by expressions used during aggregation - */ - private void restoreColumnOrderAfterArrayAgg( - RelBuilder relBuilder, - List inputFieldNames, - List inputFieldTypes, - int targetIndex, - List groupExprs, - boolean includeMetaFields) { - - // Post-aggregate: group fields come first, and the aggregated target is appended at the end. - final int aggregatedTargetPos = groupExprs.size(); - - final List projections = new ArrayList<>(inputFieldNames.size()); - final List projectionNames = new ArrayList<>(inputFieldNames.size()); - - int groupPos = 0; - for (int i = 0; i < inputFieldNames.size(); i++) { - final String fieldName = inputFieldNames.get(i); - projectionNames.add(fieldName); - - if (i == targetIndex) { - // ARRAY_AGG already returns ARRAY - projections.add(relBuilder.field(aggregatedTargetPos)); - } else if (isMetadataField(fieldName) && !includeMetaFields) { - // Metadata fields are intentionally not grouped by mvcombine. - // Preserve schema correctness by projecting a TYPED NULL based on ORIGINAL input schema. - projections.add(relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i))); - } else { - projections.add(relBuilder.field(groupPos)); - groupPos++; - } - } + if (i == targetIndex) { + // aggregated target is always the last field in the aggregate output + projections.add(relBuilder.field(aggregatedTargetPos)); + } else if (isMetadataField(fieldName) && !includeMetaFields) { + // meta fields were skipped from grouping => not present in aggregate output => keep schema stable + projections.add(relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i))); + } else { + // grouped field (including meta fields when includeMetaFields=true) + projections.add(relBuilder.field(groupPos)); + groupPos++; + } + } - // Force projection to avoid Calcite "identity" short-circuit when only names/types change. - relBuilder.project(projections, projectionNames, /* force= */ true); - } + relBuilder.project(projections, projectionNames, /* force= */ true); + } - @Override + @Override public RelNode visitValues(Values values, CalcitePlanContext context) { if (values.getValues() == null || values.getValues().isEmpty()) { context.relBuilder.values(context.relBuilder.getTypeFactory().builder().build()); diff --git a/docs/user/ppl/cmd/mvcombine.md b/docs/user/ppl/cmd/mvcombine.md index e50a2d2c1f1..4ccad724ca7 100644 --- a/docs/user/ppl/cmd/mvcombine.md +++ b/docs/user/ppl/cmd/mvcombine.md @@ -55,7 +55,7 @@ fetched rows / total rows = 1/1 +----------+-------+------+-------------+ ``` -Example 2: Multiple groups +## Example 2: Multiple groups Given a dataset mvcombine with the following data: ```text @@ -85,7 +85,7 @@ fetched rows / total rows = 2/2 +----------+-------+------+-------------+ ``` -Example 3: Missing target field in some rows +## Example 3: Missing target field in some rows Rows missing the target field do not contribute a value to the combined output. @@ -114,7 +114,7 @@ fetched rows / total rows = 1/1 +----------+-------+------+-------------+ ``` -Example 5: Error when field does not exist +## Example 4: Error when field does not exist If the specified field does not exist in the current schema, mvcombine returns an error. 
```ppl From 42c116fb34dc1a4b31bd809b613fbfb1a0798c3e Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 14:56:46 -0600 Subject: [PATCH 30/32] Coderrabbit issues Signed-off-by: Srikanth Padakanti --- .../sql/calcite/CalciteRelNodeVisitor.java | 294 +++++++++--------- 1 file changed, 148 insertions(+), 146 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 411a58c39c7..597501ef576 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -3170,173 +3170,175 @@ public RelNode visitExpand(Expand expand, CalcitePlanContext context) { return context.relBuilder.peek(); } - /** - * mvcombine command visitor to collapse rows that are identical on all non-target fields. - * - *

Grouping semantics:
- *
- *   • The target field is always excluded from the GROUP BY keys.
- *   • Metadata fields (for example {@code _id}, {@code _index}, {@code _score}) are excluded from
- *     GROUP BY keys unless they were explicitly projected earlier (for example, via {@code fields}).
- *
- * The target field values are aggregated using {@code ARRAY_AGG}, with {@code NULL} values
- * filtered out. The aggregation result replaces the original target column and produces an
- * {@code ARRAY} output.
- *
The original output column order is preserved. Metadata fields are projected as typed {@code - * NULL} literals after aggregation only when they are not part of grouping (since they were skipped). - * - * @param node mvcombine command to be visited - * @param context CalcitePlanContext containing the RelBuilder and planning context - * @return RelNode representing collapsed records with the target combined into a multivalue array - * @throws SemanticCheckException if the mvcombine target is not a direct field reference - */ - @Override - public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { - // 1) Lower the child plan first so the RelBuilder has the input schema on the stack. - visitChildren(node, context); - - final RelBuilder relBuilder = context.relBuilder; - - final RelNode input = relBuilder.peek(); - final List inputFieldNames = input.getRowType().getFieldNames(); - final List inputFieldTypes = - input.getRowType().getFieldList().stream().map(RelDataTypeField::getType).toList(); - - // If true, we should NOT auto-skip meta fields (because user explicitly projected them) - final boolean includeMetaFields = context.isProjectVisited(); - - // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). - final Field targetField = node.getField(); - final int targetIndex = resolveTargetIndex(targetField, context); - final String targetName = inputFieldNames.get(targetIndex); - - // 3) Group by all non-target fields, skipping meta fields unless explicitly projected. - final List groupExprs = - buildGroupExpressionsExcludingTarget( - targetIndex, inputFieldNames, relBuilder, includeMetaFields); - - // 4) Aggregate target values using ARRAY_AGG, filtering out NULLs. - performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); - - // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). - restoreColumnOrderAfterArrayAgg( - relBuilder, inputFieldNames, inputFieldTypes, targetIndex, groupExprs, includeMetaFields); - - return relBuilder.peek(); - } - - /** Resolves the mvcombine target expression to an input field index. */ - private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { - final RexNode targetRex; - try { - targetRex = rexVisitor.analyze(targetField, context); - } catch (IllegalArgumentException e) { - // Make missing-field behavior deterministic (and consistently mapped to 4xx) - // instead of leaking RelBuilder/rexVisitor exception wording. - throw new SemanticCheckException( - "mvcombine target field not found: " + targetField.getField().toString(), e); - } + /** + * mvcombine command visitor to collapse rows that are identical on all non-target fields. + * + *

Grouping semantics:
+ *
+ *   • The target field is always excluded from the GROUP BY keys.
+ *   • Metadata fields (for example {@code _id}, {@code _index}, {@code _score}) are excluded
+ *     from GROUP BY keys unless they were explicitly projected earlier (for example, via {@code fields}).
+ *
+ * The target field values are aggregated using {@code ARRAY_AGG}, with {@code NULL} values
+ * filtered out. The aggregation result replaces the original target column and produces an
+ * {@code ARRAY} output.
+ *
The original output column order is preserved. Metadata fields are projected as typed {@code + * NULL} literals after aggregation only when they are not part of grouping (since they were + * skipped). + * + * @param node mvcombine command to be visited + * @param context CalcitePlanContext containing the RelBuilder and planning context + * @return RelNode representing collapsed records with the target combined into a multivalue array + * @throws SemanticCheckException if the mvcombine target is not a direct field reference + */ + @Override + public RelNode visitMvCombine(MvCombine node, CalcitePlanContext context) { + // 1) Lower the child plan first so the RelBuilder has the input schema on the stack. + visitChildren(node, context); - if (!isInputRef(targetRex)) { - throw new SemanticCheckException( - "mvcombine target must be a direct field reference, but got: " + targetField); - } + final RelBuilder relBuilder = context.relBuilder; - final int index = ((RexInputRef) targetRex).getIndex(); + final RelNode input = relBuilder.peek(); + final List inputFieldNames = input.getRowType().getFieldNames(); + final List inputFieldTypes = + input.getRowType().getFieldList().stream().map(RelDataTypeField::getType).toList(); - final RelDataType fieldType = - context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); + // If true, we should NOT auto-skip meta fields (because user explicitly projected them) + final boolean includeMetaFields = context.isProjectVisited(); - if (SqlTypeUtil.isArray(fieldType) || SqlTypeUtil.isMultiset(fieldType)) { - throw new SemanticCheckException( - "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); - } + // 2) Resolve the mvcombine target to an input column index (must be a direct field reference). + final Field targetField = node.getField(); + final int targetIndex = resolveTargetIndex(targetField, context); + final String targetName = inputFieldNames.get(targetIndex); - return index; - } + // 3) Group by all non-target fields, skipping meta fields unless explicitly projected. + final List groupExprs = + buildGroupExpressionsExcludingTarget( + targetIndex, inputFieldNames, relBuilder, includeMetaFields); - /** - * Builds group-by expressions for mvcombine: - * all non-target input fields; meta fields are skipped unless includeMetaFields is true. - */ - private List buildGroupExpressionsExcludingTarget( - int targetIndex, - List inputFieldNames, - RelBuilder relBuilder, - boolean includeMetaFields) { + // 4) Aggregate target values using ARRAY_AGG, filtering out NULLs. + performArrayAggAggregation(relBuilder, targetIndex, targetName, groupExprs); - final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); - for (int i = 0; i < inputFieldNames.size(); i++) { - if (i == targetIndex) { - continue; - } - if (isMetadataField(inputFieldNames.get(i)) && !includeMetaFields) { - continue; - } - groupExprs.add(relBuilder.field(i)); - } - return groupExprs; + // 5) Restore original output column order (ARRAY_AGG already returns ARRAY). + restoreColumnOrderAfterArrayAgg( + relBuilder, inputFieldNames, inputFieldTypes, targetIndex, groupExprs, includeMetaFields); + + return relBuilder.peek(); + } + + /** Resolves the mvcombine target expression to an input field index. 
*/ + private int resolveTargetIndex(Field targetField, CalcitePlanContext context) { + final RexNode targetRex; + try { + targetRex = rexVisitor.analyze(targetField, context); + } catch (IllegalArgumentException e) { + // Make missing-field behavior deterministic (and consistently mapped to 4xx) + // instead of leaking RelBuilder/rexVisitor exception wording. + throw new SemanticCheckException( + "mvcombine target field not found: " + targetField.getField().toString(), e); } - /** Applies mvcombine aggregation. */ - private void performArrayAggAggregation( - RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { + if (!isInputRef(targetRex)) { + throw new SemanticCheckException( + "mvcombine target must be a direct field reference, but got: " + targetField); + } - final RexNode targetRef = relBuilder.field(targetIndex); - final RexNode notNullTarget = relBuilder.isNotNull(targetRef); + final int index = ((RexInputRef) targetRex).getIndex(); - final RelBuilder.AggCall aggCall = - relBuilder - .aggregateCall(SqlLibraryOperators.ARRAY_AGG, targetRef) - .filter(notNullTarget) - .as(targetName); + final RelDataType fieldType = + context.relBuilder.peek().getRowType().getFieldList().get(index).getType(); - relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); + if (SqlTypeUtil.isArray(fieldType) || SqlTypeUtil.isMultiset(fieldType)) { + throw new SemanticCheckException( + "mvcombine target cannot be an array/multivalue type, but got: " + fieldType); } - /** - * Restores the original output column order after the aggregate step. - * Meta fields are set to typed NULL only when they were skipped from grouping (includeMetaFields=false). - */ - private void restoreColumnOrderAfterArrayAgg( - RelBuilder relBuilder, - List inputFieldNames, - List inputFieldTypes, - int targetIndex, - List groupExprs, - boolean includeMetaFields) { + return index; + } - final int aggregatedTargetPos = groupExprs.size(); + /** + * Builds group-by expressions for mvcombine: all non-target input fields; meta fields are skipped + * unless includeMetaFields is true. + */ + private List buildGroupExpressionsExcludingTarget( + int targetIndex, + List inputFieldNames, + RelBuilder relBuilder, + boolean includeMetaFields) { + + final List groupExprs = new ArrayList<>(Math.max(0, inputFieldNames.size() - 1)); + for (int i = 0; i < inputFieldNames.size(); i++) { + if (i == targetIndex) { + continue; + } + if (isMetadataField(inputFieldNames.get(i)) && !includeMetaFields) { + continue; + } + groupExprs.add(relBuilder.field(i)); + } + return groupExprs; + } - final List projections = new ArrayList<>(inputFieldNames.size()); - final List projectionNames = new ArrayList<>(inputFieldNames.size()); + /** Applies mvcombine aggregation. 
*/ + private void performArrayAggAggregation( + RelBuilder relBuilder, int targetIndex, String targetName, List groupExprs) { - int groupPos = 0; - for (int i = 0; i < inputFieldNames.size(); i++) { - final String fieldName = inputFieldNames.get(i); - projectionNames.add(fieldName); + final RexNode targetRef = relBuilder.field(targetIndex); + final RexNode notNullTarget = relBuilder.isNotNull(targetRef); - if (i == targetIndex) { - // aggregated target is always the last field in the aggregate output - projections.add(relBuilder.field(aggregatedTargetPos)); - } else if (isMetadataField(fieldName) && !includeMetaFields) { - // meta fields were skipped from grouping => not present in aggregate output => keep schema stable - projections.add(relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i))); - } else { - // grouped field (including meta fields when includeMetaFields=true) - projections.add(relBuilder.field(groupPos)); - groupPos++; - } - } + final RelBuilder.AggCall aggCall = + relBuilder + .aggregateCall(SqlLibraryOperators.ARRAY_AGG, targetRef) + .filter(notNullTarget) + .as(targetName); + + relBuilder.aggregate(relBuilder.groupKey(groupExprs), aggCall); + } - relBuilder.project(projections, projectionNames, /* force= */ true); + /** + * Restores the original output column order after the aggregate step. Meta fields are set to + * typed NULL only when they were skipped from grouping (includeMetaFields=false). + */ + private void restoreColumnOrderAfterArrayAgg( + RelBuilder relBuilder, + List inputFieldNames, + List inputFieldTypes, + int targetIndex, + List groupExprs, + boolean includeMetaFields) { + + final int aggregatedTargetPos = groupExprs.size(); + + final List projections = new ArrayList<>(inputFieldNames.size()); + final List projectionNames = new ArrayList<>(inputFieldNames.size()); + + int groupPos = 0; + for (int i = 0; i < inputFieldNames.size(); i++) { + final String fieldName = inputFieldNames.get(i); + projectionNames.add(fieldName); + + if (i == targetIndex) { + // aggregated target is always the last field in the aggregate output + projections.add(relBuilder.field(aggregatedTargetPos)); + } else if (isMetadataField(fieldName) && !includeMetaFields) { + // meta fields were skipped from grouping => not present in aggregate output => keep schema + // stable + projections.add(relBuilder.getRexBuilder().makeNullLiteral(inputFieldTypes.get(i))); + } else { + // grouped field (including meta fields when includeMetaFields=true) + projections.add(relBuilder.field(groupPos)); + groupPos++; + } } - @Override + relBuilder.project(projections, projectionNames, /* force= */ true); + } + + @Override public RelNode visitValues(Values values, CalcitePlanContext context) { if (values.getValues() == null || values.getValues().isEmpty()) { context.relBuilder.values(context.relBuilder.getTypeFactory().builder().build()); From 42b0db14a6ff2e10756c2478d3d5d2b1aba30ec5 Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 17:34:30 -0600 Subject: [PATCH 31/32] Address comments Signed-off-by: Srikanth Padakanti --- .../sql/ast/analysis/FieldResolutionVisitor.java | 9 ++++++++- docs/category.json | 2 +- .../calcite/remote/CalcitePPLSpathCommandIT.java | 14 ++++++++++++++ ppl/src/main/antlr/OpenSearchPPLParser.g4 | 1 - .../sql/ppl/parser/FieldResolutionVisitorTest.java | 10 ++++++++++ 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java 
b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java index 232d53c0e99..23c6fcea59f 100644 --- a/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/analysis/FieldResolutionVisitor.java @@ -632,7 +632,14 @@ public Node visitExpand(Expand node, FieldResolutionContext context) { @Override public Node visitMvCombine(MvCombine node, FieldResolutionContext context) { Set mvCombineFields = extractFieldsFromExpression(node.getField()); - context.pushRequirements(context.getCurrentRequirements().or(mvCombineFields)); + + FieldResolutionResult current = context.getCurrentRequirements(); + + Set regularFields = new HashSet<>(current.getRegularFields()); + regularFields.addAll(mvCombineFields); + + context.pushRequirements(new FieldResolutionResult(regularFields, Set.of(ALL_FIELDS))); + visitChildren(node, context); context.popRequirements(); return node; diff --git a/docs/category.json b/docs/category.json index 7d260be1d65..d6c8c9dfb0e 100644 --- a/docs/category.json +++ b/docs/category.json @@ -23,6 +23,7 @@ "user/ppl/cmd/head.md", "user/ppl/cmd/join.md", "user/ppl/cmd/lookup.md", + "user/ppl/cmd/mvcombine.md", "user/ppl/cmd/parse.md", "user/ppl/cmd/patterns.md", "user/ppl/cmd/rare.md", @@ -36,7 +37,6 @@ "user/ppl/cmd/sort.md", "user/ppl/cmd/spath.md", "user/ppl/cmd/stats.md", - "user/ppl/cmd/mvcombine.md", "user/ppl/cmd/streamstats.md", "user/ppl/cmd/subquery.md", "user/ppl/cmd/syntax.md", diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java index 0c2c2e94a7e..7c6ebaa8f98 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java @@ -262,4 +262,18 @@ public void testAppendWithSpathInSubsearchDynamicFields() throws IOException { rows(null, null, null, "simple", "4", sj("{'a': 1, 'b': 2, 'c': 3}")), rows("1", "3", "2", "simple", null, sj("{'a': 1, 'b': 2, 'c': 3}"))); } + + @Test + public void testSpathWithMvCombine() throws IOException { + JSONObject result = + executeQuery( + "source=test_json | where category='simple' " + + "| spath input=userData " + + "| fields a, b, c " + + "| mvcombine c"); + + verifySchema(result, schema("a", "string"), schema("b", "string"), schema("c", "array")); + + verifyDataRows(result, rows("1", "2", new String[] {"3", "3"})); + } } diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 9ae6ae0fe92..2131eeae939 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -549,7 +549,6 @@ mvcombineCommand : MVCOMBINE fieldExpression (DELIM EQUAL stringLiteral)? ; - flattenCommand : FLATTEN fieldExpression (AS aliases = identifierSeq)? 
; diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/FieldResolutionVisitorTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/FieldResolutionVisitorTest.java index a4d4f4874ad..4c9a4ea4267 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/FieldResolutionVisitorTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/FieldResolutionVisitorTest.java @@ -172,6 +172,16 @@ public void testMultiRelationResult() { assertEquals("*", result.getWildcard().toString()); } + @Test + public void testMvCombineAddsTargetFieldToRequirements() { + assertSingleRelationFields("source=logs | mvcombine packets_str", Set.of("packets_str"), "*"); + } + + @Test + public void testMvCombineAddsWildcard() { + assertSingleRelationFields("source=logs | mvcombine packets_str", Set.of("packets_str"), "*"); + } + @Test public void testSimpleJoin() { assertMultiRelationFields( From 860a1fe5edc3d89bca0fe6d48508a1aba372537d Mon Sep 17 00:00:00 2001 From: Srikanth Padakanti Date: Tue, 27 Jan 2026 17:38:53 -0600 Subject: [PATCH 32/32] Address comments Signed-off-by: Srikanth Padakanti --- .../opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java index d552c230ff3..1405df66753 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java @@ -358,7 +358,7 @@ public void testMultisearchWithSpath() throws IOException { "str", sj("{'nested': {'d': [1, 2, 3], 'e': 'str'}}"))); } - + @Test public void testSpathWithMvCombine() throws IOException { JSONObject result =