diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java index bb891e49cec..7dd517194ec 100644 --- a/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java +++ b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java @@ -5,21 +5,27 @@ package org.opensearch.sql.ast.tree; +import static org.opensearch.sql.common.utils.StringUtils.unquoteIdentifier; + import com.google.common.collect.ImmutableList; import java.util.List; import lombok.AllArgsConstructor; import lombok.EqualsAndHashCode; import lombok.RequiredArgsConstructor; import lombok.ToString; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.tools.RelBuilder; import org.checkerframework.checker.nullness.qual.Nullable; import org.opensearch.sql.ast.AbstractNodeVisitor; import org.opensearch.sql.ast.dsl.AstDSL; +import org.opensearch.sql.calcite.CalcitePlanContext; @ToString @EqualsAndHashCode(callSuper = false) @RequiredArgsConstructor @AllArgsConstructor public class SPath extends UnresolvedPlan { + private final char DOT = '.'; private UnresolvedPlan child; private final String inField; @@ -44,17 +50,89 @@ public T accept(AbstractNodeVisitor nodeVisitor, C context) { return nodeVisitor.visitSpath(this, context); } - public Eval rewriteAsEval() { + private String fullPath() { + return this.inField + DOT + this.path; + } + + /** + * Determine whether a provided match string is better than the current best match available, for + * path matching. + * + * @param maybeMatch A field name that we're testing for path matching + * @param currentRecordMatch Our best field match so far. Should at least be as good as + * `this.inField` + * @return The better match between the two provided options + */ + private String preferredPathMatch(String maybeMatch, String currentRecordMatch) { + String path = this.fullPath(); + // If the provided match isn't even a match, skip it + if (!path.startsWith(maybeMatch) || maybeMatch.length() <= currentRecordMatch.length()) { + return currentRecordMatch; + } + // Ensure the match is on a proper segment boundary (either dot-delimited, or exactly matches + // the path) + if (path.length() == maybeMatch.length() || path.charAt(maybeMatch.length()) == '.') { + return maybeMatch; + } + // We had a match, but it wasn't better than our current record + return currentRecordMatch; + } + + /** + * We want input=outer, path=inner.data to match records like `{ "outer": { "inner": "{\"data\": + * 0}" }}`. To rewrite this as eval, that means we need to detect the longest prefix match in the + * fields (`outer.inner`) and parse `data` out of it. We need to match on segments, so + * `outer.inner` shouldn't match `outer.inner_other`. + * + * @return The field from the RelBuilder with the most overlap, or inField if none exists. + */ + private String computePathField(RelBuilder builder) { + RelDataType rowType = builder.peek().getRowType(); + List rowFieldNames = rowType.getFieldNames(); + String result = this.inField; + + for (String name : rowFieldNames) { + result = this.preferredPathMatch(name, result); + } + + return result; + } + + /** + * Convert this `spath` expression to an equivalent `json_extract` eval. + * + * @param context The planning context for the rewrite, which has access to the available fields. + * @return The rewritten expression. + */ + public Eval rewriteAsEval(CalcitePlanContext context) { String outField = this.outField; if (outField == null) { - outField = this.path; + outField = unquoteIdentifier(this.path); + } + + String pathField = computePathField(context.relBuilder); + String reducedPath = this.fullPath().substring(pathField.length()); + + String[] pathFieldParts = unquoteIdentifier(pathField).split("\\."); + + if (reducedPath.isEmpty()) { + // Special case: We're spath-extracting a path that already exists in the data. This is just a + // rename. + return AstDSL.eval( + this.child, + AstDSL.let(AstDSL.field(outField), AstDSL.field(AstDSL.qualifiedName(pathFieldParts)))); } + // Since pathField must be on a segment line, there's a leftover leading dot if we didn't match + // the whole path. + reducedPath = reducedPath.substring(1); return AstDSL.eval( this.child, AstDSL.let( AstDSL.field(outField), AstDSL.function( - "json_extract", AstDSL.field(inField), AstDSL.stringLiteral(this.path)))); + "json_extract", + AstDSL.field(AstDSL.qualifiedName(pathFieldParts)), + AstDSL.stringLiteral(unquoteIdentifier(reducedPath))))); } } diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 1f105f5f229..a1782cc4918 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -670,7 +670,8 @@ public RelNode visitParse(Parse node, CalcitePlanContext context) { @Override public RelNode visitSpath(SPath node, CalcitePlanContext context) { - return visitEval(node.rewriteAsEval(), context); + visitChildren(node, context); + return visitEval(node.rewriteAsEval(context), context); } @Override diff --git a/docs/category.json b/docs/category.json index cb40f1ebbcd..0f9cca2aea2 100644 --- a/docs/category.json +++ b/docs/category.json @@ -50,6 +50,7 @@ "user/ppl/cmd/syntax.rst", "user/ppl/cmd/timechart.rst", "user/ppl/cmd/search.rst", + "user/ppl/cmd/spath.rst", "user/ppl/functions/statistical.rst", "user/ppl/cmd/top.rst", "user/ppl/cmd/trendline.rst", diff --git a/docs/user/dql/metadata.rst b/docs/user/dql/metadata.rst index 8135e3684d4..05b4cf22f96 100644 --- a/docs/user/dql/metadata.rst +++ b/docs/user/dql/metadata.rst @@ -35,7 +35,7 @@ Example 1: Show All Indices Information SQL query:: os> SHOW TABLES LIKE '%' - fetched rows / total rows = 18/18 + fetched rows / total rows = 19/19 +----------------+-------------+------------------+------------+---------+----------+------------+-----------+---------------------------+----------------+ | TABLE_CAT | TABLE_SCHEM | TABLE_NAME | TABLE_TYPE | REMARKS | TYPE_CAT | TYPE_SCHEM | TYPE_NAME | SELF_REFERENCING_COL_NAME | REF_GENERATION | |----------------+-------------+------------------+------------+---------+----------+------------+-----------+---------------------------+----------------| @@ -52,6 +52,7 @@ SQL query:: | docTestCluster | null | otellogs | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | people | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | state_country | BASE TABLE | null | null | null | null | null | null | + | docTestCluster | null | structured | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | time_test | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | weblogs | BASE TABLE | null | null | null | null | null | null | | docTestCluster | null | wildcard | BASE TABLE | null | null | null | null | null | null | diff --git a/docs/user/ppl/cmd/spath.rst b/docs/user/ppl/cmd/spath.rst index 7defb4437f2..54e837cfec7 100644 --- a/docs/user/ppl/cmd/spath.rst +++ b/docs/user/ppl/cmd/spath.rst @@ -13,6 +13,8 @@ Description ============ | The `spath` command allows extracting fields from structured text data. It currently allows selecting from JSON data with JSON paths. +If the inner data is unable to be extracted (malformed data, missing keys), `"null"` is returned. `spath` always returns string values, except if the input field is `null`. + Version ======= 3.3.0 @@ -37,10 +39,10 @@ The simplest spath is to extract a single field. This extracts `n` from the `doc PPL query:: - PPL> source=test_spath | spath input=doc n; + os> source=structured | spath input=doc_n n | fields doc_n n; fetched rows / total rows = 3/3 +----------+---+ - | doc | n | + | doc_n | n | |----------+---| | {"n": 1} | 1 | | {"n": 2} | 2 | @@ -54,10 +56,10 @@ These queries demonstrate more JSON path uses, like traversing nested fields and PPL query:: - PPL> source=test_spath | spath input=doc output=first_element list{0} | spath input=doc output=all_elements list{} | spath input=doc output=nested nest_out.nest_in; + os> source=structured | spath input=doc_list output=first_element list{0} | spath input=doc_list output=all_elements list{} | spath input=doc_list output=nested nest_out.nest_in | fields doc_list first_element all_elements nested; fetched rows / total rows = 3/3 +------------------------------------------------------+---------------+--------------+--------+ - | doc | first_element | all_elements | nested | + | doc_list | first_element | all_elements | nested | |------------------------------------------------------+---------------+--------------+--------| | {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a | | {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a | @@ -71,10 +73,27 @@ The example shows extracting an inner field and doing statistics on it, using th PPL query:: - PPL> source=test_spath | spath input=doc n | eval n=cast(n as int) | stats sum(n); + os> source=structured | spath input=doc_n n | eval n=cast(n as int) | stats sum(n) | fields `sum(n)`; fetched rows / total rows = 1/1 +--------+ | sum(n) | |--------| | 6 | +--------+ + +Example 4: Field traversal +============================ + +SPath can also traverse plain documents, in which case it acts similarly to renaming. + +PPL query:: + + os> source=structured | spath input=obj_field path=field | fields obj_field field; + fetched rows / total rows = 3/3 + +----------------+-------+ + | obj_field | field | + |----------------+-------| + | {'field': 'a'} | a | + | {'field': 'b'} | b | + | {'field': 'c'} | c | + +----------------+-------+ diff --git a/doctest/test_data/structured.json b/doctest/test_data/structured.json new file mode 100644 index 00000000000..95bc6f8baab --- /dev/null +++ b/doctest/test_data/structured.json @@ -0,0 +1,3 @@ +{"doc_n":"{\"n\": 1}","doc_list":"{\"list\": [1, 2, 3, 4], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "a"}} +{"doc_n":"{\"n\": 2}","doc_list":"{\"list\": [], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "b"}} +{"doc_n":"{\"n\": 3}","doc_list":"{\"list\": [5, 6], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "c"}} diff --git a/doctest/test_docs.py b/doctest/test_docs.py index c94378ed537..af1155631c1 100644 --- a/doctest/test_docs.py +++ b/doctest/test_docs.py @@ -42,6 +42,7 @@ 'work_information': 'work_information.json', 'events': 'events.json', 'otellogs': 'otellogs.json', + 'structured': 'structured.json', 'time_test': 'time_test.json' } diff --git a/doctest/test_mapping/structured.json b/doctest/test_mapping/structured.json new file mode 100644 index 00000000000..9220baef878 --- /dev/null +++ b/doctest/test_mapping/structured.json @@ -0,0 +1,17 @@ +{ + "mappings": { + "properties": { + "doc_n": { + "type": "text" + }, + "doc_list": { + "type": "text" + }, + "obj_field": { + "properties": { + "field": { "type": "text" } + } + } + } + } +} diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 3ee0c568b41..2c2900dbebd 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -332,7 +332,7 @@ indexablePath ; pathElement - : ident pathArrayAccess? + : ident pathArrayAccess* ; pathArrayAccess @@ -1457,6 +1457,7 @@ searchableKeyWord | PATH | INPUT | OUTPUT + | FIELD // AGGREGATIONS AND WINDOW | statsFunctionName diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java index e97fb51ea90..cf892c18d6c 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java @@ -14,16 +14,25 @@ import static org.opensearch.sql.ast.dsl.AstDSL.spath; import static org.opensearch.sql.ast.dsl.AstDSL.stringLiteral; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rel.type.RelDataTypeFactory; +import org.apache.calcite.sql.type.SqlTypeName; import org.junit.Test; import org.mockito.Mockito; import org.opensearch.sql.ast.Node; import org.opensearch.sql.ast.tree.Eval; import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.calcite.CalcitePlanContext; +import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.common.setting.Settings; import org.opensearch.sql.ppl.antlr.PPLSyntaxParser; +import org.opensearch.sql.ppl.calcite.CalcitePPLAbstractTest; import org.opensearch.sql.ppl.parser.AstBuilder; -public class SPathRewriteTest { +public class SPathRewriteTest extends CalcitePPLAbstractTest { private final Settings settings = Mockito.mock(Settings.class); private final PPLSyntaxParser parser = new PPLSyntaxParser(); @@ -32,6 +41,20 @@ private Node plan(String query) { return astBuilder.visit(parser.parse(query)); } + private CalcitePlanContext createTestFieldContext(String[] fieldNames) { + CalcitePlanContext ctx = this.createBuilderContext(); + RelDataTypeFactory typeFactory = ctx.relBuilder.getTypeFactory(); + + List> fields = new ArrayList<>(); + for (String fieldName : fieldNames) { + fields.add(Map.entry(fieldName, typeFactory.createSqlType(SqlTypeName.ANY))); + } + RelDataType rowType = typeFactory.createStructType(fields); + + ctx.relBuilder.values(rowType); + return ctx; + } + // Control test to make sure something fundamental hasn't changed about the json_extract parsing @Test public void testEvalControl() { @@ -42,14 +65,6 @@ public void testEvalControl() { plan("source = t | eval o=json_extract(f, \"simple.nested\")")); } - @Test - public void testSpathSimpleRewrite() { - SPath sp = spath(relation("t"), "f", "o", "simple.nested"); - Eval ev = (Eval) plan("source = t | eval o=json_extract(f, \"simple.nested\")"); - - assertEquals(ev, sp.rewriteAsEval()); - } - @Test(expected = IllegalArgumentException.class) public void testSpathMissingInputArgumentHandling() { plan("source = t | spath path=a output=a"); @@ -64,4 +79,111 @@ public void testSpathMissingPathArgumentHandling() { public void testSpathArgumentDeshuffle() { assertEquals(plan("source = t | spath path=a input=a"), plan("source = t | spath input=a a")); } + + @Test + public void testSpathSimpleRewriteWithExistingFields() { + // Test when we have the exact field available + CalcitePlanContext ctx = createTestFieldContext(new String[] {"f"}); + SPath sp = spath(relation("t"), "f", "o", "simple.nested"); + Eval ev = (Eval) plan("source = t | eval o=json_extract(f, \"simple.nested\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSpathPartialMatchRewrite() { + // Test when we have a partial field match + CalcitePlanContext ctx = createTestFieldContext(new String[] {"outer.inner"}); + SPath sp = (SPath) plan("source = t | spath input=outer output=result inner.data"); + Eval ev = (Eval) plan("source = t | eval result=json_extract(outer.inner, \"data\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSpathExactFieldMatch() { + // Test when the requested path exactly matches an existing field + CalcitePlanContext ctx = createTestFieldContext(new String[] {"data.nested.field"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested.field"); + Eval ev = (Eval) plan("source = t | eval output=data.nested.field"); + Eval act = sp.rewriteAsEval(ctx); + assertEquals(ev, act); + } + + @Test + public void testSpathMultipleFieldOptions() { + // Test when multiple field options are available, should choose longest match + CalcitePlanContext ctx = + createTestFieldContext( + new String[] {"data", "data.nested", "data.nested.field", "data.other"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested.field.value"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data.nested.field, \"value\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSpathNoMatchingFields() { + // Test when no matching fields are available + CalcitePlanContext ctx = createTestFieldContext(new String[] {"other.field"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested.field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data, \"nested.field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSpathBacktickHandling() { + // Test handling of backtick-quoted paths + CalcitePlanContext ctx = createTestFieldContext(new String[] {"data"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output `nested.field`"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data, \"nested.field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSpathSegmentBoundaryMatch() { + // Test that we don't match partial segments + CalcitePlanContext ctx = createTestFieldContext(new String[] {"data.inner_extended"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output inner.field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data, \"inner.field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testEmptyPathMatch() { + CalcitePlanContext ctx = createTestFieldContext(new String[] {""}); + SPath sp = (SPath) plan("source = t | spath input=data output=output field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data, \"field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testSamePathLengthDifferentContent() { + CalcitePlanContext ctx = createTestFieldContext(new String[] {"data.test", "data.best"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output test.field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data.test, \"field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test(expected = SyntaxCheckException.class) + public void testMultipleDotsInPath() { + CalcitePlanContext ctx = createTestFieldContext(new String[] {"data.nested..field"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested..field.value"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data.nested..field, \"value\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testCaseSensitivePathMatch() { + CalcitePlanContext ctx = createTestFieldContext(new String[] {"Data.Nested", "data.nested"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested.field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data.nested, \"field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } + + @Test + public void testOverlappingFieldNames() { + CalcitePlanContext ctx = + createTestFieldContext(new String[] {"data.nested", "data.nested_extended"}); + SPath sp = (SPath) plan("source = t | spath input=data output=output nested.field"); + Eval ev = (Eval) plan("source = t | eval output=json_extract(data.nested, \"field\")"); + assertEquals(ev, sp.rewriteAsEval(ctx)); + } }