diff --git a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index f9eadced7fc..954d8379bec 100644 --- a/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -70,6 +70,7 @@ import org.opensearch.sql.ast.tree.RelationSubquery; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; @@ -237,6 +238,10 @@ public T visitParse(Parse node, C context) { return visitChildren(node, context); } + public T visitSpath(SPath node, C context) { + return visitChildren(node, context); + } + public T visitLet(Let node, C context) { return visitChildren(node, context); } diff --git a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java index f5fc501e597..46f937e3a9f 100644 --- a/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java +++ b/core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java @@ -65,6 +65,7 @@ import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.RelationSubquery; import org.opensearch.sql.ast.tree.Rename; +import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; import org.opensearch.sql.ast.tree.SubqueryAlias; @@ -520,6 +521,10 @@ public static Parse parse( return new Parse(parseMethod, sourceField, pattern, arguments, input); } + public static SPath spath(UnresolvedPlan input, String inField, String outField, String path) { + return new SPath(input, inField, outField, path); + } + public static Patterns patterns( UnresolvedPlan input, UnresolvedExpression sourceField, diff --git a/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java new file mode 100644 index 00000000000..cd38eef4a04 --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/ast/tree/SPath.java @@ -0,0 +1,55 @@ +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import java.util.List; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.RequiredArgsConstructor; +import lombok.ToString; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.dsl.AstDSL; + +@ToString +@EqualsAndHashCode(callSuper = false) +@RequiredArgsConstructor +@AllArgsConstructor +public class SPath extends UnresolvedPlan { + private UnresolvedPlan child; + + private final String inField; + + @Nullable private final String outField; + + private final String path; + + @Override + public UnresolvedPlan attach(UnresolvedPlan child) { + this.child = child; + return this; + } + + @Override + public List getChild() { + return this.child == null ? ImmutableList.of() : ImmutableList.of(this.child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitSpath(this, context); + } + + public Eval rewriteAsEval() { + String outField = this.outField; + if (outField == null) { + outField = this.path; + } + + return AstDSL.eval( + this.child, + AstDSL.let( + AstDSL.field(outField), + AstDSL.function( + "json_extract", AstDSL.field(inField), AstDSL.stringLiteral(this.path)))); + } +} diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java index 1a5d8a5c791..d90dbbeb246 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRelNodeVisitor.java @@ -100,6 +100,7 @@ import org.opensearch.sql.ast.tree.RareTopN; import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; +import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.Sort.SortOption; import org.opensearch.sql.ast.tree.SubqueryAlias; @@ -479,6 +480,11 @@ public RelNode visitParse(Parse node, CalcitePlanContext context) { return context.relBuilder.peek(); } + @Override + public RelNode visitSpath(SPath node, CalcitePlanContext context) { + return visitEval(node.rewriteAsEval(), context); + } + @Override public RelNode visitPatterns(Patterns node, CalcitePlanContext context) { visitChildren(node, context); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractFunctionImpl.java b/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractFunctionImpl.java index b08a2584883..76853706f64 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractFunctionImpl.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/jsonUDF/JsonExtractFunctionImpl.java @@ -93,7 +93,9 @@ private static boolean isScalarObject(Object obj) { } private static String doJsonize(Object candidate) { - if (isScalarObject(candidate)) { + if (candidate == null) { + return "null"; // Matches isScalarObject, but not toString-able. + } else if (isScalarObject(candidate)) { return candidate.toString(); } else { return JsonFunctions.jsonize(candidate); diff --git a/docs/user/ppl/cmd/spath.rst b/docs/user/ppl/cmd/spath.rst new file mode 100644 index 00000000000..7defb4437f2 --- /dev/null +++ b/docs/user/ppl/cmd/spath.rst @@ -0,0 +1,80 @@ +============= +spath +============= + +.. rubric:: Table of contents + +.. contents:: + :local: + :depth: 2 + + +Description +============ +| The `spath` command allows extracting fields from structured text data. It currently allows selecting from JSON data with JSON paths. + +Version +======= +3.3.0 + +Syntax +============ +spath input= [output=] [path=] + + +* input: mandatory. The field to scan for JSON data. +* output: optional. The destination field that the data will be loaded to. Defaults to the value of `path`. +* path: mandatory. The path of the data to load for the object. For more information on path syntax, see `json_extract <../functions/json.rst#json_extract>`_. + +Note +===== +The `spath` command currently does not support pushdown behavior for extraction. It will be slow on large datasets. It's generally better to index fields needed for filtering directly instead of using `spath` to filter nested fields. + +Example 1: Simple Field Extraction +================================== + +The simplest spath is to extract a single field. This extracts `n` from the `doc` field of type `text`. + +PPL query:: + + PPL> source=test_spath | spath input=doc n; + fetched rows / total rows = 3/3 + +----------+---+ + | doc | n | + |----------+---| + | {"n": 1} | 1 | + | {"n": 2} | 2 | + | {"n": 3} | 3 | + +----------+---+ + +Example 2: Lists & Nesting +============================ + +These queries demonstrate more JSON path uses, like traversing nested fields and extracting list elements. + +PPL query:: + + PPL> source=test_spath | spath input=doc output=first_element list{0} | spath input=doc output=all_elements list{} | spath input=doc output=nested nest_out.nest_in; + fetched rows / total rows = 3/3 + +------------------------------------------------------+---------------+--------------+--------+ + | doc | first_element | all_elements | nested | + |------------------------------------------------------+---------------+--------------+--------| + | {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a | + | {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a | + | {"list": [5, 6], "nest_out": {"nest_in": "a"}} | 5 | [5,6] | a | + +------------------------------------------------------+---------------+--------------+--------+ + +Example 3: Sum of inner elements +============================ + +The example shows extracting an inner field and doing statistics on it, using the docs from example 1. It also demonstrates that `spath` always returns strings for inner types. + +PPL query:: + + PPL> source=test_spath | spath input=doc n | eval n=cast(n as int) | stats sum(n); + fetched rows / total rows = 1/1 + +--------+ + | sum(n) | + |--------| + | 6 | + +--------+ diff --git a/docs/user/ppl/index.rst b/docs/user/ppl/index.rst index 59df104d63e..c74f5c82bf5 100644 --- a/docs/user/ppl/index.rst +++ b/docs/user/ppl/index.rst @@ -100,6 +100,8 @@ The query start with search command and then flowing a set of command delimited - `sort command `_ + - `spath command `_ + - `stats command `_ - `subquery (aka subsearch) command `_ diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java new file mode 100644 index 00000000000..51b5bd40304 --- /dev/null +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLSpathCommandIT.java @@ -0,0 +1,48 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.calcite.remote; + +import static org.opensearch.sql.util.MatcherUtils.rows; +import static org.opensearch.sql.util.MatcherUtils.schema; +import static org.opensearch.sql.util.MatcherUtils.verifyDataRows; +import static org.opensearch.sql.util.MatcherUtils.verifySchema; + +import java.io.IOException; +import org.json.JSONObject; +import org.junit.jupiter.api.Test; +import org.opensearch.client.Request; +import org.opensearch.sql.ppl.PPLIntegTestCase; + +public class CalcitePPLSpathCommandIT extends PPLIntegTestCase { + @Override + public void init() throws Exception { + super.init(); + enableCalcite(); + + loadIndex(Index.BANK); + + // Create test data for string concatenation + Request request1 = new Request("PUT", "/test_spath/_doc/1?refresh=true"); + request1.setJsonEntity("{\"doc\": \"{\\\"n\\\": 1}\"}"); + client().performRequest(request1); + + Request request2 = new Request("PUT", "/test_spath/_doc/2?refresh=true"); + request2.setJsonEntity("{\"doc\": \"{\\\"n\\\": 2}\"}"); + client().performRequest(request2); + + Request request3 = new Request("PUT", "/test_spath/_doc/3?refresh=true"); + request3.setJsonEntity("{\"doc\": \"{\\\"n\\\": 3}\"}"); + client().performRequest(request3); + } + + @Test + public void testSimpleSpath() throws IOException { + JSONObject result = + executeQuery("source=test_spath | spath input=doc output=result path=n | fields result"); + verifySchema(result, schema("result", "string")); + verifyDataRows(result, rows("1"), rows("2"), rows("3")); + } +} diff --git a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 index b6506f697fe..92656208cd2 100644 --- a/ppl/src/main/antlr/OpenSearchPPLLexer.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLLexer.g4 @@ -28,6 +28,7 @@ HEAD: 'HEAD'; TOP: 'TOP'; RARE: 'RARE'; PARSE: 'PARSE'; +SPATH: 'SPATH'; REGEX: 'REGEX'; PUNCT: 'PUNCT'; GROK: 'GROK'; @@ -114,6 +115,9 @@ ANOMALY_SCORE_THRESHOLD: 'ANOMALY_SCORE_THRESHOLD'; APPEND: 'APPEND'; COUNTFIELD: 'COUNTFIELD'; SHOWCOUNT: 'SHOWCOUNT'; +INPUT: 'INPUT'; +OUTPUT: 'OUTPUT'; +PATH: 'PATH'; // COMPARISON FUNCTION KEYWORDS CASE: 'CASE'; @@ -202,6 +206,8 @@ LT_PRTHS: '('; RT_PRTHS: ')'; LT_SQR_PRTHS: '['; RT_SQR_PRTHS: ']'; +LT_CURLY: '{'; +RT_CURLY: '}'; SINGLE_QUOTE: '\''; DOUBLE_QUOTE: '"'; BACKTICK: '`'; diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4 index 40d5b662a2e..ff5a82d95af 100644 --- a/ppl/src/main/antlr/OpenSearchPPLParser.g4 +++ b/ppl/src/main/antlr/OpenSearchPPLParser.g4 @@ -60,6 +60,7 @@ commands | rareCommand | grokCommand | parseCommand + | spathCommand | patternsCommand | lookupCommand | kmeansCommand @@ -187,6 +188,28 @@ parseCommand : PARSE (source_field = expression) (pattern = stringLiteral) ; +spathCommand + : SPATH spathParameter* + ; + +spathParameter + : (INPUT EQUAL input = expression) + | (OUTPUT EQUAL output = expression) + | ((PATH EQUAL)? path = indexablePath) + ; + +indexablePath + : pathElement (DOT pathElement)* + ; + +pathElement + : ident pathArrayAccess? + ; + +pathArrayAccess + : LT_CURLY (INTEGER_LITERAL)? RT_CURLY + ; + patternsMethod : PUNCT | REGEX @@ -1199,6 +1222,10 @@ keywordsCanBeId | ANOMALY_SCORE_THRESHOLD | COUNTFIELD | SHOWCOUNT + | PATH + | INPUT + | OUTPUT + // AGGREGATIONS AND WINDOW | statsFunctionName | windowFunctionName diff --git a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index eb39d42fc86..17ca5a71015 100644 --- a/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -80,6 +80,7 @@ import org.opensearch.sql.ast.tree.Relation; import org.opensearch.sql.ast.tree.Rename; import org.opensearch.sql.ast.tree.Reverse; +import org.opensearch.sql.ast.tree.SPath; import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; @@ -527,6 +528,34 @@ public UnresolvedPlan visitParseCommand(OpenSearchPPLParser.ParseCommandContext return new Parse(ParseMethod.REGEX, sourceField, pattern, ImmutableMap.of()); } + @Override + public UnresolvedPlan visitSpathCommand(OpenSearchPPLParser.SpathCommandContext ctx) { + String inField = null; + String outField = null; + String path = null; + + for (OpenSearchPPLParser.SpathParameterContext param : ctx.spathParameter()) { + if (param.input != null) { + inField = param.input.getText(); + } + if (param.output != null) { + outField = param.output.getText(); + } + if (param.path != null) { + path = param.path.getText(); + } + } + + if (inField == null) { + throw new IllegalArgumentException("`input` parameter is required for `spath`"); + } + if (path == null) { + throw new IllegalArgumentException("`path` parameter is required for `spath`"); + } + + return new SPath(inField, outField, path); + } + @Override public UnresolvedPlan visitPatternsCommand(OpenSearchPPLParser.PatternsCommandContext ctx) { UnresolvedExpression sourceField = internalVisitExpression(ctx.source_field); diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java new file mode 100644 index 00000000000..a6de6e28036 --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLSpathTest.java @@ -0,0 +1,48 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.ppl.calcite; + +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.test.CalciteAssert; +import org.junit.Test; + +public class CalcitePPLSpathTest extends CalcitePPLAbstractTest { + + public CalcitePPLSpathTest() { + super(CalciteAssert.SchemaSpec.SCOTT_WITH_TEMPORAL); + } + + @Test + public void testSimpleEval() { + String ppl = "source=EMP | spath src.path input=ENAME"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4], SAL=[$5]," + + " COMM=[$6], DEPTNO=[$7], src.path=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `EMPNO`, `ENAME`, `JOB`, `MGR`, `HIREDATE`, `SAL`, `COMM`, `DEPTNO`," + + " `JSON_EXTRACT`(`ENAME`, 'src.path') `src.path`\n" + + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testEvalWithOutput() { + String ppl = "source=EMP | spath src.path input=ENAME output=custom | fields custom"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(custom=[JSON_EXTRACT($1, 'src.path':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `JSON_EXTRACT`(`ENAME`, 'src.path') `custom`\n" + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } +} diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java index 4fd566e722b..7fbcc0ad228 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java @@ -42,6 +42,7 @@ import static org.opensearch.sql.ast.dsl.AstDSL.rename; import static org.opensearch.sql.ast.dsl.AstDSL.sort; import static org.opensearch.sql.ast.dsl.AstDSL.span; +import static org.opensearch.sql.ast.dsl.AstDSL.spath; import static org.opensearch.sql.ast.dsl.AstDSL.stringLiteral; import static org.opensearch.sql.ast.dsl.AstDSL.tableFunction; import static org.opensearch.sql.ast.dsl.AstDSL.trendline; @@ -681,6 +682,51 @@ public void testParseCommand() { ImmutableMap.of())); } + @Test + public void testBasicSpathCommand() { + assertEqual( + "source=t | spath input=f path=simple.nested", + spath( + relation("t"), + "f", + null, // no output field specified + "simple.nested")); + } + + @Test + public void testSpathWithOutput() { + assertEqual( + "source=t | spath input=f output=o path=simple.nested", + spath(relation("t"), "f", "o", "simple.nested")); + } + + @Test + public void testSpathWithArrayWildcard() { + assertEqual( + "source=t | spath input=f path=array{}.nested", + spath(relation("t"), "f", null, "array{}.nested")); + } + + @Test + public void testSpathWithArrayIndex() { + assertEqual( + "source=t | spath input=f path=array{1}.nested", + spath(relation("t"), "f", null, "array{1}.nested")); + } + + @Test + public void testSpathWithMultipleArrays() { + assertEqual( + "source=t | spath input=f path=outer{}.middle{2}.inner", + spath(relation("t"), "f", null, "outer{}.middle{2}.inner")); + } + + @Test + public void testSpathWithNoPathKeyword() { + assertEqual( + "source=t | spath input=f simple.nested", spath(relation("t"), "f", null, "simple.nested")); + } + @Test public void testKmeansCommand() { assertEqual( diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java new file mode 100644 index 00000000000..4115ee747fb --- /dev/null +++ b/ppl/src/test/java/org/opensearch/sql/ppl/utils/SPathRewriteTest.java @@ -0,0 +1,62 @@ +package org.opensearch.sql.ppl.utils; + +import static org.junit.Assert.assertEquals; +import static org.opensearch.sql.ast.dsl.AstDSL.eval; +import static org.opensearch.sql.ast.dsl.AstDSL.field; +import static org.opensearch.sql.ast.dsl.AstDSL.function; +import static org.opensearch.sql.ast.dsl.AstDSL.let; +import static org.opensearch.sql.ast.dsl.AstDSL.relation; +import static org.opensearch.sql.ast.dsl.AstDSL.spath; +import static org.opensearch.sql.ast.dsl.AstDSL.stringLiteral; + +import org.junit.Test; +import org.mockito.Mockito; +import org.opensearch.sql.ast.Node; +import org.opensearch.sql.ast.tree.Eval; +import org.opensearch.sql.ast.tree.SPath; +import org.opensearch.sql.common.setting.Settings; +import org.opensearch.sql.ppl.antlr.PPLSyntaxParser; +import org.opensearch.sql.ppl.parser.AstBuilder; + +public class SPathRewriteTest { + private final Settings settings = Mockito.mock(Settings.class); + private final PPLSyntaxParser parser = new PPLSyntaxParser(); + + private Node plan(String query) { + AstBuilder astBuilder = new AstBuilder(query, settings); + return astBuilder.visit(parser.parse(query)); + } + + // Control test to make sure something fundamental hasn't changed about the json_extract parsing + @Test + public void testEvalControl() { + assertEquals( + eval( + relation("t"), + let(field("o"), function("json_extract", field("f"), stringLiteral("simple.nested")))), + plan("source = t | eval o=json_extract(f, \"simple.nested\")")); + } + + @Test + public void testSpathSimpleRewrite() { + SPath sp = spath(relation("t"), "f", "o", "simple.nested"); + Eval ev = (Eval) plan("source = t | eval o=json_extract(f, \"simple.nested\")"); + + assertEquals(ev, sp.rewriteAsEval()); + } + + @Test(expected = IllegalArgumentException.class) + public void testSpathMissingInputArgumentHandling() { + plan("source = t | spath path=a output=a"); + } + + @Test(expected = IllegalArgumentException.class) + public void testSpathMissingPathArgumentHandling() { + plan("source = t | spath input=a output=a"); + } + + @Test + public void testSpathArgumentDeshuffle() { + assertEquals(plan("source = t | spath path=a input=a"), plan("source = t | spath input=a a")); + } +}