84 changes: 81 additions & 3 deletions core/src/main/java/org/opensearch/sql/ast/tree/SPath.java
@@ -5,21 +5,27 @@

package org.opensearch.sql.ast.tree;

import static org.opensearch.sql.common.utils.StringUtils.unquoteIdentifier;

import com.google.common.collect.ImmutableList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import org.apache.calcite.rel.type.RelDataType;
import org.apache.calcite.tools.RelBuilder;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.opensearch.sql.ast.AbstractNodeVisitor;
import org.opensearch.sql.ast.dsl.AstDSL;
import org.opensearch.sql.calcite.CalcitePlanContext;

@ToString
@EqualsAndHashCode(callSuper = false)
@RequiredArgsConstructor
@AllArgsConstructor
public class SPath extends UnresolvedPlan {
private final char DOT = '.';
private UnresolvedPlan child;

private final String inField;
@@ -44,17 +50,89 @@ public <T, C> T accept(AbstractNodeVisitor<T, C> nodeVisitor, C context) {
return nodeVisitor.visitSpath(this, context);
}

public Eval rewriteAsEval() {
private String fullPath() {
return this.inField + DOT + this.path;
}

/**
* Determine whether a provided match string is better than the current best match available, for
* path matching.
*
* @param maybeMatch A field name that we're testing for path matching
* @param currentRecordMatch Our best field match so far. Should at least be as good as
* `this.inField`
* @return The better match between the two provided options
*/
private String preferredPathMatch(String maybeMatch, String currentRecordMatch) {
String path = this.fullPath();
// If the provided match isn't even a match, skip it
if (!path.startsWith(maybeMatch) || maybeMatch.length() <= currentRecordMatch.length()) {
return currentRecordMatch;
}
// Ensure the match is on a proper segment boundary (either dot-delimited, or exactly matches
// the path)
if (path.length() == maybeMatch.length() || path.charAt(maybeMatch.length()) == '.') {
return maybeMatch;
}
// We had a match, but it wasn't better than our current record
return currentRecordMatch;
}

/**
* We want input=outer, path=inner.data to match records like `{ "outer": { "inner": "{\"data\":
* 0}" }}`. To rewrite this as eval, that means we need to detect the longest prefix match in the
* fields (`outer.inner`) and parse `data` out of it. We need to match on segments, so
* `outer.inner` shouldn't match `outer.inner_other`.
*
* @return The field from the RelBuilder with the most overlap, or inField if none exists.
Comment on lines +82 to +87

Collaborator:
I am confused. Is the input parameter for specifying where we want to read the JSON from? This description looks like we are reading JSON which includes the outer attribute inside.

Collaborator (Author):
Input is the outermost field we want to start extracting the inner values from, so on something like { "outer": { "inner": "{\"data\": 0}" }}, input=outer means that Spath will be processing the document { "inner": "{\"data\": 0}" }. Then path=inner.data would access the value 0, and path=inner would access the value "{\"data\": 0}".

Collaborator:
Does that mean outer is a column? In that case I think we want to separate that from the JSON string.

Collaborator (Author, @Swiddis, Sep 16, 2025):
Yeah, I think in the future it makes sense to remove the input field and just navigate directly to the specified path. For this change, the intended behavior is to allow this type of mixing so you don't need to worry about where exactly the boundary is. I might cut a future PR to make input an empty string by default?

Collaborator:
I feel it is making things more complicated. In my opinion, it is simpler and easier to understand if input simply points to the column containing the JSON, and path specifies the path within the JSON.

But I am open if others feel the current approach is better.

*/
private String computePathField(RelBuilder builder) {
RelDataType rowType = builder.peek().getRowType();
List<String> rowFieldNames = rowType.getFieldNames();
String result = this.inField;

for (String name : rowFieldNames) {
result = this.preferredPathMatch(name, result);
}

return result;
}

/**
* Convert this `spath` expression to an equivalent `json_extract` eval.
*
* @param context The planning context for the rewrite, which has access to the available fields.
* @return The rewritten expression.
*/
public Eval rewriteAsEval(CalcitePlanContext context) {
String outField = this.outField;
if (outField == null) {
outField = this.path;
outField = unquoteIdentifier(this.path);
}

String pathField = computePathField(context.relBuilder);
String reducedPath = this.fullPath().substring(pathField.length());

String[] pathFieldParts = unquoteIdentifier(pathField).split("\\.");

if (reducedPath.isEmpty()) {
// Special case: We're spath-extracting a path that already exists in the data. This is just a
// rename.
return AstDSL.eval(
this.child,
AstDSL.let(AstDSL.field(outField), AstDSL.field(AstDSL.qualifiedName(pathFieldParts))));
}
// Since pathField must be on a segment line, there's a leftover leading dot if we didn't match
// the whole path.
reducedPath = reducedPath.substring(1);

return AstDSL.eval(
this.child,
AstDSL.let(
AstDSL.field(outField),
AstDSL.function(
"json_extract", AstDSL.field(inField), AstDSL.stringLiteral(this.path))));
"json_extract",
AstDSL.field(AstDSL.qualifiedName(pathFieldParts)),
AstDSL.stringLiteral(unquoteIdentifier(reducedPath)))));
}
}
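
The segment-boundary matching described in the Javadoc above can be illustrated with a small standalone sketch. This is not code from the PR; the class and field names are hypothetical, and the logic mirrors what `preferredPathMatch` and `computePathField` do:

    import java.util.List;

    public class SpathMatchSketch {
      /** Longest prefix of fullPath among rowFields, accepted only on '.'-segment boundaries. */
      static String bestMatch(String fullPath, List<String> rowFields, String fallback) {
        String best = fallback;
        for (String field : rowFields) {
          // Skip fields that aren't a prefix of the path, or that don't beat the current best
          if (!fullPath.startsWith(field) || field.length() <= best.length()) {
            continue;
          }
          // Accept only if the prefix ends exactly at the path end or at a '.' boundary
          if (fullPath.length() == field.length() || fullPath.charAt(field.length()) == '.') {
            best = field;
          }
        }
        return best;
      }

      public static void main(String[] args) {
        List<String> rowFields = List.of("outer", "outer.inner");

        // input=outer, path=inner.data -> fullPath = "outer.inner.data"
        String matched = bestMatch("outer.inner.data", rowFields, "outer");
        System.out.println(matched); // outer.inner
        // The remainder (minus the leading dot) becomes the json_extract path; empty means a rename.
        System.out.println("outer.inner.data".substring(matched.length() + 1)); // data

        // input=outer, path=inner_other.data: "outer.inner" is a string prefix of
        // "outer.inner_other.data" but not on a segment boundary, so it is rejected.
        System.out.println(bestMatch("outer.inner_other.data", rowFields, "outer")); // outer
      }
    }
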
@@ -670,7 +670,8 @@ public RelNode visitParse(Parse node, CalcitePlanContext context) {

@Override
public RelNode visitSpath(SPath node, CalcitePlanContext context) {
return visitEval(node.rewriteAsEval(), context);
visitChildren(node, context);
return visitEval(node.rewriteAsEval(context), context);
}

@Override
1 change: 1 addition & 0 deletions docs/category.json
@@ -50,6 +50,7 @@
"user/ppl/cmd/syntax.rst",
"user/ppl/cmd/timechart.rst",
"user/ppl/cmd/search.rst",
"user/ppl/cmd/spath.rst",
"user/ppl/functions/statistical.rst",
"user/ppl/cmd/top.rst",
"user/ppl/cmd/trendline.rst",
3 changes: 2 additions & 1 deletion docs/user/dql/metadata.rst
@@ -35,7 +35,7 @@ Example 1: Show All Indices Information
SQL query::

os> SHOW TABLES LIKE '%'
fetched rows / total rows = 18/18
fetched rows / total rows = 19/19
+----------------+-------------+------------------+------------+---------+----------+------------+-----------+---------------------------+----------------+
| TABLE_CAT | TABLE_SCHEM | TABLE_NAME | TABLE_TYPE | REMARKS | TYPE_CAT | TYPE_SCHEM | TYPE_NAME | SELF_REFERENCING_COL_NAME | REF_GENERATION |
|----------------+-------------+------------------+------------+---------+----------+------------+-----------+---------------------------+----------------|
@@ -52,6 +52,7 @@ SQL query::
| docTestCluster | null | otellogs | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | people | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | state_country | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | structured | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | time_test | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | weblogs | BASE TABLE | null | null | null | null | null | null |
| docTestCluster | null | wildcard | BASE TABLE | null | null | null | null | null | null |
29 changes: 24 additions & 5 deletions docs/user/ppl/cmd/spath.rst
@@ -13,6 +13,8 @@ Description
============
| The `spath` command allows extracting fields from structured text data. It currently allows selecting from JSON data with JSON paths.

If the inner data cannot be extracted (malformed data, missing keys), `"null"` is returned. `spath` always returns string values, except when the input field is `null`.

Version
=======
3.3.0
@@ -37,10 +39,10 @@ The simplest spath is to extract a single field. This extracts `n` from the `doc

PPL query::

PPL> source=test_spath | spath input=doc n;
os> source=structured | spath input=doc_n n | fields doc_n n;
fetched rows / total rows = 3/3
+----------+---+
| doc | n |
| doc_n | n |
|----------+---|
| {"n": 1} | 1 |
| {"n": 2} | 2 |
@@ -54,10 +56,10 @@ These queries demonstrate more JSON path uses, like traversing nested fields and

PPL query::

PPL> source=test_spath | spath input=doc output=first_element list{0} | spath input=doc output=all_elements list{} | spath input=doc output=nested nest_out.nest_in;
os> source=structured | spath input=doc_list output=first_element list{0} | spath input=doc_list output=all_elements list{} | spath input=doc_list output=nested nest_out.nest_in | fields doc_list first_element all_elements nested;
fetched rows / total rows = 3/3
+------------------------------------------------------+---------------+--------------+--------+
| doc | first_element | all_elements | nested |
| doc_list | first_element | all_elements | nested |
|------------------------------------------------------+---------------+--------------+--------|
| {"list": [1, 2, 3, 4], "nest_out": {"nest_in": "a"}} | 1 | [1,2,3,4] | a |
| {"list": [], "nest_out": {"nest_in": "a"}} | null | [] | a |
@@ -71,10 +73,27 @@ The example shows extracting an inner field and doing statistics on it, using th

PPL query::

PPL> source=test_spath | spath input=doc n | eval n=cast(n as int) | stats sum(n);
os> source=structured | spath input=doc_n n | eval n=cast(n as int) | stats sum(n) | fields `sum(n)`;
fetched rows / total rows = 1/1
+--------+
| sum(n) |
|--------|
| 6 |
+--------+

Example 4: Field traversal
============================

SPath can also traverse plain documents, in which case it acts similarly to renaming.

PPL query::

os> source=structured | spath input=obj_field path=field | fields obj_field field;
fetched rows / total rows = 3/3
+----------------+-------+
| obj_field | field |
|----------------+-------|
| {'field': 'a'} | a |
| {'field': 'b'} | b |
| {'field': 'c'} | c |
+----------------+-------+
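
Under the hood this case hits the rename-only branch of `rewriteAsEval`: because `obj_field.field` already exists as a column, the reduced path is empty and the rewrite emits a plain field reference instead of a `json_extract` call. Roughly, the AST it builds for this example looks like the following sketch (names taken from the example above; the real code derives the parts from the matched row field):

    AstDSL.eval(
        child,
        AstDSL.let(
            AstDSL.field("field"),
            AstDSL.field(AstDSL.qualifiedName("obj_field", "field"))));
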
3 changes: 3 additions & 0 deletions doctest/test_data/structured.json
@@ -0,0 +1,3 @@
{"doc_n":"{\"n\": 1}","doc_list":"{\"list\": [1, 2, 3, 4], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "a"}}
{"doc_n":"{\"n\": 2}","doc_list":"{\"list\": [], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "b"}}
{"doc_n":"{\"n\": 3}","doc_list":"{\"list\": [5, 6], \"nest_out\": {\"nest_in\": \"a\"}}","obj_field":{"field": "c"}}
1 change: 1 addition & 0 deletions doctest/test_docs.py
@@ -42,6 +42,7 @@
'work_information': 'work_information.json',
'events': 'events.json',
'otellogs': 'otellogs.json',
'structured': 'structured.json',
'time_test': 'time_test.json'
}

17 changes: 17 additions & 0 deletions doctest/test_mapping/structured.json
@@ -0,0 +1,17 @@
{
"mappings": {
"properties": {
"doc_n": {
"type": "text"
},
"doc_list": {
"type": "text"
},
"obj_field": {
"properties": {
"field": { "type": "text" }
}
}
}
}
}
3 changes: 2 additions & 1 deletion ppl/src/main/antlr/OpenSearchPPLParser.g4
@@ -332,7 +332,7 @@ indexablePath
;

pathElement
: ident pathArrayAccess?
: ident pathArrayAccess*
;
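// With `pathArrayAccess*` (previously `?`), a single path element may chain several
// array accesses, e.g. `matrix{0}{1}` -- assuming pathArrayAccess covers the `{...}`
// index syntax used in the spath docs above.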

pathArrayAccess
@@ -1457,6 +1457,7 @@ searchableKeyWord
| PATH
| INPUT
| OUTPUT
| FIELD

// AGGREGATIONS AND WINDOW
| statsFunctionName