From 75d30cb66a8ca4b838a7f246338a3b1cc0920c5c Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 8 Oct 2025 11:25:46 -0700 Subject: [PATCH 1/8] Support Regex for replace eval function Signed-off-by: Kai Huang --- .../function/PPLBuiltinOperators.java | 2 + .../expression/function/PPLFuncImpTable.java | 2 +- .../function/udf/ReplaceFunction.java | 120 ++++++++++++++++++ .../function/udf/ReplaceFunctionTest.java | 118 +++++++++++++++++ docs/user/ppl/functions/string.rst | 46 ++++++- .../CalcitePPLStringBuiltinFunctionIT.java | 86 +++++++++++++ .../calcite/CalcitePPLStringFunctionTest.java | 47 +++++++ 7 files changed, 418 insertions(+), 3 deletions(-) create mode 100644 core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java create mode 100644 core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index f92f70f519a..2aabe259b37 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -60,6 +60,7 @@ import org.opensearch.sql.expression.function.udf.CryptographicFunction; import org.opensearch.sql.expression.function.udf.ParseFunction; import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction; +import org.opensearch.sql.expression.function.udf.ReplaceFunction; import org.opensearch.sql.expression.function.udf.RexExtractFunction; import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction; import org.opensearch.sql.expression.function.udf.RexOffsetFunction; @@ -418,6 +419,7 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { public static final SqlOperator RANGE_BUCKET = new org.opensearch.sql.expression.function.udf.binning.RangeBucketFunction() .toUDF("RANGE_BUCKET"); + public static final SqlOperator REPLACE = new ReplaceFunction().toUDF("REPLACE"); public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT"); public static final SqlOperator REX_EXTRACT_MULTI = new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index b93472a2703..d3c96435e3a 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -683,7 +683,7 @@ void populate() { registerOperator(LOWER, SqlStdOperatorTable.LOWER); registerOperator(POSITION, SqlStdOperatorTable.POSITION); registerOperator(LOCATE, SqlStdOperatorTable.POSITION); - registerOperator(REPLACE, SqlStdOperatorTable.REPLACE); + registerOperator(REPLACE, PPLBuiltinOperators.REPLACE); registerOperator(UPPER, SqlStdOperatorTable.UPPER); registerOperator(ABS, SqlStdOperatorTable.ABS); registerOperator(ACOS, SqlStdOperatorTable.ACOS); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java new file mode 100644 index 00000000000..fd932b1d9aa --- /dev/null +++ b/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java @@ -0,0 +1,120 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import org.apache.calcite.adapter.enumerable.NotNullImplementor; +import org.apache.calcite.adapter.enumerable.NullPolicy; +import org.apache.calcite.adapter.enumerable.RexToLixTranslator; +import org.apache.calcite.linq4j.tree.Expression; +import org.apache.calcite.linq4j.tree.Expressions; +import org.apache.calcite.rex.RexCall; +import org.apache.calcite.sql.type.ReturnTypes; +import org.apache.calcite.sql.type.SqlReturnTypeInference; +import org.opensearch.sql.calcite.utils.PPLOperandTypes; +import org.opensearch.sql.expression.function.ImplementorUDF; +import org.opensearch.sql.expression.function.UDFOperandMetadata; + +/** + * Custom REPLACE function for PPL that substitutes the replacement string for every occurrence of + * the regular expression in the input string. + */ +public final class ReplaceFunction extends ImplementorUDF { + + public ReplaceFunction() { + super(new ReplaceImplementor(), NullPolicy.ARG0); + } + + @Override + public SqlReturnTypeInference getReturnTypeInference() { + return ReturnTypes.VARCHAR_2000_NULLABLE; + } + + @Override + public UDFOperandMetadata getOperandMetadata() { + return PPLOperandTypes.STRING_STRING_STRING; + } + + private static class ReplaceImplementor implements NotNullImplementor { + + @Override + public Expression implement( + RexToLixTranslator translator, RexCall call, List translatedOperands) { + Expression str = translatedOperands.get(0); + Expression regex = translatedOperands.get(1); + Expression replacement = translatedOperands.get(2); + + return Expressions.call(ReplaceFunction.class, "replaceRegex", str, regex, replacement); + } + } + + public static String replaceRegex(String str, String regex, String replacement) { + if (str == null || regex == null || replacement == null) { + return null; + } + + try { + // Compile regex pattern using Java's Pattern (same as regex_match uses) + Pattern compiledPattern = Pattern.compile(regex); + Matcher matcher = compiledPattern.matcher(str); + + // Convert Perl-style \1, \2 to Java-style $1, $2 for capture group references + // This matches SPL behavior while using Java's regex engine + String javaReplacement = convertPerlReplacementToJava(replacement); + + return matcher.replaceAll(javaReplacement); + + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Error in 'replace' function: Encountered the following error while compiling the regex '" + + regex + + "': " + + e.getMessage()); + } catch (IllegalArgumentException | IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + "Error in 'replace' function: Invalid replacement string '" + + replacement + + "': " + + e.getMessage()); + } + } + + private static String convertPerlReplacementToJava(String perlReplacement) { + // Replace \1, \2, etc. with $1, $2, etc. + // We need to be careful with escaped backslashes + StringBuilder result = new StringBuilder(); + int length = perlReplacement.length(); + + for (int i = 0; i < length; i++) { + char ch = perlReplacement.charAt(i); + + if (ch == '\\' && i + 1 < length) { + char next = perlReplacement.charAt(i + 1); + + // Check if this is a group reference (\1, \2, etc.) + if (Character.isDigit(next)) { + result.append('$').append(next); + i++; // Skip the next character since we've already processed it + } else if (next == '\\') { + // This is an escaped backslash (\\), keep both backslashes + result.append("\\\\"); + i++; // Skip the next backslash + } else { + // This is a backslash followed by some other character, keep as is + result.append('\\').append(next); + i++; // Skip the next character + } + } else { + result.append(ch); + } + } + + return result.toString(); + } +} diff --git a/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java new file mode 100644 index 00000000000..9df5d6fbaec --- /dev/null +++ b/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java @@ -0,0 +1,118 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.sql.expression.function.udf; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +/** Unit tests for {@link ReplaceFunction}. */ +public class ReplaceFunctionTest { + + @Test + public void testBasicReplacementAndRegexPatterns() { + // Literal replacement + assertEquals("Mane", ReplaceFunction.replaceRegex("Jane", "J", "M")); + assertEquals("heLLo", ReplaceFunction.replaceRegex("hello", "l", "L")); + + // Regex patterns - all occurrences replaced + assertEquals("test", ReplaceFunction.replaceRegex("test123", "\\d+", "")); + assertEquals("XXX-XXX-XXXX", ReplaceFunction.replaceRegex("123-456-7890", "\\d", "X")); + assertEquals("123456", ReplaceFunction.replaceRegex("abc123xyz456", "[a-z]+", "")); + assertEquals("hello_world", ReplaceFunction.replaceRegex("hello world", " ", "_")); + } + + @Test + public void testRegexWithCaptureGroupsAndBackreferences() { + // Swap month/day in date using capture groups and backreferences + assertEquals( + "14/1/2023", + ReplaceFunction.replaceRegex("1/14/2023", "^(\\d{1,2})/(\\d{1,2})/", "\\2/\\1/")); + + // Swap words using regex groups + assertEquals( + "World Hello", ReplaceFunction.replaceRegex("Hello World", "(\\w+) (\\w+)", "\\2 \\1")); + + // Reuse same regex group multiple times + assertEquals( + "11-22-33", + ReplaceFunction.replaceRegex("1-2-3", "(\\d)-(\\d)-(\\d)", "\\1\\1-\\2\\2-\\3\\3")); + } + + @Test + public void testEdgeCases() { + // Null inputs + assertNull(ReplaceFunction.replaceRegex(null, "test", "replacement")); + assertNull(ReplaceFunction.replaceRegex("test", null, "replacement")); + assertNull(ReplaceFunction.replaceRegex("test", "pattern", null)); + + // No match returns original + assertEquals("hello", ReplaceFunction.replaceRegex("hello", "xyz", "abc")); + + // Empty strings + assertEquals("", ReplaceFunction.replaceRegex("", "test", "replacement")); + assertEquals("", ReplaceFunction.replaceRegex("12345", "\\d+", "")); + + // Invalid regex + IllegalArgumentException exception = + assertThrows( + IllegalArgumentException.class, + () -> ReplaceFunction.replaceRegex("test", "[invalid", "replacement")); + assertTrue(exception.getMessage().contains("Error in 'replace' function")); + } + + @Test + public void testRealWorldScenarios() { + // Date format conversion: MM/DD/YYYY to DD/MM/YYYY + assertEquals( + "14/01/2023", + ReplaceFunction.replaceRegex("01/14/2023", "^(\\d{2})/(\\d{2})/(\\d{4})$", "\\2/\\1/\\3")); + + // Phone number formatting + assertEquals( + "(123) 456-7890", + ReplaceFunction.replaceRegex("1234567890", "(\\d{3})(\\d{3})(\\d{4})", "(\\1) \\2-\\3")); + + // Remove HTML tags + assertEquals( + "Hello World", ReplaceFunction.replaceRegex("

Hello World

", "<[^>]+>", "")); + + // Clean special characters + assertEquals("5551234567", ReplaceFunction.replaceRegex("(555) 123-4567", "[^0-9]", "")); + } + + @Test + public void testAdvancedRegexFeatures() { + // Word boundaries + assertEquals("The dog sat", ReplaceFunction.replaceRegex("The cat sat", "\\bcat\\b", "dog")); + + // Case insensitive with (?i) flag + assertEquals("X", ReplaceFunction.replaceRegex("abc", "(?i)ABC", "X")); + + // Greedy vs non-greedy quantifiers + assertEquals("X", ReplaceFunction.replaceRegex("content", "<.*>", "X")); + assertEquals("XcontentX", ReplaceFunction.replaceRegex("content", "<.*?>", "X")); + + // Anchors + assertEquals("Xhello", ReplaceFunction.replaceRegex("###hello", "^#+", "X")); + assertEquals("helloX", ReplaceFunction.replaceRegex("hello###", "#+$", "X")); + + // Remove repeated characters + assertEquals("a1b2c3", ReplaceFunction.replaceRegex("aaa111bbb222ccc333", "(\\w)\\1+", "\\1")); + } + + @Test + public void testEscapedBackslashes() { + // Escaped backslash produces literal backslash + assertEquals("a\\b\\c", ReplaceFunction.replaceRegex("a/b/c", "/", "\\\\")); + + // \\1 is literal backslash + 1, not a capture group + assertEquals("a\\1b", ReplaceFunction.replaceRegex("axb", "x", "\\\\1")); + } +} diff --git a/docs/user/ppl/functions/string.rst b/docs/user/ppl/functions/string.rst index d0d38d8c72f..53cdb28f743 100644 --- a/docs/user/ppl/functions/string.rst +++ b/docs/user/ppl/functions/string.rst @@ -207,9 +207,15 @@ REPLACE Description >>>>>>>>>>> -Usage: replace(str, substr, newstr) returns a string with all occurrences of substr replaced by newstr in str. If any argument is NULL, the function returns NULL. +Usage: replace(str, pattern, replacement) returns a string with all occurrences of the pattern replaced by the replacement string in str. If any argument is NULL, the function returns NULL. -Example:: +**Regular Expression Support**: The pattern argument supports Java regex syntax, including: + +Argument type: STRING, STRING (regex pattern), STRING (replacement) + +Return type: STRING + +Literal String Replacement Examples:: os> source=people | eval `REPLACE('helloworld', 'world', 'universe')` = REPLACE('helloworld', 'world', 'universe'), `REPLACE('helloworld', 'invalid', 'universe')` = REPLACE('helloworld', 'invalid', 'universe') | fields `REPLACE('helloworld', 'world', 'universe')`, `REPLACE('helloworld', 'invalid', 'universe')` fetched rows / total rows = 1/1 @@ -219,6 +225,42 @@ Example:: | hellouniverse | helloworld | +--------------------------------------------+----------------------------------------------+ +Regex Pattern Examples:: + + os> source=people | eval `Remove digits` = REPLACE('test123', '\\d+', ''), `Collapse spaces` = REPLACE('hello world', ' +', ' '), `Remove special` = REPLACE('hello@world!', '[^a-zA-Z]', '') | fields `Remove digits`, `Collapse spaces`, `Remove special` + fetched rows / total rows = 1/1 + +---------------+-----------------+----------------+ + | Remove digits | Collapse spaces | Remove special | + |---------------+-----------------+----------------| + | test | hello world | helloworld | + +---------------+-----------------+----------------+ + +Capture Group and Backreference Examples:: + + os> source=people | eval `Swap date` = REPLACE('1/14/2023', '^(\\d{1,2})/(\\d{1,2})/', '\\2/\\1/'), `Reverse words` = REPLACE('Hello World', '(\\w+) (\\w+)', '\\2 \\1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '\\1') | fields `Swap date`, `Reverse words`, `Extract domain` + fetched rows / total rows = 1/1 + +-----------+---------------+----------------+ + | Swap date | Reverse words | Extract domain | + |-----------+---------------+----------------| + | 14/1/2023 | World Hello | example.com | + +-----------+---------------+----------------+ + +Advanced Regex Examples:: + + os> source=people | eval `Clean phone` = REPLACE('(555) 123-4567', '[^0-9]', ''), `Remove vowels` = REPLACE('hello world', '[aeiou]', ''), `Add prefix` = REPLACE('test', '^', 'pre_') | fields `Clean phone`, `Remove vowels`, `Add prefix` + fetched rows / total rows = 1/1 + +-------------+---------------+------------+ + | Clean phone | Remove vowels | Add prefix | + |-------------+---------------+------------| + | 5551234567 | hll wrld | pre_test | + +-------------+---------------+------------+ + +**Note**: When using regex patterns in PPL queries, backslashes must be escaped (use ``\\`` instead of ``\``). For example: + +* ``\\d`` for digit pattern (not ``\d``) +* ``\\1`` for first capture group reference (not ``\1``) +* ``\\w+`` for one or more word characters (not ``\w+``) + REVERSE ------- diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java index 33befc23a50..8beaee7a87e 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java @@ -5,6 +5,7 @@ package org.opensearch.sql.calcite.remote; +import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_ACCOUNT; import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STATE_COUNTRY; import static org.opensearch.sql.legacy.TestsConstants.TEST_INDEX_STATE_COUNTRY_WITH_NULL; import static org.opensearch.sql.util.MatcherUtils.*; @@ -24,6 +25,7 @@ public void init() throws Exception { loadIndex(Index.STATE_COUNTRY); loadIndex(Index.STATE_COUNTRY_WITH_NULL); + loadIndex(Index.ACCOUNT); // For regex replacement tests } @Test @@ -300,6 +302,90 @@ public void testReplace() throws IOException { verifyDataRows(actual, rows("Jane", 20, "heLLo")); } + @Test + public void testReplaceWithRegexPattern() throws IOException { + // Test regex pattern replacement - remove digits from address + // ACCOUNT index has addresses like "880 Holmes Lane" + JSONObject actual = + executeQuery( + String.format( + "source=%s | where account_number = 1 | eval street_only = replace(address," + + " '\\\\d+ ', '') | fields address, street_only", + TEST_INDEX_ACCOUNT)); + + verifySchema(actual, schema("address", "string"), schema("street_only", "string")); + + // Verify street number is removed: "880 Holmes Lane" -> "Holmes Lane" + verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane")); + } + + @Test + public void testReplaceWithCaptureGroups() throws IOException { + // Test capture group replacement - swap first two characters of firstname + JSONObject actual = + executeQuery( + String.format( + "source=%s | where account_number = 1 | eval swapped = replace(firstname," + + " '^(.)(.)', '\\\\2\\\\1') | fields firstname, swapped", + TEST_INDEX_ACCOUNT)); + + verifySchema(actual, schema("firstname", "string"), schema("swapped", "string")); + + // "Amber" -> "mAber" + verifyDataRows(actual, rows("Amber", "mAber")); + } + + @Test + public void testReplaceWithEmailDomainReplacement() throws IOException { + // Test replacing email domain using capture groups (similar to rex command tests) + // ACCOUNT index has emails like "amberduke@pyrami.com" + JSONObject actual = + executeQuery( + String.format( + "source=%s | where account_number = 1 | eval new_email =" + + " replace(email, '([^@]+)@(.+)', '\\\\1@newdomain.com') | fields email," + + " new_email", + TEST_INDEX_ACCOUNT)); + + verifySchema(actual, schema("email", "string"), schema("new_email", "string")); + + // Verify domain is replaced while preserving username + // "amberduke@pyrami.com" -> "amberduke@newdomain.com" + verifyDataRows(actual, rows("amberduke@pyrami.com", "amberduke@newdomain.com")); + } + + @Test + public void testReplaceWithCharacterClasses() throws IOException { + // Test replacing all letters with 'X' in address + JSONObject actual = + executeQuery( + String.format( + "source=%s | where account_number = 1 | eval masked = replace(address, '[a-zA-Z]'," + + " 'X') | fields address, masked", + TEST_INDEX_ACCOUNT)); + + verifySchema(actual, schema("address", "string"), schema("masked", "string")); + + // "880 Holmes Lane" -> "880 XXXXXX XXXX" + verifyDataRows(actual, rows("880 Holmes Lane", "880 XXXXXX XXXX")); + } + + @Test + public void testReplaceWithAnchors() throws IOException { + // Test anchors - extract street name by removing street number at start + JSONObject actual = + executeQuery( + String.format( + "source=%s | where account_number = 1 | eval street_name = replace(address," + + " '^\\\\d+\\\\s+', '') | fields address, street_name", + TEST_INDEX_ACCOUNT)); + + verifySchema(actual, schema("address", "string"), schema("street_name", "string")); + + // "880 Holmes Lane" -> "Holmes Lane" + verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane")); + } + @Test public void testLeft() throws IOException { JSONObject actual = diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java index d52d915d507..78109f89c2f 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java @@ -263,4 +263,51 @@ public void testRegexMatchWithStats() { + "WHERE REGEXP_CONTAINS(`JOB`, 'MAN')"; verifyPPLToSparkSQL(root, expectedSparkSql); } + + @Test + public void testReplaceLiteralString() { + // Test basic literal string replacement - replaces all 'A' with 'X' + String ppl = "source=EMP | eval new_name = replace(ENAME, 'A', 'X') | fields ENAME, new_name"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], new_name=[REPLACE($1, 'A', 'X')])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, REPLACE(`ENAME`, 'A', 'X') `new_name`\n" + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testReplaceWithRegexPattern() { + // Test regex pattern - remove all digits + String ppl = "source=EMP | eval no_digits = replace(JOB, '\\\\d+', '') | fields JOB, no_digits"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(JOB=[$2], no_digits=[REPLACE($2, '\\d+':VARCHAR, '':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `JOB`, REPLACE(`JOB`, '\\d+', '') `no_digits`\n" + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } + + @Test + public void testReplaceWithRegexCaptureGroups() { + // Test regex with capture groups - swap first two characters using \\1 and \\2 backreferences + String ppl = + "source=EMP | eval swapped = replace(ENAME, '^(.)(.)', '\\\\2\\\\1') | fields ENAME," + + " swapped"; + RelNode root = getRelNode(ppl); + String expectedLogical = + "LogicalProject(ENAME=[$1], swapped=[REPLACE($1, '^(.)(.)':VARCHAR, '\\2\\1':VARCHAR)])\n" + + " LogicalTableScan(table=[[scott, EMP]])\n"; + verifyLogical(root, expectedLogical); + + String expectedSparkSql = + "SELECT `ENAME`, REPLACE(`ENAME`, '^(.)(.)', '\\2\\1') `swapped`\n" + "FROM `scott`.`EMP`"; + verifyPPLToSparkSQL(root, expectedSparkSql); + } } From 16c140ae0a38f2b04df0b6287c6cba0258bfa6ef Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 8 Oct 2025 13:56:56 -0700 Subject: [PATCH 2/8] remove UDF and use REGEXP_REPLACE_3 Signed-off-by: Kai Huang --- .../function/PPLBuiltinOperators.java | 2 - .../expression/function/PPLFuncImpTable.java | 2 +- .../function/udf/ReplaceFunction.java | 120 ------------------ .../function/udf/ReplaceFunctionTest.java | 118 ----------------- docs/user/ppl/functions/string.rst | 9 +- .../calcite/CalcitePPLStringFunctionTest.java | 14 +- 6 files changed, 13 insertions(+), 252 deletions(-) delete mode 100644 core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java delete mode 100644 core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java index 2aabe259b37..f92f70f519a 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLBuiltinOperators.java @@ -60,7 +60,6 @@ import org.opensearch.sql.expression.function.udf.CryptographicFunction; import org.opensearch.sql.expression.function.udf.ParseFunction; import org.opensearch.sql.expression.function.udf.RelevanceQueryFunction; -import org.opensearch.sql.expression.function.udf.ReplaceFunction; import org.opensearch.sql.expression.function.udf.RexExtractFunction; import org.opensearch.sql.expression.function.udf.RexExtractMultiFunction; import org.opensearch.sql.expression.function.udf.RexOffsetFunction; @@ -419,7 +418,6 @@ public class PPLBuiltinOperators extends ReflectiveSqlOperatorTable { public static final SqlOperator RANGE_BUCKET = new org.opensearch.sql.expression.function.udf.binning.RangeBucketFunction() .toUDF("RANGE_BUCKET"); - public static final SqlOperator REPLACE = new ReplaceFunction().toUDF("REPLACE"); public static final SqlOperator REX_EXTRACT = new RexExtractFunction().toUDF("REX_EXTRACT"); public static final SqlOperator REX_EXTRACT_MULTI = new RexExtractMultiFunction().toUDF("REX_EXTRACT_MULTI"); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index d3c96435e3a..f691c43eaf6 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -683,7 +683,7 @@ void populate() { registerOperator(LOWER, SqlStdOperatorTable.LOWER); registerOperator(POSITION, SqlStdOperatorTable.POSITION); registerOperator(LOCATE, SqlStdOperatorTable.POSITION); - registerOperator(REPLACE, PPLBuiltinOperators.REPLACE); + registerOperator(REPLACE, SqlLibraryOperators.REGEXP_REPLACE_3); registerOperator(UPPER, SqlStdOperatorTable.UPPER); registerOperator(ABS, SqlStdOperatorTable.ABS); registerOperator(ACOS, SqlStdOperatorTable.ACOS); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java b/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java deleted file mode 100644 index fd932b1d9aa..00000000000 --- a/core/src/main/java/org/opensearch/sql/expression/function/udf/ReplaceFunction.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.function.udf; - -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; -import org.apache.calcite.adapter.enumerable.NotNullImplementor; -import org.apache.calcite.adapter.enumerable.NullPolicy; -import org.apache.calcite.adapter.enumerable.RexToLixTranslator; -import org.apache.calcite.linq4j.tree.Expression; -import org.apache.calcite.linq4j.tree.Expressions; -import org.apache.calcite.rex.RexCall; -import org.apache.calcite.sql.type.ReturnTypes; -import org.apache.calcite.sql.type.SqlReturnTypeInference; -import org.opensearch.sql.calcite.utils.PPLOperandTypes; -import org.opensearch.sql.expression.function.ImplementorUDF; -import org.opensearch.sql.expression.function.UDFOperandMetadata; - -/** - * Custom REPLACE function for PPL that substitutes the replacement string for every occurrence of - * the regular expression in the input string. - */ -public final class ReplaceFunction extends ImplementorUDF { - - public ReplaceFunction() { - super(new ReplaceImplementor(), NullPolicy.ARG0); - } - - @Override - public SqlReturnTypeInference getReturnTypeInference() { - return ReturnTypes.VARCHAR_2000_NULLABLE; - } - - @Override - public UDFOperandMetadata getOperandMetadata() { - return PPLOperandTypes.STRING_STRING_STRING; - } - - private static class ReplaceImplementor implements NotNullImplementor { - - @Override - public Expression implement( - RexToLixTranslator translator, RexCall call, List translatedOperands) { - Expression str = translatedOperands.get(0); - Expression regex = translatedOperands.get(1); - Expression replacement = translatedOperands.get(2); - - return Expressions.call(ReplaceFunction.class, "replaceRegex", str, regex, replacement); - } - } - - public static String replaceRegex(String str, String regex, String replacement) { - if (str == null || regex == null || replacement == null) { - return null; - } - - try { - // Compile regex pattern using Java's Pattern (same as regex_match uses) - Pattern compiledPattern = Pattern.compile(regex); - Matcher matcher = compiledPattern.matcher(str); - - // Convert Perl-style \1, \2 to Java-style $1, $2 for capture group references - // This matches SPL behavior while using Java's regex engine - String javaReplacement = convertPerlReplacementToJava(replacement); - - return matcher.replaceAll(javaReplacement); - - } catch (PatternSyntaxException e) { - throw new IllegalArgumentException( - "Error in 'replace' function: Encountered the following error while compiling the regex '" - + regex - + "': " - + e.getMessage()); - } catch (IllegalArgumentException | IndexOutOfBoundsException e) { - throw new IllegalArgumentException( - "Error in 'replace' function: Invalid replacement string '" - + replacement - + "': " - + e.getMessage()); - } - } - - private static String convertPerlReplacementToJava(String perlReplacement) { - // Replace \1, \2, etc. with $1, $2, etc. - // We need to be careful with escaped backslashes - StringBuilder result = new StringBuilder(); - int length = perlReplacement.length(); - - for (int i = 0; i < length; i++) { - char ch = perlReplacement.charAt(i); - - if (ch == '\\' && i + 1 < length) { - char next = perlReplacement.charAt(i + 1); - - // Check if this is a group reference (\1, \2, etc.) - if (Character.isDigit(next)) { - result.append('$').append(next); - i++; // Skip the next character since we've already processed it - } else if (next == '\\') { - // This is an escaped backslash (\\), keep both backslashes - result.append("\\\\"); - i++; // Skip the next backslash - } else { - // This is a backslash followed by some other character, keep as is - result.append('\\').append(next); - i++; // Skip the next character - } - } else { - result.append(ch); - } - } - - return result.toString(); - } -} diff --git a/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java b/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java deleted file mode 100644 index 9df5d6fbaec..00000000000 --- a/core/src/test/java/org/opensearch/sql/expression/function/udf/ReplaceFunctionTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.expression.function.udf; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Test; - -/** Unit tests for {@link ReplaceFunction}. */ -public class ReplaceFunctionTest { - - @Test - public void testBasicReplacementAndRegexPatterns() { - // Literal replacement - assertEquals("Mane", ReplaceFunction.replaceRegex("Jane", "J", "M")); - assertEquals("heLLo", ReplaceFunction.replaceRegex("hello", "l", "L")); - - // Regex patterns - all occurrences replaced - assertEquals("test", ReplaceFunction.replaceRegex("test123", "\\d+", "")); - assertEquals("XXX-XXX-XXXX", ReplaceFunction.replaceRegex("123-456-7890", "\\d", "X")); - assertEquals("123456", ReplaceFunction.replaceRegex("abc123xyz456", "[a-z]+", "")); - assertEquals("hello_world", ReplaceFunction.replaceRegex("hello world", " ", "_")); - } - - @Test - public void testRegexWithCaptureGroupsAndBackreferences() { - // Swap month/day in date using capture groups and backreferences - assertEquals( - "14/1/2023", - ReplaceFunction.replaceRegex("1/14/2023", "^(\\d{1,2})/(\\d{1,2})/", "\\2/\\1/")); - - // Swap words using regex groups - assertEquals( - "World Hello", ReplaceFunction.replaceRegex("Hello World", "(\\w+) (\\w+)", "\\2 \\1")); - - // Reuse same regex group multiple times - assertEquals( - "11-22-33", - ReplaceFunction.replaceRegex("1-2-3", "(\\d)-(\\d)-(\\d)", "\\1\\1-\\2\\2-\\3\\3")); - } - - @Test - public void testEdgeCases() { - // Null inputs - assertNull(ReplaceFunction.replaceRegex(null, "test", "replacement")); - assertNull(ReplaceFunction.replaceRegex("test", null, "replacement")); - assertNull(ReplaceFunction.replaceRegex("test", "pattern", null)); - - // No match returns original - assertEquals("hello", ReplaceFunction.replaceRegex("hello", "xyz", "abc")); - - // Empty strings - assertEquals("", ReplaceFunction.replaceRegex("", "test", "replacement")); - assertEquals("", ReplaceFunction.replaceRegex("12345", "\\d+", "")); - - // Invalid regex - IllegalArgumentException exception = - assertThrows( - IllegalArgumentException.class, - () -> ReplaceFunction.replaceRegex("test", "[invalid", "replacement")); - assertTrue(exception.getMessage().contains("Error in 'replace' function")); - } - - @Test - public void testRealWorldScenarios() { - // Date format conversion: MM/DD/YYYY to DD/MM/YYYY - assertEquals( - "14/01/2023", - ReplaceFunction.replaceRegex("01/14/2023", "^(\\d{2})/(\\d{2})/(\\d{4})$", "\\2/\\1/\\3")); - - // Phone number formatting - assertEquals( - "(123) 456-7890", - ReplaceFunction.replaceRegex("1234567890", "(\\d{3})(\\d{3})(\\d{4})", "(\\1) \\2-\\3")); - - // Remove HTML tags - assertEquals( - "Hello World", ReplaceFunction.replaceRegex("

Hello World

", "<[^>]+>", "")); - - // Clean special characters - assertEquals("5551234567", ReplaceFunction.replaceRegex("(555) 123-4567", "[^0-9]", "")); - } - - @Test - public void testAdvancedRegexFeatures() { - // Word boundaries - assertEquals("The dog sat", ReplaceFunction.replaceRegex("The cat sat", "\\bcat\\b", "dog")); - - // Case insensitive with (?i) flag - assertEquals("X", ReplaceFunction.replaceRegex("abc", "(?i)ABC", "X")); - - // Greedy vs non-greedy quantifiers - assertEquals("X", ReplaceFunction.replaceRegex("content", "<.*>", "X")); - assertEquals("XcontentX", ReplaceFunction.replaceRegex("content", "<.*?>", "X")); - - // Anchors - assertEquals("Xhello", ReplaceFunction.replaceRegex("###hello", "^#+", "X")); - assertEquals("helloX", ReplaceFunction.replaceRegex("hello###", "#+$", "X")); - - // Remove repeated characters - assertEquals("a1b2c3", ReplaceFunction.replaceRegex("aaa111bbb222ccc333", "(\\w)\\1+", "\\1")); - } - - @Test - public void testEscapedBackslashes() { - // Escaped backslash produces literal backslash - assertEquals("a\\b\\c", ReplaceFunction.replaceRegex("a/b/c", "/", "\\\\")); - - // \\1 is literal backslash + 1, not a capture group - assertEquals("a\\1b", ReplaceFunction.replaceRegex("axb", "x", "\\\\1")); - } -} diff --git a/docs/user/ppl/functions/string.rst b/docs/user/ppl/functions/string.rst index 53cdb28f743..b6fb58b54ee 100644 --- a/docs/user/ppl/functions/string.rst +++ b/docs/user/ppl/functions/string.rst @@ -237,7 +237,7 @@ Regex Pattern Examples:: Capture Group and Backreference Examples:: - os> source=people | eval `Swap date` = REPLACE('1/14/2023', '^(\\d{1,2})/(\\d{1,2})/', '\\2/\\1/'), `Reverse words` = REPLACE('Hello World', '(\\w+) (\\w+)', '\\2 \\1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '\\1') | fields `Swap date`, `Reverse words`, `Extract domain` + os> source=people | eval `Swap date` = REPLACE('1/14/2023', '^(\\d{1,2})/(\\d{1,2})/', '$2/$1/'), `Reverse words` = REPLACE('Hello World', '(\\w+) (\\w+)', '$2 $1'), `Extract domain` = REPLACE('user@example.com', '.*@(.+)', '$1') | fields `Swap date`, `Reverse words`, `Extract domain` fetched rows / total rows = 1/1 +-----------+---------------+----------------+ | Swap date | Reverse words | Extract domain | @@ -255,11 +255,10 @@ Advanced Regex Examples:: | 5551234567 | hll wrld | pre_test | +-------------+---------------+------------+ -**Note**: When using regex patterns in PPL queries, backslashes must be escaped (use ``\\`` instead of ``\``). For example: +**Note**: When using regex patterns in PPL queries: -* ``\\d`` for digit pattern (not ``\d``) -* ``\\1`` for first capture group reference (not ``\1``) -* ``\\w+`` for one or more word characters (not ``\w+``) +* Backslashes must be escaped (use ``\\`` instead of ``\``) - e.g., ``\\d`` for digit pattern, ``\\w+`` for word characters +* Backreferences use Java regex syntax: ``$1``, ``$2``, etc. (not PCRE-style ``\1``, ``\2``) REVERSE diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java index 78109f89c2f..2f3e89cdfcd 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java @@ -270,12 +270,12 @@ public void testReplaceLiteralString() { String ppl = "source=EMP | eval new_name = replace(ENAME, 'A', 'X') | fields ENAME, new_name"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[$1], new_name=[REPLACE($1, 'A', 'X')])\n" + "LogicalProject(ENAME=[$1], new_name=[REGEXP_REPLACE($1, 'A', 'X')])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, REPLACE(`ENAME`, 'A', 'X') `new_name`\n" + "FROM `scott`.`EMP`"; + "SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, 'A', 'X') `new_name`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @@ -285,12 +285,12 @@ public void testReplaceWithRegexPattern() { String ppl = "source=EMP | eval no_digits = replace(JOB, '\\\\d+', '') | fields JOB, no_digits"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(JOB=[$2], no_digits=[REPLACE($2, '\\d+':VARCHAR, '':VARCHAR)])\n" + "LogicalProject(JOB=[$2], no_digits=[REGEXP_REPLACE($2, '\\d+':VARCHAR, '':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `JOB`, REPLACE(`JOB`, '\\d+', '') `no_digits`\n" + "FROM `scott`.`EMP`"; + "SELECT `JOB`, REGEXP_REPLACE(`JOB`, '\\d+', '') `no_digits`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } @@ -302,12 +302,14 @@ public void testReplaceWithRegexCaptureGroups() { + " swapped"; RelNode root = getRelNode(ppl); String expectedLogical = - "LogicalProject(ENAME=[$1], swapped=[REPLACE($1, '^(.)(.)':VARCHAR, '\\2\\1':VARCHAR)])\n" + "LogicalProject(ENAME=[$1], swapped=[REGEXP_REPLACE($1, '^(.)(.)':VARCHAR," + + " '\\2\\1':VARCHAR)])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, REPLACE(`ENAME`, '^(.)(.)', '\\2\\1') `swapped`\n" + "FROM `scott`.`EMP`"; + "SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, '^(.)(.)', '\\2\\1') `swapped`\n" + + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } } From 4c5b55de6d61d138997613ae158ee3215fa2d106 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 8 Oct 2025 14:14:53 -0700 Subject: [PATCH 3/8] Add PCRE-to-Java backreference conversion Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRexNodeVisitor.java | 22 +++++++++++++++++++ docs/user/ppl/functions/string.rst | 2 +- .../calcite/CalcitePPLStringFunctionTest.java | 6 ++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java index c3b0d43e872..2470e4222fa 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java @@ -28,6 +28,7 @@ import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexLambdaRef; +import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.SqlIntervalQualifier; import org.apache.calcite.sql.fun.SqlStdOperatorTable; @@ -412,6 +413,27 @@ public RexNode visitFunction(Function node, CalcitePlanContext context) { } } + // Convert PCRE-style backreferences (\1, \2) to Java-style ($1, $2) for replace function + if ("replace".equalsIgnoreCase(node.getFuncName()) && arguments.size() == 3) { + RexNode replacementArg = arguments.get(2); + if (replacementArg instanceof RexLiteral) { + RexLiteral literal = (RexLiteral) replacementArg; + String replacement = literal.getValueAs(String.class); + if (replacement != null) { + // Convert sed backreferences (\1, \2) to Java style ($1, $2) + // The regex pattern matches a backslash followed by digits + String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1"); + if (!javaReplacement.equals(replacement)) { + // Preserve the original type when creating the new literal + arguments.set( + 2, + context.rexBuilder.makeLiteral( + javaReplacement, literal.getType(), literal.getTypeName() != SqlTypeName.CHAR)); + } + } + } + } + RexNode resolvedNode = PPLFuncImpTable.INSTANCE.resolve( context.rexBuilder, node.getFuncName(), arguments.toArray(new RexNode[0])); diff --git a/docs/user/ppl/functions/string.rst b/docs/user/ppl/functions/string.rst index b6fb58b54ee..24efa1434f5 100644 --- a/docs/user/ppl/functions/string.rst +++ b/docs/user/ppl/functions/string.rst @@ -258,7 +258,7 @@ Advanced Regex Examples:: **Note**: When using regex patterns in PPL queries: * Backslashes must be escaped (use ``\\`` instead of ``\``) - e.g., ``\\d`` for digit pattern, ``\\w+`` for word characters -* Backreferences use Java regex syntax: ``$1``, ``$2``, etc. (not PCRE-style ``\1``, ``\2``) +* Backreferences support both PCRE-style (``\1``, ``\2``, etc.) and Java-style (``$1``, ``$2``, etc.) syntax. PCRE-style backreferences are automatically converted to Java-style internally. REVERSE diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java index 2f3e89cdfcd..1e97052dea0 100644 --- a/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java +++ b/ppl/src/test/java/org/opensearch/sql/ppl/calcite/CalcitePPLStringFunctionTest.java @@ -296,19 +296,19 @@ public void testReplaceWithRegexPattern() { @Test public void testReplaceWithRegexCaptureGroups() { - // Test regex with capture groups - swap first two characters using \\1 and \\2 backreferences + // Test regex with capture groups - swap first two characters using \1 and \2 backreferences String ppl = "source=EMP | eval swapped = replace(ENAME, '^(.)(.)', '\\\\2\\\\1') | fields ENAME," + " swapped"; RelNode root = getRelNode(ppl); String expectedLogical = "LogicalProject(ENAME=[$1], swapped=[REGEXP_REPLACE($1, '^(.)(.)':VARCHAR," - + " '\\2\\1':VARCHAR)])\n" + + " '$2$1')])\n" + " LogicalTableScan(table=[[scott, EMP]])\n"; verifyLogical(root, expectedLogical); String expectedSparkSql = - "SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, '^(.)(.)', '\\2\\1') `swapped`\n" + "SELECT `ENAME`, REGEXP_REPLACE(`ENAME`, '^(.)(.)', '$2$1') `swapped`\n" + "FROM `scott`.`EMP`"; verifyPPLToSparkSQL(root, expectedSparkSql); } From cda7a2f7e8599e89a02b7d43aed60c41afec3228 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 8 Oct 2025 19:09:37 -0700 Subject: [PATCH 4/8] remove verbose comment Signed-off-by: Kai Huang --- .../sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java index 8beaee7a87e..b01ba59edbf 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java @@ -25,7 +25,7 @@ public void init() throws Exception { loadIndex(Index.STATE_COUNTRY); loadIndex(Index.STATE_COUNTRY_WITH_NULL); - loadIndex(Index.ACCOUNT); // For regex replacement tests + loadIndex(Index.ACCOUNT); } @Test From 89a03e074b04a25ab361a5d5b9f89fd528efc528 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 8 Oct 2025 19:10:49 -0700 Subject: [PATCH 5/8] remove more Signed-off-by: Kai Huang --- .../remote/CalcitePPLStringBuiltinFunctionIT.java | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java index b01ba59edbf..850cdbb951a 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java @@ -304,8 +304,6 @@ public void testReplace() throws IOException { @Test public void testReplaceWithRegexPattern() throws IOException { - // Test regex pattern replacement - remove digits from address - // ACCOUNT index has addresses like "880 Holmes Lane" JSONObject actual = executeQuery( String.format( @@ -315,13 +313,11 @@ public void testReplaceWithRegexPattern() throws IOException { verifySchema(actual, schema("address", "string"), schema("street_only", "string")); - // Verify street number is removed: "880 Holmes Lane" -> "Holmes Lane" verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane")); } @Test public void testReplaceWithCaptureGroups() throws IOException { - // Test capture group replacement - swap first two characters of firstname JSONObject actual = executeQuery( String.format( @@ -331,14 +327,11 @@ public void testReplaceWithCaptureGroups() throws IOException { verifySchema(actual, schema("firstname", "string"), schema("swapped", "string")); - // "Amber" -> "mAber" verifyDataRows(actual, rows("Amber", "mAber")); } @Test public void testReplaceWithEmailDomainReplacement() throws IOException { - // Test replacing email domain using capture groups (similar to rex command tests) - // ACCOUNT index has emails like "amberduke@pyrami.com" JSONObject actual = executeQuery( String.format( @@ -349,14 +342,11 @@ public void testReplaceWithEmailDomainReplacement() throws IOException { verifySchema(actual, schema("email", "string"), schema("new_email", "string")); - // Verify domain is replaced while preserving username - // "amberduke@pyrami.com" -> "amberduke@newdomain.com" verifyDataRows(actual, rows("amberduke@pyrami.com", "amberduke@newdomain.com")); } @Test public void testReplaceWithCharacterClasses() throws IOException { - // Test replacing all letters with 'X' in address JSONObject actual = executeQuery( String.format( @@ -366,13 +356,11 @@ public void testReplaceWithCharacterClasses() throws IOException { verifySchema(actual, schema("address", "string"), schema("masked", "string")); - // "880 Holmes Lane" -> "880 XXXXXX XXXX" verifyDataRows(actual, rows("880 Holmes Lane", "880 XXXXXX XXXX")); } @Test public void testReplaceWithAnchors() throws IOException { - // Test anchors - extract street name by removing street number at start JSONObject actual = executeQuery( String.format( @@ -382,7 +370,6 @@ public void testReplaceWithAnchors() throws IOException { verifySchema(actual, schema("address", "string"), schema("street_name", "string")); - // "880 Holmes Lane" -> "Holmes Lane" verifyDataRows(actual, rows("880 Holmes Lane", "Holmes Lane")); } From 7024259db47d00fa363de6ae64c6ba257e2c1306 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 10 Oct 2025 14:08:00 -0700 Subject: [PATCH 6/8] move from rexNodeVisitor to PPLFuncImpTable Signed-off-by: Kai Huang --- .../sql/calcite/CalciteRexNodeVisitor.java | 22 --------------- .../expression/function/PPLFuncImpTable.java | 27 ++++++++++++++++++- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java index 2470e4222fa..c3b0d43e872 100644 --- a/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java +++ b/core/src/main/java/org/opensearch/sql/calcite/CalciteRexNodeVisitor.java @@ -28,7 +28,6 @@ import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexCall; import org.apache.calcite.rex.RexLambdaRef; -import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.SqlIntervalQualifier; import org.apache.calcite.sql.fun.SqlStdOperatorTable; @@ -413,27 +412,6 @@ public RexNode visitFunction(Function node, CalcitePlanContext context) { } } - // Convert PCRE-style backreferences (\1, \2) to Java-style ($1, $2) for replace function - if ("replace".equalsIgnoreCase(node.getFuncName()) && arguments.size() == 3) { - RexNode replacementArg = arguments.get(2); - if (replacementArg instanceof RexLiteral) { - RexLiteral literal = (RexLiteral) replacementArg; - String replacement = literal.getValueAs(String.class); - if (replacement != null) { - // Convert sed backreferences (\1, \2) to Java style ($1, $2) - // The regex pattern matches a backslash followed by digits - String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1"); - if (!javaReplacement.equals(replacement)) { - // Preserve the original type when creating the new literal - arguments.set( - 2, - context.rexBuilder.makeLiteral( - javaReplacement, literal.getType(), literal.getTypeName() != SqlTypeName.CHAR)); - } - } - } - } - RexNode resolvedNode = PPLFuncImpTable.INSTANCE.resolve( context.rexBuilder, node.getFuncName(), arguments.toArray(new RexNode[0])); diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index f691c43eaf6..e9749d64aaf 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -245,6 +245,7 @@ import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rex.RexBuilder; import org.apache.calcite.rex.RexLambda; +import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.SqlAggFunction; import org.apache.calcite.sql.SqlOperator; @@ -683,7 +684,31 @@ void populate() { registerOperator(LOWER, SqlStdOperatorTable.LOWER); registerOperator(POSITION, SqlStdOperatorTable.POSITION); registerOperator(LOCATE, SqlStdOperatorTable.POSITION); - registerOperator(REPLACE, SqlLibraryOperators.REGEXP_REPLACE_3); + // Register REPLACE with automatic PCRE-to-Java backreference conversion + register( + REPLACE, + (RexBuilder builder, RexNode... args) -> { + if (args.length == 3 && args[2] instanceof RexLiteral) { + RexLiteral literal = (RexLiteral) args[2]; + String replacement = literal.getValueAs(String.class); + if (replacement != null) { + // Convert PCRE/sed backreferences (\1, \2) to Java style ($1, $2) + String javaReplacement = replacement.replaceAll("\\\\(\\d+)", "\\$$1"); + if (!javaReplacement.equals(replacement)) { + RexNode convertedLiteral = + builder.makeLiteral( + javaReplacement, + literal.getType(), + literal.getTypeName() != SqlTypeName.CHAR); + return builder.makeCall( + SqlLibraryOperators.REGEXP_REPLACE_3, args[0], args[1], convertedLiteral); + } + } + } + return builder.makeCall(SqlLibraryOperators.REGEXP_REPLACE_3, args); + }, + wrapSqlOperandTypeChecker( + SqlLibraryOperators.REGEXP_REPLACE_3.getOperandTypeChecker(), REPLACE.name(), false)); registerOperator(UPPER, SqlStdOperatorTable.UPPER); registerOperator(ABS, SqlStdOperatorTable.ABS); registerOperator(ACOS, SqlStdOperatorTable.ACOS); From 343367ada3ffc4c5db9da05dc87059c1f55b4182 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 10 Oct 2025 15:51:25 -0700 Subject: [PATCH 7/8] error code handling Signed-off-by: Kai Huang --- .../org/opensearch/sql/plugin/rest/RestPPLQueryAction.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java b/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java index 94cc8c2fe0f..d078c73dea3 100644 --- a/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java +++ b/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java @@ -14,6 +14,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.regex.PatternSyntaxException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.OpenSearchException; @@ -59,7 +60,9 @@ private static boolean isClientError(Exception e) { || e instanceof QueryEngineException || e instanceof SyntaxCheckException || e instanceof DataSourceClientException - || e instanceof IllegalAccessException; + || e instanceof IllegalAccessException + || e instanceof PatternSyntaxException + || (e.getCause() instanceof PatternSyntaxException); } @Override From 91e6f7fdfacd8c5b7668587474bfaa92b37c2440 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 10 Oct 2025 16:11:13 -0700 Subject: [PATCH 8/8] update error code handling Signed-off-by: Kai Huang --- .../expression/function/PPLFuncImpTable.java | 20 +++++++++ .../CalcitePPLStringBuiltinFunctionIT.java | 45 +++++++++++++++++++ .../sql/plugin/rest/RestPPLQueryAction.java | 5 +-- 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java index e9749d64aaf..8b9bdcf9058 100644 --- a/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java +++ b/core/src/main/java/org/opensearch/sql/expression/function/PPLFuncImpTable.java @@ -239,6 +239,8 @@ import java.util.Optional; import java.util.StringJoiner; import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; @@ -688,6 +690,24 @@ void populate() { register( REPLACE, (RexBuilder builder, RexNode... args) -> { + // Validate regex pattern at query planning time + if (args.length >= 2 && args[1] instanceof RexLiteral) { + RexLiteral patternLiteral = (RexLiteral) args[1]; + String pattern = patternLiteral.getValueAs(String.class); + if (pattern != null) { + try { + // Compile pattern to validate it - this will throw PatternSyntaxException if + // invalid + Pattern.compile(pattern); + } catch (PatternSyntaxException e) { + // Convert to IllegalArgumentException so it's treated as a client error (400) + throw new IllegalArgumentException( + String.format("Invalid regex pattern '%s': %s", pattern, e.getDescription()), + e); + } + } + } + if (args.length == 3 && args[2] instanceof RexLiteral) { RexLiteral literal = (RexLiteral) args[2]; String replacement = literal.getValueAs(String.class); diff --git a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java index 850cdbb951a..3e0b6cb07aa 100644 --- a/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java +++ b/integ-test/src/test/java/org/opensearch/sql/calcite/remote/CalcitePPLStringBuiltinFunctionIT.java @@ -399,6 +399,51 @@ public void testStrCmp() throws IOException { verifyDataRows(actual, rows("Jane", 20)); } + @Test + public void testReplaceWithInvalidRegexPattern() { + // Test invalid regex pattern - unclosed character class + Throwable e1 = + assertThrowsWithReplace( + Exception.class, + () -> + executeQuery( + String.format( + "source=%s | eval result = replace(firstname, '[unclosed', 'X') | fields" + + " firstname, result", + TEST_INDEX_ACCOUNT))); + verifyErrorMessageContains(e1, "Invalid regex pattern"); + verifyErrorMessageContains(e1, "Unclosed character class"); + verifyErrorMessageContains(e1, "400 Bad Request"); + + // Test invalid regex pattern - unclosed group + Throwable e2 = + assertThrowsWithReplace( + Exception.class, + () -> + executeQuery( + String.format( + "source=%s | eval result = replace(firstname, '(invalid', 'X') | fields" + + " firstname, result", + TEST_INDEX_ACCOUNT))); + verifyErrorMessageContains(e2, "Invalid regex pattern"); + verifyErrorMessageContains(e2, "Unclosed group"); + verifyErrorMessageContains(e2, "400 Bad Request"); + + // Test invalid regex pattern - dangling metacharacter + Throwable e3 = + assertThrowsWithReplace( + Exception.class, + () -> + executeQuery( + String.format( + "source=%s | eval result = replace(firstname, '?invalid', 'X') | fields" + + " firstname, result", + TEST_INDEX_ACCOUNT))); + verifyErrorMessageContains(e3, "Invalid regex pattern"); + verifyErrorMessageContains(e3, "Dangling meta character"); + verifyErrorMessageContains(e3, "400 Bad Request"); + } + private void prepareTrim() throws IOException { Request request1 = new Request("PUT", "/opensearch-sql_test_index_state_country/_doc/5?refresh=true"); diff --git a/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java b/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java index d078c73dea3..94cc8c2fe0f 100644 --- a/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java +++ b/plugin/src/main/java/org/opensearch/sql/plugin/rest/RestPPLQueryAction.java @@ -14,7 +14,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.regex.PatternSyntaxException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.OpenSearchException; @@ -60,9 +59,7 @@ private static boolean isClientError(Exception e) { || e instanceof QueryEngineException || e instanceof SyntaxCheckException || e instanceof DataSourceClientException - || e instanceof IllegalAccessException - || e instanceof PatternSyntaxException - || (e.getCause() instanceof PatternSyntaxException); + || e instanceof IllegalAccessException; } @Override