diff --git a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java index 7d2336a1a030..961dfa3b71a6 100644 --- a/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java +++ b/pinot-core/src/main/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizer.java @@ -24,7 +24,6 @@ import org.apache.pinot.common.request.Expression; import org.apache.pinot.common.request.ExpressionType; import org.apache.pinot.common.request.Function; -import org.apache.pinot.common.request.Literal; import org.apache.pinot.spi.data.FieldSpec; import org.apache.pinot.spi.data.FieldSpec.DataType; import org.apache.pinot.spi.data.Schema; @@ -32,34 +31,44 @@ /** - * Numerical expressions of form "column literal", where operator can be '=', '!=', '>', '>=', '<', or '<=', - * can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These expressions - * can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression whose LHS and - * RHS are of the same datatype. - * + * Numerical expressions of the form "column [operator] literal", where operator can be '=', '!=', '>', '>=', '<', '<=', + * or 'BETWEEN' can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These + * expressions can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression + * whose LHS and RHS are of the same datatype. + *

* Simple predicate examples: - * 1) WHERE "intColumn = 5.0" gets rewritten to "WHERE intColumn = 5" - * 2) WHERE "intColumn != 5.0" gets rewritten to "WHERE intColumn != 5" - * 3) WHERE "intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5. - * 4) WHERE "intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000. - * 5) WHERE "intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to 3000000000. - * 6) WHERE "intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5" - * 7) WHERE "intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9. - * + *

    + *
  1. "WHERE intColumn = 5.0" gets rewritten to "WHERE intColumn = 5" + *
  2. "WHERE intColumn != 5.0" gets rewritten to "WHERE intColumn != 5" + *
  3. "WHERE intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5. + *
  4. "WHERE intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000. + *
  5. "WHERE intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to + * 3000000000. + *
  6. "WHERE intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5" + *
  7. "WHERE intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9. + *
  8. "WHERE intColumn BETWEEN 2.5 AND 7.5" gets rewritten to "WHERE intColumn BETWEEN 3 AND 7" + *
  9. "WHERE intColumn BETWEEN 5.5 AND 3000000000" gets rewritten to "WHERE intColumn BETWEEN 6 AND 2147483647" since + * 3000000000 is greater than Integer.MAX_VALUE. + *
  10. "WHERE intColumn BETWEEN 10 AND 0" gets rewritten to "WHERE false" because lower bound is greater than upper + * bound. + *
+ *

* Compound predicate examples: - * 8) WHERE "intColumn1 = 5.5 AND intColumn2 = intColumn3" + *

    + *
  1. "WHERE intColumn1 = 5.5 AND intColumn2 = intColumn3" * rewrite to "WHERE false AND intColumn2 = intColumn3" * rewrite to "WHERE intColumn2 = intColumn3" - * 9) WHERE "intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column) + *
  2. "WHERE intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column) * rewrite to "WHERE true OR false" * rewrite to "WHERE true" * rewrite to query without any WHERE clause. - * + *
+ *

* When entire predicate gets rewritten to false (Example 3 above), the query will not return any data. Hence, it is * better for the Broker itself to return an empty response rather than sending the query to servers for further * evaluation. - * - * TODO: Add support for BETWEEN, IN, and NOT IN operators. + *

+ * TODO: Add support for IN, and NOT IN operators. */ public class NumericalFilterOptimizer extends BaseAndOrBooleanFilterOptimizer { @@ -74,33 +83,39 @@ boolean canBeOptimized(Expression filterExpression, @Nullable Schema schema) { Expression optimizeChild(Expression filterExpression, @Nullable Schema schema) { Function function = filterExpression.getFunctionCall(); FilterKind kind = FilterKind.valueOf(function.getOperator()); + + if (!kind.isRange() && kind != FilterKind.EQUALS && kind != FilterKind.NOT_EQUALS) { + return filterExpression; + } + + List operands = function.getOperands(); + // Verify that LHS is a numeric column and RHS is a literal before rewriting. + Expression lhs = operands.get(0); + Expression rhs = operands.get(1); + + DataType dataType = getDataType(lhs, schema); + if (dataType == null || !dataType.isNumeric() || !rhs.isSetLiteral()) { + // No rewrite here + return filterExpression; + } + switch (kind) { - case IS_NULL: - case IS_NOT_NULL: - // No need to try to optimize IS_NULL and IS_NOT_NULL operations on numerical columns. - break; - default: - List operands = function.getOperands(); - // Verify that LHS is a numeric column and RHS is a numeric literal before rewriting. - Expression lhs = operands.get(0); - Expression rhs = operands.get(1); - if (isNumericLiteral(rhs)) { - DataType dataType = getDataType(lhs, schema); - if (dataType != null && dataType.isNumeric()) { - switch (kind) { - case EQUALS: - case NOT_EQUALS: - return rewriteEqualsExpression(filterExpression, kind, dataType, rhs); - case GREATER_THAN: - case GREATER_THAN_OR_EQUAL: - case LESS_THAN: - case LESS_THAN_OR_EQUAL: - return rewriteRangeExpression(filterExpression, kind, dataType, rhs); - default: - break; - } - } + case BETWEEN: { + return rewriteBetweenExpression(filterExpression, dataType); + } + case EQUALS: + case NOT_EQUALS: + case GREATER_THAN: + case GREATER_THAN_OR_EQUAL: + case LESS_THAN: + case LESS_THAN_OR_EQUAL: { + if (kind.isRange()) { + return rewriteRangeExpression(filterExpression, kind, dataType, rhs); + } else { + return rewriteEqualsExpression(filterExpression, kind, dataType, rhs); } + } + default: break; } return filterExpression; @@ -346,6 +361,159 @@ private static Expression rewriteRangeExpression(Expression range, FilterKind ki return range; } + /** + * Rewrite expressions of the form "column BETWEEN lower AND upper" to ensure that lower and upper bounds are the same + * datatype as the column (or can be cast to the same datatype in the server). + */ + private static Expression rewriteBetweenExpression(Expression between, DataType dataType) { + // TODO: Consider unifying logic with rewriteRangeExpression + List operands = between.getFunctionCall().getOperands(); + Expression lower = operands.get(1); + Expression upper = operands.get(2); + + // The BETWEEN filter predicate currently only supports literals as lower and upper bounds, but we're still checking + // here just in case. + if (lower.isSetLiteral()) { + switch (lower.getLiteral().getSetField()) { + case LONG_VALUE: { + long actual = lower.getLiteral().getLongValue(); + // Other data types can be converted on the server side. + if (dataType == DataType.INT) { + if (actual > Integer.MAX_VALUE) { + // Lower bound literal value is greater than the bounds of INT. + return getExpressionFromBoolean(false); + } + if (actual < Integer.MIN_VALUE) { + lower.getLiteral().setIntValue(Integer.MIN_VALUE); + } + } + break; + } + case DOUBLE_VALUE: { + double actual = lower.getLiteral().getDoubleValue(); + + switch (dataType) { + case INT: { + if (actual > Integer.MAX_VALUE) { + // Lower bound literal value is greater than the bounds of INT. + return getExpressionFromBoolean(false); + } + if (actual < Integer.MIN_VALUE) { + lower.getLiteral().setIntValue(Integer.MIN_VALUE); + } else { + // Double value is in int range + int converted = (int) actual; + int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual)); + if (comparison >= 0) { + lower.getLiteral().setIntValue(converted); + } else { + lower.getLiteral().setIntValue(converted + 1); + } + } + break; + } + case LONG: { + if (actual > Long.MAX_VALUE) { + // Lower bound literal value is greater than the bounds of LONG. + return getExpressionFromBoolean(false); + } + if (actual < Long.MIN_VALUE) { + lower.getLiteral().setLongValue(Long.MIN_VALUE); + } else { + // Double value is in long range + long converted = (long) actual; + int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual)); + if (comparison >= 0) { + lower.getLiteral().setLongValue(converted); + } else { + lower.getLiteral().setLongValue(converted + 1); + } + } + break; + } + default: + // For other numeric data types, the double literal can be converted on the server side. + break; + } + break; + } + default: + break; + } + } + + if (upper.isSetLiteral()) { + switch (upper.getLiteral().getSetField()) { + case LONG_VALUE: { + long actual = upper.getLiteral().getLongValue(); + // Other data types can be converted on the server side. + if (dataType == DataType.INT) { + if (actual < Integer.MIN_VALUE) { + // Upper bound literal value is lesser than the bounds of INT. + return getExpressionFromBoolean(false); + } + if (actual > Integer.MAX_VALUE) { + upper.getLiteral().setIntValue(Integer.MAX_VALUE); + } + } + break; + } + case DOUBLE_VALUE: { + double actual = upper.getLiteral().getDoubleValue(); + + switch (dataType) { + case INT: { + if (actual < Integer.MIN_VALUE) { + // Upper bound literal value is lesser than the bounds of INT. + return getExpressionFromBoolean(false); + } + if (actual > Integer.MAX_VALUE) { + upper.getLiteral().setIntValue(Integer.MAX_VALUE); + } else { + // Double value is in int range + int converted = (int) actual; + int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual)); + if (comparison <= 0) { + upper.getLiteral().setIntValue(converted); + } else { + upper.getLiteral().setIntValue(converted - 1); + } + } + break; + } + case LONG: { + if (actual < Long.MIN_VALUE) { + // Upper bound literal value is lesser than the bounds of LONG. + return getExpressionFromBoolean(false); + } + if (actual > Long.MAX_VALUE) { + upper.getLiteral().setLongValue(Long.MAX_VALUE); + } else { + // Double value is in long range + long converted = (long) actual; + int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual)); + if (comparison <= 0) { + upper.getLiteral().setLongValue(converted); + } else { + upper.getLiteral().setLongValue(converted - 1); + } + } + break; + } + default: + // For other numeric data types, the double literal can be converted on the server side. + break; + } + break; + } + default: + break; + } + } + + return between; + } + /** * Helper function to rewrite range operator of a range expression. * @param range Range expression. @@ -356,9 +524,9 @@ private static void rewriteRangeOperator(Expression range, FilterKind kind, int if (comparison > 0) { // Literal value is greater than the converted value, so rewrite: // "column > literal" to "column > converted" - // "column >= literal" to "column >= converted" + // "column >= literal" to "column > converted" // "column < literal" to "column <= converted" - // "column <= literal" to "column < converted" + // "column <= literal" to "column <= converted" if (kind == FilterKind.GREATER_THAN || kind == FilterKind.GREATER_THAN_OR_EQUAL) { range.getFunctionCall().setOperator(FilterKind.GREATER_THAN.name()); } else if (kind == FilterKind.LESS_THAN || kind == FilterKind.LESS_THAN_OR_EQUAL) { @@ -413,21 +581,4 @@ private static DataType getDataType(Expression expression, Schema schema) { } return null; } - - /** @return true if expression is a numeric literal; otherwise, false. */ - private static boolean isNumericLiteral(Expression expression) { - if (expression.getType() == ExpressionType.LITERAL) { - Literal._Fields type = expression.getLiteral().getSetField(); - switch (type) { - case INT_VALUE: - case LONG_VALUE: - case FLOAT_VALUE: - case DOUBLE_VALUE: - return true; - default: - break; - } - } - return false; - } } diff --git a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java index e947740e2423..363ef8887db2 100644 --- a/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/core/query/optimizer/filter/NumericalFilterOptimizerTest.java @@ -295,6 +295,74 @@ public void testRangeRewrites() { + ".223372036854776E18>)]))"); } + @Test + public void testBetweenRewrites() { + // Test LONG literal with INT column. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN 3000000000 AND 4000000000"), + "Expression(type:LITERAL, literal:)"); + + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN -4000000000 AND -3000000000"), + "Expression(type:LITERAL, literal:)"); + + // No rewrite + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN -2000000000 AND 2000000000"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:intColumn)), Expression(type:LITERAL, literal:), Expression(type:LITERAL, literal:)]))"); + + // Test INT column with DOUBLE lower bound greater than Integer.MAX_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN 3000000000.0 AND 4000000000.0"), + "Expression(type:LITERAL, literal:)"); + // Test INT column with DOUBLE upper bound lesser than Integer.MIN_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN -4000000000.0 AND -3000000000.0"), + "Expression(type:LITERAL, literal:)"); + // Test INT column with LONG lower bound lesser than Integer.MIN_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN -4000000000 AND 0"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:intColumn)), Expression(type:LITERAL, literal:), Expression(type:LITERAL, literal:)]))"); + // Test INT column with LONG upper bound greater than Integer.MAX_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN 0 AND 4000000000"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:intColumn)), Expression(type:LITERAL, literal:), Expression(type:LITERAL, literal:)]))"); + + // Test LONG column with DOUBLE lower bound greater than Long.MAX_VALUE. + Assert.assertEquals( + rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN 9323372036854775808.0 AND 9323372036854775809.0"), + "Expression(type:LITERAL, literal:)"); + // Test LONG column with DOUBLE upper bound lesser than Long.MIN_VALUE. + Assert.assertEquals( + rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN -9323372036854775809.0 AND -9323372036854775808.0"), + "Expression(type:LITERAL, literal:)"); + // Test LONG column with DOUBLE lower bound lesser than Long.MIN_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN -9323372036854775809.0 AND 0"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:longColumn)), Expression(type:LITERAL, literal:), Expression(type:LITERAL, literal:)]))"); + // Test LONG column with DOUBLE upper bound greater than Long.MAX_VALUE. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN 0 AND 9323372036854775808.0"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:longColumn)), Expression(type:LITERAL, literal:), " + + "Expression(type:LITERAL, literal:)]))"); + + // Test DOUBLE literal rewrite for INT and LONG columns. + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN 2.5 AND 7.5"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:intColumn)), Expression(type:LITERAL, literal:), " + + "Expression(type:LITERAL, literal:)]))"); + + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE longColumn BETWEEN 2.1 AND 7.9"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:longColumn)), Expression(type:LITERAL, literal:), " + + "Expression(type:LITERAL, literal:)]))"); + + Assert.assertEquals(rewrite("SELECT * FROM testTable WHERE intColumn BETWEEN -5.0 AND -3.5"), + "Expression(type:FUNCTION, functionCall:Function(operator:BETWEEN, operands:[Expression(type:IDENTIFIER, " + + "identifier:Identifier(name:intColumn)), Expression(type:LITERAL, literal:), " + + "Expression(type:LITERAL, literal:)]))"); + } + @Test public void testNull() { // Test column IS NOT NULL.