Skip to content

Commit

Permalink
Add BETWEEN support to NumericalFilterOptimizer (apache#14163)
Browse files Browse the repository at this point in the history
  • Loading branch information
yashmayya authored Oct 11, 2024
1 parent 9d2cd31 commit 782e769
Show file tree
Hide file tree
Showing 2 changed files with 282 additions and 63 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,42 +24,51 @@
import org.apache.pinot.common.request.Expression;
import org.apache.pinot.common.request.ExpressionType;
import org.apache.pinot.common.request.Function;
import org.apache.pinot.common.request.Literal;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.sql.FilterKind;


/**
* Numerical expressions of form "column <operator> literal", where operator can be '=', '!=', '>', '>=', '<', or '<=',
* can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These expressions
* can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression whose LHS and
* RHS are of the same datatype.
*
* Numerical expressions of the form "column [operator] literal", where operator can be '=', '!=', '>', '>=', '<', '<=',
* or 'BETWEEN' can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These
* expressions can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression
* whose LHS and RHS are of the same datatype.
* <p>
* Simple predicate examples:
* 1) WHERE "intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
* 2) WHERE "intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
* 3) WHERE "intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5.
* 4) WHERE "intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000.
* 5) WHERE "intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to 3000000000.
* 6) WHERE "intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
* 7) WHERE "intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9.
*
* <ol>
* <li> "WHERE intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
* <li> "WHERE intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
* <li> "WHERE intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5.
* <li> "WHERE intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000.
* <li> "WHERE intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to
* 3000000000.
* <li> "WHERE intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
* <li> "WHERE intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9.
* <li> "WHERE intColumn BETWEEN 2.5 AND 7.5" gets rewritten to "WHERE intColumn BETWEEN 3 AND 7"
* <li> "WHERE intColumn BETWEEN 5.5 AND 3000000000" gets rewritten to "WHERE intColumn BETWEEN 6 AND 2147483647" since
* 3000000000 is greater than Integer.MAX_VALUE.
* <li> "WHERE intColumn BETWEEN 10 AND 0" gets rewritten to "WHERE false" because lower bound is greater than upper
* bound.
* </ol>
* <p>
* Compound predicate examples:
* 8) WHERE "intColumn1 = 5.5 AND intColumn2 = intColumn3"
* <ol>
* <li> "WHERE intColumn1 = 5.5 AND intColumn2 = intColumn3"
* rewrite to "WHERE false AND intColumn2 = intColumn3"
* rewrite to "WHERE intColumn2 = intColumn3"
* 9) WHERE "intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column)
* <li> "WHERE intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column)
* rewrite to "WHERE true OR false"
* rewrite to "WHERE true"
* rewrite to query without any WHERE clause.
*
* </ol>
* <p>
* When entire predicate gets rewritten to false (Example 3 above), the query will not return any data. Hence, it is
* better for the Broker itself to return an empty response rather than sending the query to servers for further
* evaluation.
*
* TODO: Add support for BETWEEN, IN, and NOT IN operators.
* <p>
* TODO: Add support for IN, and NOT IN operators.
*/
public class NumericalFilterOptimizer extends BaseAndOrBooleanFilterOptimizer {

Expand All @@ -74,33 +83,39 @@ boolean canBeOptimized(Expression filterExpression, @Nullable Schema schema) {
Expression optimizeChild(Expression filterExpression, @Nullable Schema schema) {
Function function = filterExpression.getFunctionCall();
FilterKind kind = FilterKind.valueOf(function.getOperator());

if (!kind.isRange() && kind != FilterKind.EQUALS && kind != FilterKind.NOT_EQUALS) {
return filterExpression;
}

List<Expression> operands = function.getOperands();
// Verify that LHS is a numeric column and RHS is a literal before rewriting.
Expression lhs = operands.get(0);
Expression rhs = operands.get(1);

DataType dataType = getDataType(lhs, schema);
if (dataType == null || !dataType.isNumeric() || !rhs.isSetLiteral()) {
// No rewrite here
return filterExpression;
}

switch (kind) {
case IS_NULL:
case IS_NOT_NULL:
// No need to try to optimize IS_NULL and IS_NOT_NULL operations on numerical columns.
break;
default:
List<Expression> operands = function.getOperands();
// Verify that LHS is a numeric column and RHS is a numeric literal before rewriting.
Expression lhs = operands.get(0);
Expression rhs = operands.get(1);
if (isNumericLiteral(rhs)) {
DataType dataType = getDataType(lhs, schema);
if (dataType != null && dataType.isNumeric()) {
switch (kind) {
case EQUALS:
case NOT_EQUALS:
return rewriteEqualsExpression(filterExpression, kind, dataType, rhs);
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case LESS_THAN_OR_EQUAL:
return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
default:
break;
}
}
case BETWEEN: {
return rewriteBetweenExpression(filterExpression, dataType);
}
case EQUALS:
case NOT_EQUALS:
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case LESS_THAN_OR_EQUAL: {
if (kind.isRange()) {
return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
} else {
return rewriteEqualsExpression(filterExpression, kind, dataType, rhs);
}
}
default:
break;
}
return filterExpression;
Expand Down Expand Up @@ -346,6 +361,159 @@ private static Expression rewriteRangeExpression(Expression range, FilterKind ki
return range;
}

/**
* Rewrite expressions of the form "column BETWEEN lower AND upper" to ensure that lower and upper bounds are the same
* datatype as the column (or can be cast to the same datatype in the server).
*/
private static Expression rewriteBetweenExpression(Expression between, DataType dataType) {
// TODO: Consider unifying logic with rewriteRangeExpression
List<Expression> operands = between.getFunctionCall().getOperands();
Expression lower = operands.get(1);
Expression upper = operands.get(2);

// The BETWEEN filter predicate currently only supports literals as lower and upper bounds, but we're still checking
// here just in case.
if (lower.isSetLiteral()) {
switch (lower.getLiteral().getSetField()) {
case LONG_VALUE: {
long actual = lower.getLiteral().getLongValue();
// Other data types can be converted on the server side.
if (dataType == DataType.INT) {
if (actual > Integer.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual < Integer.MIN_VALUE) {
lower.getLiteral().setIntValue(Integer.MIN_VALUE);
}
}
break;
}
case DOUBLE_VALUE: {
double actual = lower.getLiteral().getDoubleValue();

switch (dataType) {
case INT: {
if (actual > Integer.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual < Integer.MIN_VALUE) {
lower.getLiteral().setIntValue(Integer.MIN_VALUE);
} else {
// Double value is in int range
int converted = (int) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison >= 0) {
lower.getLiteral().setIntValue(converted);
} else {
lower.getLiteral().setIntValue(converted + 1);
}
}
break;
}
case LONG: {
if (actual > Long.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of LONG.
return getExpressionFromBoolean(false);
}
if (actual < Long.MIN_VALUE) {
lower.getLiteral().setLongValue(Long.MIN_VALUE);
} else {
// Double value is in long range
long converted = (long) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison >= 0) {
lower.getLiteral().setLongValue(converted);
} else {
lower.getLiteral().setLongValue(converted + 1);
}
}
break;
}
default:
// For other numeric data types, the double literal can be converted on the server side.
break;
}
break;
}
default:
break;
}
}

if (upper.isSetLiteral()) {
switch (upper.getLiteral().getSetField()) {
case LONG_VALUE: {
long actual = upper.getLiteral().getLongValue();
// Other data types can be converted on the server side.
if (dataType == DataType.INT) {
if (actual < Integer.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual > Integer.MAX_VALUE) {
upper.getLiteral().setIntValue(Integer.MAX_VALUE);
}
}
break;
}
case DOUBLE_VALUE: {
double actual = upper.getLiteral().getDoubleValue();

switch (dataType) {
case INT: {
if (actual < Integer.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual > Integer.MAX_VALUE) {
upper.getLiteral().setIntValue(Integer.MAX_VALUE);
} else {
// Double value is in int range
int converted = (int) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison <= 0) {
upper.getLiteral().setIntValue(converted);
} else {
upper.getLiteral().setIntValue(converted - 1);
}
}
break;
}
case LONG: {
if (actual < Long.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of LONG.
return getExpressionFromBoolean(false);
}
if (actual > Long.MAX_VALUE) {
upper.getLiteral().setLongValue(Long.MAX_VALUE);
} else {
// Double value is in long range
long converted = (long) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison <= 0) {
upper.getLiteral().setLongValue(converted);
} else {
upper.getLiteral().setLongValue(converted - 1);
}
}
break;
}
default:
// For other numeric data types, the double literal can be converted on the server side.
break;
}
break;
}
default:
break;
}
}

return between;
}

/**
* Helper function to rewrite range operator of a range expression.
* @param range Range expression.
Expand All @@ -356,9 +524,9 @@ private static void rewriteRangeOperator(Expression range, FilterKind kind, int
if (comparison > 0) {
// Literal value is greater than the converted value, so rewrite:
// "column > literal" to "column > converted"
// "column >= literal" to "column >= converted"
// "column >= literal" to "column > converted"
// "column < literal" to "column <= converted"
// "column <= literal" to "column < converted"
// "column <= literal" to "column <= converted"
if (kind == FilterKind.GREATER_THAN || kind == FilterKind.GREATER_THAN_OR_EQUAL) {
range.getFunctionCall().setOperator(FilterKind.GREATER_THAN.name());
} else if (kind == FilterKind.LESS_THAN || kind == FilterKind.LESS_THAN_OR_EQUAL) {
Expand Down Expand Up @@ -413,21 +581,4 @@ private static DataType getDataType(Expression expression, Schema schema) {
}
return null;
}

/** @return true if expression is a numeric literal; otherwise, false. */
private static boolean isNumericLiteral(Expression expression) {
if (expression.getType() == ExpressionType.LITERAL) {
Literal._Fields type = expression.getLiteral().getSetField();
switch (type) {
case INT_VALUE:
case LONG_VALUE:
case FLOAT_VALUE:
case DOUBLE_VALUE:
return true;
default:
break;
}
}
return false;
}
}
Loading

0 comments on commit 782e769

Please sign in to comment.