apache
diff --git a/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala‎
Lines changed: 74 additions & 50 deletions b/‎sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala‎
Lines changed: 74 additions & 50 deletions
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
 
 import scala.collection.immutable.HashSet
 import scala.collection.mutable
+import scala.math.BigDecimal.RoundingMode
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.CatalystConf
@@ -90,7 +91,6 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
   def calculateFilterSelectivity(condition: Expression, update: Boolean = true): Option[Double] = {
     condition match {
       case And(cond1, cond2) =>
-        // For ease of debugging, we compute percent1 and percent2 in 2 statements.
         val percent1 = calculateFilterSelectivity(cond1, update)
         val percent2 = calculateFilterSelectivity(cond2, update)
         (percent1, percent2) match {
@@ -101,21 +101,23 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
         }
 
       case Or(cond1, cond2) =>
-        // For ease of debugging, we compute percent1 and percent2 in 2 statements.
         val percent1 = calculateFilterSelectivity(cond1, update = false)
         val percent2 = calculateFilterSelectivity(cond2, update = false)
         (percent1, percent2) match {
           case (Some(p1), Some(p2)) => Some(math.min(1.0, p1 + p2 - (p1 * p2)))
-          case (Some(p1), None) => Some(1.0)
-          case (None, Some(p2)) => Some(1.0)
-          case (None, None) => None
+          case _ => None
         }
 
-      case Not(cond) => calculateFilterSelectivity(cond, update = false) match {
-        case Some(percent) => Some(1.0 - percent)
-        // for not-supported condition, set filter selectivity to a conservative estimate 100%
-        case None => None
-      }
+      case Not(cond) =>
+        if (cond.isInstanceOf[And] || cond.isInstanceOf[Or]) {
+          // Compound Not expressions (Not wrapping And/Or) are not supported.
+          None
+        } else {
+          calculateSingleCondition(cond, update = false) match {
+            case Some(percent) => Some(1.0 - percent)
+            case None => None
+          }
+        }
 
       case _ => calculateSingleCondition(condition, update)
     }
@@ -225,12 +227,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
     }
 
     val percent = if (isNull) {
-      nullPercent.toDouble
+      nullPercent
     } else {
-      1.0 - nullPercent.toDouble
+      1.0 - nullPercent
     }
 
-    Some(percent)
+    Some(percent.toDouble)
   }
 
   /**
@@ -249,17 +251,19 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
       attr: Attribute,
       literal: Literal,
       update: Boolean): Option[Double] = {
+    if (!colStatsMap.contains(attr)) {
+      logDebug("[CBO] No statistics for " + attr)
+      return None
+    }
+
     attr.dataType match {
-      case _: NumericType | DateType | TimestampType =>
+      case _: NumericType | DateType | TimestampType | BooleanType =>
         evaluateBinaryForNumeric(op, attr, literal, update)
       case StringType | BinaryType =>
         // TODO: It is difficult to support other binary comparisons for String/Binary
         // type without min/max and advanced statistics like histogram.
         logDebug("[CBO] No range comparison statistics for String/Binary type " + attr)
         None
-      case _ =>
-        // TODO: support boolean type.
-        None
     }
   }
 
@@ -291,6 +295,10 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
    * Returns a percentage of rows meeting an equality (=) expression.
    * This method evaluates the equality predicate for all data types.
    *
+   * For EqualNullSafe (<=>), if the literal is not null, result will be the same as EqualTo;
+   * if the literal is null, the condition will be changed to IsNull after optimization.
+   * So we don't need specific logic for EqualNullSafe here.
+   *
    * @param attr an Attribute (or a column)
    * @param literal a literal value (or constant)
    * @param update a boolean flag to specify if we need to update ColumnStat of a given column
@@ -323,7 +331,7 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
         colStatsMap(attr) = newStats
       }
 
-      Some(1.0 / ndv.toDouble)
+      Some((1.0 / BigDecimal(ndv)).toDouble)
     } else {
       Some(0.0)
     }
@@ -394,12 +402,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
 
     // return the filter selectivity.  Without advanced statistics such as histograms,
     // we have to assume uniform distribution.
-    Some(math.min(1.0, newNdv.toDouble / ndv.toDouble))
+    Some(math.min(1.0, (BigDecimal(newNdv) / BigDecimal(ndv)).toDouble))
   }
 
   /**
    * Returns a percentage of rows meeting a binary comparison expression.
-   * This method evaluate expression for Numeric columns only.
+   * This method evaluates expressions for Numeric/Date/Timestamp/Boolean columns.
    *
    * @param op a binary comparison operator such as =, <, <=, >, >=
    * @param attr an Attribute (or a column)
@@ -414,53 +422,63 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
       literal: Literal,
       update: Boolean): Option[Double] = {
 
-    var percent = 1.0
     val colStat = colStatsMap(attr)
-    val statsRange =
-      Range(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericRange]
+    val statsRange = Range(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericRange]
+    val max = BigDecimal(statsRange.max)
+    val min = BigDecimal(statsRange.min)
+    val ndv = BigDecimal(colStat.distinctCount)
 
     // determine the overlapping degree between predicate range and column's range
-    val literalValueBD = BigDecimal(literal.value.toString)
+    val numericLiteral = if (literal.dataType.isInstanceOf[BooleanType]) {
+      if (literal.value.asInstanceOf[Boolean]) BigDecimal(1) else BigDecimal(0)
+    } else {
+      BigDecimal(literal.value.toString)
+    }
     val (noOverlap: Boolean, completeOverlap: Boolean) = op match {
       case _: LessThan =>
-        (literalValueBD <= statsRange.min, literalValueBD > statsRange.max)
+        (numericLiteral <= min, numericLiteral > max)
       case _: LessThanOrEqual =>
-        (literalValueBD < statsRange.min, literalValueBD >= statsRange.max)
+        (numericLiteral < min, numericLiteral >= max)
       case _: GreaterThan =>
-        (literalValueBD >= statsRange.max, literalValueBD < statsRange.min)
+        (numericLiteral >= max, numericLiteral < min)
       case _: GreaterThanOrEqual =>
-        (literalValueBD > statsRange.max, literalValueBD <= statsRange.min)
+        (numericLiteral > max, numericLiteral <= min)
     }
 
+    var percent = BigDecimal(1.0)
     if (noOverlap) {
       percent = 0.0
     } else if (completeOverlap) {
       percent = 1.0
     } else {
-      // this is partial overlap case
-      val literalDouble = literalValueBD.toDouble
-      val maxDouble = BigDecimal(statsRange.max).toDouble
-      val minDouble = BigDecimal(statsRange.min).toDouble
-
+      // This is the partial overlap case:
       // Without advanced statistics like histogram, we assume uniform data distribution.
       // We just prorate the adjusted range over the initial range to compute filter selectivity.
-      // For ease of computation, we convert all relevant numeric values to Double.
+      assert(max > min)
       percent = op match {
         case _: LessThan =>
-          (literalDouble - minDouble) / (maxDouble - minDouble)
+          if (numericLiteral == max) {
+            1.0 - 1.0 / ndv
+          } else {
+            (numericLiteral - min) / (max - min)
+          }
         case _: LessThanOrEqual =>
-          if (literalValueBD == BigDecimal(statsRange.min)) {
-            1.0 / colStat.distinctCount.toDouble
+          if (numericLiteral == min) {
+            1.0 / ndv
           } else {
-            (literalDouble - minDouble) / (maxDouble - minDouble)
+            (numericLiteral - min) / (max - min)
           }
         case _: GreaterThan =>
-          (maxDouble - literalDouble) / (maxDouble - minDouble)
+          if (numericLiteral == min) {
+            1.0 - 1.0 / ndv
+          } else {
+            (max - numericLiteral) / (max - min)
+          }
         case _: GreaterThanOrEqual =>
-          if (literalValueBD == BigDecimal(statsRange.max)) {
-            1.0 / colStat.distinctCount.toDouble
+          if (numericLiteral == max) {
+            1.0 / ndv
           } else {
-            (maxDouble - literalDouble) / (maxDouble - minDouble)
+            (max - numericLiteral) / (max - min)
           }
       }
 
@@ -469,22 +487,28 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
         val newValue = convertBoundValue(attr.dataType, literal.value)
         var newMax = colStat.max
         var newMin = colStat.min
+        var newNdv = (ndv * percent).setScale(0, RoundingMode.HALF_UP).toBigInt()
+        if (newNdv < 1) newNdv = 1
+
         op match {
-          case _: GreaterThan => newMin = newValue
-          case _: GreaterThanOrEqual => newMin = newValue
-          case _: LessThan => newMax = newValue
-          case _: LessThanOrEqual => newMax = newValue
+          case _: GreaterThan =>
+            if (newNdv == 1) newMin = newMax else newMin = newValue
+          case _: GreaterThanOrEqual =>
+            newMin = newValue
+          case _: LessThan =>
+            if (newNdv == 1) newMax = newMin else newMax = newValue
+          case _: LessThanOrEqual =>
+            newMax = newValue
         }
 
-        val newNdv = math.max(math.round(colStat.distinctCount.toDouble * percent), 1)
-        val newStats = colStat.copy(distinctCount = newNdv, min = newMin,
-          max = newMax, nullCount = 0)
+        val newStats =
+          colStat.copy(distinctCount = newNdv, min = newMin, max = newMax, nullCount = 0)
 
         colStatsMap(attr) = newStats
       }
     }
 
-    Some(percent)
+    Some(percent.toDouble)
   }
 
 }