@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
1919
2020import scala .collection .immutable .HashSet
2121import scala .collection .mutable
22+ import scala .math .BigDecimal .RoundingMode
2223
2324import org .apache .spark .internal .Logging
2425import org .apache .spark .sql .catalyst .CatalystConf
@@ -90,7 +91,6 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
9091 def calculateFilterSelectivity (condition : Expression , update : Boolean = true ): Option [Double ] = {
9192 condition match {
9293 case And (cond1, cond2) =>
93- // For ease of debugging, we compute percent1 and percent2 in 2 statements.
9494 val percent1 = calculateFilterSelectivity(cond1, update)
9595 val percent2 = calculateFilterSelectivity(cond2, update)
9696 (percent1, percent2) match {
@@ -101,21 +101,23 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
101101 }
102102
103103 case Or (cond1, cond2) =>
104- // For ease of debugging, we compute percent1 and percent2 in 2 statements.
105104 val percent1 = calculateFilterSelectivity(cond1, update = false )
106105 val percent2 = calculateFilterSelectivity(cond2, update = false )
107106 (percent1, percent2) match {
108107 case (Some (p1), Some (p2)) => Some (math.min(1.0 , p1 + p2 - (p1 * p2)))
109- case (Some (p1), None ) => Some (1.0 )
110- case (None , Some (p2)) => Some (1.0 )
111- case (None , None ) => None
108+ case _ => None
112109 }
113110
114- case Not (cond) => calculateFilterSelectivity(cond, update = false ) match {
115- case Some (percent) => Some (1.0 - percent)
116- // for not-supported condition, set filter selectivity to a conservative estimate 100%
117- case None => None
118- }
111+ case Not (cond) =>
112+ if (cond.isInstanceOf [And ] || cond.isInstanceOf [Or ]) {
113+ // Don't support compound Not expression.
114+ None
115+ } else {
116+ calculateSingleCondition(cond, update = false ) match {
117+ case Some (percent) => Some (1.0 - percent)
118+ case None => None
119+ }
120+ }
119121
120122 case _ => calculateSingleCondition(condition, update)
121123 }
@@ -225,12 +227,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
225227 }
226228
227229 val percent = if (isNull) {
228- nullPercent.toDouble
230+ nullPercent
229231 } else {
230- 1.0 - nullPercent.toDouble
232+ 1.0 - nullPercent
231233 }
232234
233- Some (percent)
235+ Some (percent.toDouble )
234236 }
235237
236238 /**
@@ -249,17 +251,19 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
249251 attr : Attribute ,
250252 literal : Literal ,
251253 update : Boolean ): Option [Double ] = {
254+ if (! colStatsMap.contains(attr)) {
255+ logDebug(" [CBO] No statistics for " + attr)
256+ return None
257+ }
258+
252259 attr.dataType match {
253- case _ : NumericType | DateType | TimestampType =>
260+ case _ : NumericType | DateType | TimestampType | BooleanType =>
254261 evaluateBinaryForNumeric(op, attr, literal, update)
255262 case StringType | BinaryType =>
256263 // TODO: It is difficult to support other binary comparisons for String/Binary
257264 // type without min/max and advanced statistics like histogram.
258265 logDebug(" [CBO] No range comparison statistics for String/Binary type " + attr)
259266 None
260- case _ =>
261- // TODO: support boolean type.
262- None
263267 }
264268 }
265269
@@ -291,6 +295,10 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
291295 * Returns a percentage of rows meeting an equality (=) expression.
292296 * This method evaluates the equality predicate for all data types.
293297 *
298+ * For EqualNullSafe (<=>), if the literal is not null, result will be the same as EqualTo;
299+ * if the literal is null, the condition will be changed to IsNull after optimization.
300+ * So we don't need specific logic for EqualNullSafe here.
301+ *
294302 * @param attr an Attribute (or a column)
295303 * @param literal a literal value (or constant)
296304 * @param update a boolean flag to specify if we need to update ColumnStat of a given column
@@ -323,7 +331,7 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
323331 colStatsMap(attr) = newStats
324332 }
325333
326- Some (1.0 / ndv.toDouble)
334+ Some (( 1.0 / BigDecimal ( ndv)) .toDouble)
327335 } else {
328336 Some (0.0 )
329337 }
@@ -394,12 +402,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
394402
395403 // return the filter selectivity. Without advanced statistics such as histograms,
396404 // we have to assume uniform distribution.
397- Some (math.min(1.0 , newNdv.toDouble / ndv.toDouble))
405+ Some (math.min(1.0 , ( BigDecimal ( newNdv) / BigDecimal ( ndv)) .toDouble))
398406 }
399407
400408 /**
401409 * Returns a percentage of rows meeting a binary comparison expression.
402- * This method evaluate expression for Numeric columns only .
410+ * This method evaluate expression for Numeric/Date/Timestamp/Boolean columns.
403411 *
404412 * @param op a binary comparison operator uch as =, <, <=, >, >=
405413 * @param attr an Attribute (or a column)
@@ -414,53 +422,63 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
414422 literal : Literal ,
415423 update : Boolean ): Option [Double ] = {
416424
417- var percent = 1.0
418425 val colStat = colStatsMap(attr)
419- val statsRange =
420- Range (colStat.min, colStat.max, attr.dataType).asInstanceOf [NumericRange ]
426+ val statsRange = Range (colStat.min, colStat.max, attr.dataType).asInstanceOf [NumericRange ]
427+ val max = BigDecimal (statsRange.max)
428+ val min = BigDecimal (statsRange.min)
429+ val ndv = BigDecimal (colStat.distinctCount)
421430
422431 // determine the overlapping degree between predicate range and column's range
423- val literalValueBD = BigDecimal (literal.value.toString)
432+ val numericLiteral = if (literal.dataType.isInstanceOf [BooleanType ]) {
433+ if (literal.value.asInstanceOf [Boolean ]) BigDecimal (1 ) else BigDecimal (0 )
434+ } else {
435+ BigDecimal (literal.value.toString)
436+ }
424437 val (noOverlap : Boolean , completeOverlap : Boolean ) = op match {
425438 case _ : LessThan =>
426- (literalValueBD <= statsRange. min, literalValueBD > statsRange. max)
439+ (numericLiteral <= min, numericLiteral > max)
427440 case _ : LessThanOrEqual =>
428- (literalValueBD < statsRange. min, literalValueBD >= statsRange. max)
441+ (numericLiteral < min, numericLiteral >= max)
429442 case _ : GreaterThan =>
430- (literalValueBD >= statsRange. max, literalValueBD < statsRange. min)
443+ (numericLiteral >= max, numericLiteral < min)
431444 case _ : GreaterThanOrEqual =>
432- (literalValueBD > statsRange. max, literalValueBD <= statsRange. min)
445+ (numericLiteral > max, numericLiteral <= min)
433446 }
434447
448+ var percent = BigDecimal (1.0 )
435449 if (noOverlap) {
436450 percent = 0.0
437451 } else if (completeOverlap) {
438452 percent = 1.0
439453 } else {
440- // this is partial overlap case
441- val literalDouble = literalValueBD.toDouble
442- val maxDouble = BigDecimal (statsRange.max).toDouble
443- val minDouble = BigDecimal (statsRange.min).toDouble
444-
454+ // This is the partial overlap case:
445455 // Without advanced statistics like histogram, we assume uniform data distribution.
446456 // We just prorate the adjusted range over the initial range to compute filter selectivity.
447- // For ease of computation, we convert all relevant numeric values to Double.
457+ assert(max > min)
448458 percent = op match {
449459 case _ : LessThan =>
450- (literalDouble - minDouble) / (maxDouble - minDouble)
460+ if (numericLiteral == max) {
461+ 1.0 - 1.0 / ndv
462+ } else {
463+ (numericLiteral - min) / (max - min)
464+ }
451465 case _ : LessThanOrEqual =>
452- if (literalValueBD == BigDecimal (statsRange. min) ) {
453- 1.0 / colStat.distinctCount.toDouble
466+ if (numericLiteral == min) {
467+ 1.0 / ndv
454468 } else {
455- (literalDouble - minDouble ) / (maxDouble - minDouble )
469+ (numericLiteral - min ) / (max - min )
456470 }
457471 case _ : GreaterThan =>
458- (maxDouble - literalDouble) / (maxDouble - minDouble)
472+ if (numericLiteral == min) {
473+ 1.0 - 1.0 / ndv
474+ } else {
475+ (max - numericLiteral) / (max - min)
476+ }
459477 case _ : GreaterThanOrEqual =>
460- if (literalValueBD == BigDecimal (statsRange. max) ) {
461- 1.0 / colStat.distinctCount.toDouble
478+ if (numericLiteral == max) {
479+ 1.0 / ndv
462480 } else {
463- (maxDouble - literalDouble ) / (maxDouble - minDouble )
481+ (max - numericLiteral ) / (max - min )
464482 }
465483 }
466484
@@ -469,22 +487,28 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
469487 val newValue = convertBoundValue(attr.dataType, literal.value)
470488 var newMax = colStat.max
471489 var newMin = colStat.min
490+ var newNdv = (ndv * percent).setScale(0 , RoundingMode .HALF_UP ).toBigInt()
491+ if (newNdv < 1 ) newNdv = 1
492+
472493 op match {
473- case _ : GreaterThan => newMin = newValue
474- case _ : GreaterThanOrEqual => newMin = newValue
475- case _ : LessThan => newMax = newValue
476- case _ : LessThanOrEqual => newMax = newValue
494+ case _ : GreaterThan =>
495+ if (newNdv == 1 ) newMin = newMax else newMin = newValue
496+ case _ : GreaterThanOrEqual =>
497+ newMin = newValue
498+ case _ : LessThan =>
499+ if (newNdv == 1 ) newMax = newMin else newMax = newValue
500+ case _ : LessThanOrEqual =>
501+ newMax = newValue
477502 }
478503
479- val newNdv = math.max(math.round(colStat.distinctCount.toDouble * percent), 1 )
480- val newStats = colStat.copy(distinctCount = newNdv, min = newMin,
481- max = newMax, nullCount = 0 )
504+ val newStats =
505+ colStat.copy(distinctCount = newNdv, min = newMin, max = newMax, nullCount = 0 )
482506
483507 colStatsMap(attr) = newStats
484508 }
485509 }
486510
487- Some (percent)
511+ Some (percent.toDouble )
488512 }
489513
490514}
0 commit comments