Skip to content

Commit d4787b8

Browse files
author
wangzhenhua
committed
fix filter estimation issues
1 parent e24f21b commit d4787b8

File tree

2 files changed

+101
-78
lines changed

2 files changed

+101
-78
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 74 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation
1919

2020
import scala.collection.immutable.HashSet
2121
import scala.collection.mutable
22+
import scala.math.BigDecimal.RoundingMode
2223

2324
import org.apache.spark.internal.Logging
2425
import org.apache.spark.sql.catalyst.CatalystConf
@@ -90,7 +91,6 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
9091
def calculateFilterSelectivity(condition: Expression, update: Boolean = true): Option[Double] = {
9192
condition match {
9293
case And(cond1, cond2) =>
93-
// For ease of debugging, we compute percent1 and percent2 in 2 statements.
9494
val percent1 = calculateFilterSelectivity(cond1, update)
9595
val percent2 = calculateFilterSelectivity(cond2, update)
9696
(percent1, percent2) match {
@@ -101,21 +101,23 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
101101
}
102102

103103
case Or(cond1, cond2) =>
104-
// For ease of debugging, we compute percent1 and percent2 in 2 statements.
105104
val percent1 = calculateFilterSelectivity(cond1, update = false)
106105
val percent2 = calculateFilterSelectivity(cond2, update = false)
107106
(percent1, percent2) match {
108107
case (Some(p1), Some(p2)) => Some(math.min(1.0, p1 + p2 - (p1 * p2)))
109-
case (Some(p1), None) => Some(1.0)
110-
case (None, Some(p2)) => Some(1.0)
111-
case (None, None) => None
108+
case _ => None
112109
}
113110

114-
case Not(cond) => calculateFilterSelectivity(cond, update = false) match {
115-
case Some(percent) => Some(1.0 - percent)
116-
// for not-supported condition, set filter selectivity to a conservative estimate 100%
117-
case None => None
118-
}
111+
case Not(cond) =>
112+
if (cond.isInstanceOf[And] || cond.isInstanceOf[Or]) {
113+
// Don't support compound Not expression.
114+
None
115+
} else {
116+
calculateSingleCondition(cond, update = false) match {
117+
case Some(percent) => Some(1.0 - percent)
118+
case None => None
119+
}
120+
}
119121

120122
case _ => calculateSingleCondition(condition, update)
121123
}
@@ -225,12 +227,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
225227
}
226228

227229
val percent = if (isNull) {
228-
nullPercent.toDouble
230+
nullPercent
229231
} else {
230-
1.0 - nullPercent.toDouble
232+
1.0 - nullPercent
231233
}
232234

233-
Some(percent)
235+
Some(percent.toDouble)
234236
}
235237

236238
/**
@@ -249,17 +251,19 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
249251
attr: Attribute,
250252
literal: Literal,
251253
update: Boolean): Option[Double] = {
254+
if (!colStatsMap.contains(attr)) {
255+
logDebug("[CBO] No statistics for " + attr)
256+
return None
257+
}
258+
252259
attr.dataType match {
253-
case _: NumericType | DateType | TimestampType =>
260+
case _: NumericType | DateType | TimestampType | BooleanType =>
254261
evaluateBinaryForNumeric(op, attr, literal, update)
255262
case StringType | BinaryType =>
256263
// TODO: It is difficult to support other binary comparisons for String/Binary
257264
// type without min/max and advanced statistics like histogram.
258265
logDebug("[CBO] No range comparison statistics for String/Binary type " + attr)
259266
None
260-
case _ =>
261-
// TODO: support boolean type.
262-
None
263267
}
264268
}
265269

@@ -291,6 +295,10 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
291295
* Returns a percentage of rows meeting an equality (=) expression.
292296
* This method evaluates the equality predicate for all data types.
293297
*
298+
* For EqualNullSafe (<=>), if the literal is not null, result will be the same as EqualTo;
299+
* if the literal is null, the condition will be changed to IsNull after optimization.
300+
* So we don't need specific logic for EqualNullSafe here.
301+
*
294302
* @param attr an Attribute (or a column)
295303
* @param literal a literal value (or constant)
296304
* @param update a boolean flag to specify if we need to update ColumnStat of a given column
@@ -323,7 +331,7 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
323331
colStatsMap(attr) = newStats
324332
}
325333

326-
Some(1.0 / ndv.toDouble)
334+
Some((1.0 / BigDecimal(ndv)).toDouble)
327335
} else {
328336
Some(0.0)
329337
}
@@ -394,12 +402,12 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
394402

395403
// return the filter selectivity. Without advanced statistics such as histograms,
396404
// we have to assume uniform distribution.
397-
Some(math.min(1.0, newNdv.toDouble / ndv.toDouble))
405+
Some(math.min(1.0, (BigDecimal(newNdv) / BigDecimal(ndv)).toDouble))
398406
}
399407

400408
/**
401409
* Returns a percentage of rows meeting a binary comparison expression.
402-
* This method evaluate expression for Numeric columns only.
410+
* This method evaluate expression for Numeric/Date/Timestamp/Boolean columns.
403411
*
404412
* @param op a binary comparison operator uch as =, <, <=, >, >=
405413
* @param attr an Attribute (or a column)
@@ -414,53 +422,63 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
414422
literal: Literal,
415423
update: Boolean): Option[Double] = {
416424

417-
var percent = 1.0
418425
val colStat = colStatsMap(attr)
419-
val statsRange =
420-
Range(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericRange]
426+
val statsRange = Range(colStat.min, colStat.max, attr.dataType).asInstanceOf[NumericRange]
427+
val max = BigDecimal(statsRange.max)
428+
val min = BigDecimal(statsRange.min)
429+
val ndv = BigDecimal(colStat.distinctCount)
421430

422431
// determine the overlapping degree between predicate range and column's range
423-
val literalValueBD = BigDecimal(literal.value.toString)
432+
val numericLiteral = if (literal.dataType.isInstanceOf[BooleanType]) {
433+
if (literal.value.asInstanceOf[Boolean]) BigDecimal(1) else BigDecimal(0)
434+
} else {
435+
BigDecimal(literal.value.toString)
436+
}
424437
val (noOverlap: Boolean, completeOverlap: Boolean) = op match {
425438
case _: LessThan =>
426-
(literalValueBD <= statsRange.min, literalValueBD > statsRange.max)
439+
(numericLiteral <= min, numericLiteral > max)
427440
case _: LessThanOrEqual =>
428-
(literalValueBD < statsRange.min, literalValueBD >= statsRange.max)
441+
(numericLiteral < min, numericLiteral >= max)
429442
case _: GreaterThan =>
430-
(literalValueBD >= statsRange.max, literalValueBD < statsRange.min)
443+
(numericLiteral >= max, numericLiteral < min)
431444
case _: GreaterThanOrEqual =>
432-
(literalValueBD > statsRange.max, literalValueBD <= statsRange.min)
445+
(numericLiteral > max, numericLiteral <= min)
433446
}
434447

448+
var percent = BigDecimal(1.0)
435449
if (noOverlap) {
436450
percent = 0.0
437451
} else if (completeOverlap) {
438452
percent = 1.0
439453
} else {
440-
// this is partial overlap case
441-
val literalDouble = literalValueBD.toDouble
442-
val maxDouble = BigDecimal(statsRange.max).toDouble
443-
val minDouble = BigDecimal(statsRange.min).toDouble
444-
454+
// This is the partial overlap case:
445455
// Without advanced statistics like histogram, we assume uniform data distribution.
446456
// We just prorate the adjusted range over the initial range to compute filter selectivity.
447-
// For ease of computation, we convert all relevant numeric values to Double.
457+
assert(max > min)
448458
percent = op match {
449459
case _: LessThan =>
450-
(literalDouble - minDouble) / (maxDouble - minDouble)
460+
if (numericLiteral == max) {
461+
1.0 - 1.0 / ndv
462+
} else {
463+
(numericLiteral - min) / (max - min)
464+
}
451465
case _: LessThanOrEqual =>
452-
if (literalValueBD == BigDecimal(statsRange.min)) {
453-
1.0 / colStat.distinctCount.toDouble
466+
if (numericLiteral == min) {
467+
1.0 / ndv
454468
} else {
455-
(literalDouble - minDouble) / (maxDouble - minDouble)
469+
(numericLiteral - min) / (max - min)
456470
}
457471
case _: GreaterThan =>
458-
(maxDouble - literalDouble) / (maxDouble - minDouble)
472+
if (numericLiteral == min) {
473+
1.0 - 1.0 / ndv
474+
} else {
475+
(max - numericLiteral) / (max - min)
476+
}
459477
case _: GreaterThanOrEqual =>
460-
if (literalValueBD == BigDecimal(statsRange.max)) {
461-
1.0 / colStat.distinctCount.toDouble
478+
if (numericLiteral == max) {
479+
1.0 / ndv
462480
} else {
463-
(maxDouble - literalDouble) / (maxDouble - minDouble)
481+
(max - numericLiteral) / (max - min)
464482
}
465483
}
466484

@@ -469,22 +487,28 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
469487
val newValue = convertBoundValue(attr.dataType, literal.value)
470488
var newMax = colStat.max
471489
var newMin = colStat.min
490+
var newNdv = (ndv * percent).setScale(0, RoundingMode.HALF_UP).toBigInt()
491+
if (newNdv < 1) newNdv = 1
492+
472493
op match {
473-
case _: GreaterThan => newMin = newValue
474-
case _: GreaterThanOrEqual => newMin = newValue
475-
case _: LessThan => newMax = newValue
476-
case _: LessThanOrEqual => newMax = newValue
494+
case _: GreaterThan =>
495+
if (newNdv == 1) newMin = newMax else newMin = newValue
496+
case _: GreaterThanOrEqual =>
497+
newMin = newValue
498+
case _: LessThan =>
499+
if (newNdv == 1) newMax = newMin else newMax = newValue
500+
case _: LessThanOrEqual =>
501+
newMax = newValue
477502
}
478503

479-
val newNdv = math.max(math.round(colStat.distinctCount.toDouble * percent), 1)
480-
val newStats = colStat.copy(distinctCount = newNdv, min = newMin,
481-
max = newMax, nullCount = 0)
504+
val newStats =
505+
colStat.copy(distinctCount = newNdv, min = newMin, max = newMax, nullCount = 0)
482506

483507
colStatsMap(attr) = newStats
484508
}
485509
}
486510

487-
Some(percent)
511+
Some(percent.toDouble)
488512
}
489513

490514
}

0 commit comments

Comments
 (0)