Skip to content

Commit dbfd4c7

Browse files
committed
1.deal with compound Not conditions; 2.reorganize test code; 3.no need to keep column stats for empty output
1 parent 7c8d012 commit dbfd4c7

File tree

2 files changed

+244
-228
lines changed

2 files changed

+244
-228
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,19 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
5353
def estimate: Option[Statistics] = {
5454
if (childStats.rowCount.isEmpty) return None
5555

56-
// save a mutable copy of colStats so that we can later change it recursively
56+
// Save a mutable copy of colStats so that we can later change it recursively.
5757
colStatsMap.setInitValues(childStats.attributeStats)
5858

59-
// estimate selectivity of this filter predicate
60-
val filterSelectivity: Double = calculateFilterSelectivity(plan.condition) match {
61-
case Some(percent) => percent
62-
// for not-supported condition, set filter selectivity to a conservative estimate 100%
63-
case None => 1.0
64-
}
59+
// Estimate selectivity of this filter predicate, and update column stats if needed.
60+
// For not-supported condition, set filter selectivity to a conservative estimate 100%
61+
val filterSelectivity: Double = calculateFilterSelectivity(plan.condition).getOrElse(1.0)
6562

66-
val newColStats = colStatsMap.toColumnStats
63+
val newColStats = if (filterSelectivity == 0) {
64+
// The output is empty, we don't need to keep column stats.
65+
AttributeMap[ColumnStat](Nil)
66+
} else {
67+
colStatsMap.toColumnStats
68+
}
6769

6870
val filteredRowCount: BigInt =
6971
EstimationUtils.ceil(BigDecimal(childStats.rowCount.get) * filterSelectivity)
@@ -75,12 +77,14 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
7577
}
7678

7779
/**
78-
* Returns a percentage of rows meeting a compound condition in Filter node.
79-
* A compound condition is decomposed into multiple single conditions linked with AND, OR, NOT.
80+
* Returns a percentage of rows meeting a condition in Filter node.
81+
* If it's a single condition, we calculate the percentage directly.
82+
* If it's a compound condition, it is decomposed into multiple single conditions linked with
83+
* AND, OR, NOT.
8084
* For logical AND conditions, we need to update stats after a condition estimation
8185
* so that the stats will be more accurate for subsequent estimation. This is needed for
8286
* range condition such as (c > 40 AND c <= 50)
83-
* For logical OR conditions, we do not update stats after a condition estimation.
87+
* For logical OR and NOT conditions, we do not update stats after a condition estimation.
8488
*
8589
* @param condition the compound logical expression
8690
* @param update a boolean flag to specify if we need to update ColumnStat of a column
@@ -91,32 +95,42 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
9195
def calculateFilterSelectivity(condition: Expression, update: Boolean = true): Option[Double] = {
9296
condition match {
9397
case And(cond1, cond2) =>
94-
val percent1 = calculateFilterSelectivity(cond1, update)
95-
val percent2 = calculateFilterSelectivity(cond2, update)
96-
(percent1, percent2) match {
97-
case (Some(p1), Some(p2)) => Some(p1 * p2)
98-
case (Some(p1), None) => Some(p1)
99-
case (None, Some(p2)) => Some(p2)
100-
case (None, None) => None
101-
}
98+
val percent1 = calculateFilterSelectivity(cond1, update).getOrElse(1.0)
99+
val percent2 = calculateFilterSelectivity(cond2, update).getOrElse(1.0)
100+
Some(percent1 * percent2)
102101

103102
case Or(cond1, cond2) =>
104-
val percent1 = calculateFilterSelectivity(cond1, update = false)
105-
val percent2 = calculateFilterSelectivity(cond2, update = false)
106-
(percent1, percent2) match {
107-
case (Some(p1), Some(p2)) => Some(math.min(1.0, p1 + p2 - (p1 * p2)))
108-
case _ => None
103+
val percent1 = calculateFilterSelectivity(cond1, update = false).getOrElse(1.0)
104+
val percent2 = calculateFilterSelectivity(cond2, update = false).getOrElse(1.0)
105+
Some(percent1 + percent2 - (percent1 * percent2))
106+
107+
// For AND and OR conditions, we will estimate conservatively if one of two
108+
// components is not supported, e.g. suppose c1 is not supported,
109+
// then p(And(c1, c2)) = p(c2), and p(Or(c1, c2)) = 1.0.
110+
// But once they are wrapped in NOT condition, then after 1 - p, it becomes
111+
// under-estimation. So in these cases, we consider them as unsupported.
112+
case Not(And(cond1, cond2)) =>
113+
val p1 = calculateFilterSelectivity(cond1, update = false)
114+
val p2 = calculateFilterSelectivity(cond2, update = false)
115+
if (p1.isDefined && p2.isDefined) {
116+
Some(1 - p1.get * p2.get)
117+
} else {
118+
None
109119
}
110120

111-
case Not(cond) =>
112-
if (cond.isInstanceOf[And] || cond.isInstanceOf[Or]) {
113-
// Don't support compound Not expression.
114-
None
121+
case Not(Or(cond1, cond2)) =>
122+
val p1 = calculateFilterSelectivity(cond1, update = false)
123+
val p2 = calculateFilterSelectivity(cond2, update = false)
124+
if (p1.isDefined && p2.isDefined) {
125+
Some(1 - (p1.get + p2.get - (p1.get * p2.get)))
115126
} else {
116-
calculateSingleCondition(cond, update = false) match {
117-
case Some(percent) => Some(1.0 - percent)
118-
case None => None
119-
}
127+
None
128+
}
129+
130+
case Not(cond) =>
131+
calculateFilterSelectivity(cond, update = false) match {
132+
case Some(percent) => Some(1.0 - percent)
133+
case None => None
120134
}
121135

122136
case _ => calculateSingleCondition(condition, update)

0 commit comments

Comments
 (0)