@@ -53,17 +53,19 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
5353 def estimate : Option [Statistics ] = {
5454 if (childStats.rowCount.isEmpty) return None
5555
56- // save a mutable copy of colStats so that we can later change it recursively
56+ // Save a mutable copy of colStats so that we can later change it recursively.
5757 colStatsMap.setInitValues(childStats.attributeStats)
5858
59- // estimate selectivity of this filter predicate
60- val filterSelectivity : Double = calculateFilterSelectivity(plan.condition) match {
61- case Some (percent) => percent
62- // for not-supported condition, set filter selectivity to a conservative estimate 100%
63- case None => 1.0
64- }
59+ // Estimate selectivity of this filter predicate, and update column stats if needed.
60+ // For not-supported condition, set filter selectivity to a conservative estimate 100%
61+ val filterSelectivity : Double = calculateFilterSelectivity(plan.condition).getOrElse(1.0 )
6562
66- val newColStats = colStatsMap.toColumnStats
63+ val newColStats = if (filterSelectivity == 0 ) {
64+ // The output is empty, we don't need to keep column stats.
65+ AttributeMap [ColumnStat ](Nil )
66+ } else {
67+ colStatsMap.toColumnStats
68+ }
6769
6870 val filteredRowCount : BigInt =
6971 EstimationUtils .ceil(BigDecimal (childStats.rowCount.get) * filterSelectivity)
@@ -75,12 +77,14 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
7577 }
7678
7779 /**
78- * Returns a percentage of rows meeting a compound condition in Filter node.
79- * A compound condition is decomposed into multiple single conditions linked with AND, OR, NOT.
80+ * Returns a percentage of rows meeting a condition in Filter node.
81+ * If it's a single condition, we calculate the percentage directly.
82+ * If it's a compound condition, it is decomposed into multiple single conditions linked with
83+ * AND, OR, NOT.
8084 * For logical AND conditions, we need to update stats after a condition estimation
8185 * so that the stats will be more accurate for subsequent estimation. This is needed for
8286 * range condition such as (c > 40 AND c <= 50)
83- * For logical OR conditions, we do not update stats after a condition estimation.
87+ * For logical OR and NOT conditions, we do not update stats after a condition estimation.
8488 *
8589 * @param condition the compound logical expression
8690 * @param update a boolean flag to specify if we need to update ColumnStat of a column
@@ -91,32 +95,42 @@ case class FilterEstimation(plan: Filter, catalystConf: CatalystConf) extends Lo
9195 def calculateFilterSelectivity (condition : Expression , update : Boolean = true ): Option [Double ] = {
9296 condition match {
9397 case And (cond1, cond2) =>
94- val percent1 = calculateFilterSelectivity(cond1, update)
95- val percent2 = calculateFilterSelectivity(cond2, update)
96- (percent1, percent2) match {
97- case (Some (p1), Some (p2)) => Some (p1 * p2)
98- case (Some (p1), None ) => Some (p1)
99- case (None , Some (p2)) => Some (p2)
100- case (None , None ) => None
101- }
98+ val percent1 = calculateFilterSelectivity(cond1, update).getOrElse(1.0 )
99+ val percent2 = calculateFilterSelectivity(cond2, update).getOrElse(1.0 )
100+ Some (percent1 * percent2)
102101
103102 case Or (cond1, cond2) =>
104- val percent1 = calculateFilterSelectivity(cond1, update = false )
105- val percent2 = calculateFilterSelectivity(cond2, update = false )
106- (percent1, percent2) match {
107- case (Some (p1), Some (p2)) => Some (math.min(1.0 , p1 + p2 - (p1 * p2)))
108- case _ => None
103+ val percent1 = calculateFilterSelectivity(cond1, update = false ).getOrElse(1.0 )
104+ val percent2 = calculateFilterSelectivity(cond2, update = false ).getOrElse(1.0 )
105+ Some (percent1 + percent2 - (percent1 * percent2))
106+
107+ // For AND and OR conditions, we will estimate conservatively if one of two
108+ // components is not supported, e.g. suppose c1 is not supported,
109+ // then p(And(c1, c2)) = p(c2), and p(Or(c1, c2)) = 1.0.
110+ // But once they are wrapped in NOT condition, then after 1 - p, it becomes
111+ // under-estimation. So in these cases, we consider them as unsupported.
112+ case Not (And (cond1, cond2)) =>
113+ val p1 = calculateFilterSelectivity(cond1, update = false )
114+ val p2 = calculateFilterSelectivity(cond2, update = false )
115+ if (p1.isDefined && p2.isDefined) {
116+ Some (1 - p1.get * p2.get)
117+ } else {
118+ None
109119 }
110120
111- case Not (cond) =>
112- if (cond.isInstanceOf [And ] || cond.isInstanceOf [Or ]) {
113- // Don't support compound Not expression.
114- None
121+ case Not (Or (cond1, cond2)) =>
122+ val p1 = calculateFilterSelectivity(cond1, update = false )
123+ val p2 = calculateFilterSelectivity(cond2, update = false )
124+ if (p1.isDefined && p2.isDefined) {
125+ Some (1 - (p1.get + p2.get - (p1.get * p2.get)))
115126 } else {
116- calculateSingleCondition(cond, update = false ) match {
117- case Some (percent) => Some (1.0 - percent)
118- case None => None
119- }
127+ None
128+ }
129+
130+ case Not (cond) =>
131+ calculateFilterSelectivity(cond, update = false ) match {
132+ case Some (percent) => Some (1.0 - percent)
133+ case None => None
120134 }
121135
122136 case _ => calculateSingleCondition(condition, update)
0 commit comments