-
Notifications
You must be signed in to change notification settings - Fork 28.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-21984] [SQL] Join estimation based on equi-height histogram #19594
Changes from all commits
8b2084a
6cb9b39
e69e213
ad14a5e
2a4ee99
2637429
e1669ed
16797d2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging | |
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression} | ||
import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys | ||
import org.apache.spark.sql.catalyst.plans._ | ||
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Statistics} | ||
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, Join, Statistics} | ||
import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ | ||
|
||
|
||
|
@@ -191,8 +191,19 @@ case class JoinEstimation(join: Join) extends Logging { | |
val rInterval = ValueInterval(rightKeyStat.min, rightKeyStat.max, rightKey.dataType) | ||
if (ValueInterval.isIntersected(lInterval, rInterval)) { | ||
val (newMin, newMax) = ValueInterval.intersect(lInterval, rInterval, leftKey.dataType) | ||
val (card, joinStat) = computeByNdv(leftKey, rightKey, newMin, newMax) | ||
keyStatsAfterJoin += (leftKey -> joinStat, rightKey -> joinStat) | ||
val (card, joinStat) = (leftKeyStat.histogram, rightKeyStat.histogram) match { | ||
case (Some(l: Histogram), Some(r: Histogram)) => | ||
computeByHistogram(leftKey, rightKey, l, r, newMin, newMax) | ||
case _ => | ||
computeByNdv(leftKey, rightKey, newMin, newMax) | ||
} | ||
keyStatsAfterJoin += ( | ||
// Histograms are propagated as unchanged. During future estimation, they should be | ||
// truncated by the updated max/min. In this way, only pointers of the histograms are | ||
// propagated and thus reduce memory consumption. | ||
leftKey -> joinStat.copy(histogram = leftKeyStat.histogram), | ||
rightKey -> joinStat.copy(histogram = rightKeyStat.histogram) | ||
) | ||
// Return cardinality estimated from the most selective join keys. | ||
if (card < joinCard) joinCard = card | ||
} else { | ||
|
@@ -225,6 +236,43 @@ case class JoinEstimation(join: Join) extends Logging { | |
(ceil(card), newStats) | ||
} | ||
|
||
/**
 * Compute join cardinality using equi-height histograms.
 *
 * The two histograms are cut into overlapped ranges bounded by the intersected min/max of the
 * join keys. Each range contributes `leftNumRows * rightNumRows / max(leftNdv, rightNdv)` rows
 * and `min(leftNdv, rightNdv)` distinct values to the estimate.
 *
 * @param leftKey join key on the left side, used to look up its column stats
 * @param rightKey join key on the right side, used to look up its column stats
 * @param leftHistogram histogram of the left join key
 * @param rightHistogram histogram of the right join key
 * @param newMin lower bound of the intersected key interval; must be defined
 * @param newMax upper bound of the intersected key interval; must be defined
 * @return the estimated join cardinality and the column stat shared by both join keys
 */
private def computeByHistogram(
    leftKey: AttributeReference,
    rightKey: AttributeReference,
    leftHistogram: Histogram,
    rightHistogram: Histogram,
    newMin: Option[Any],
    newMax: Option[Any]): (BigInt, ColumnStat) = {
  // Only numeric values have equi-height histograms.
  // The bounds are assumed to be defined here. They are still passed as Option[Any] because
  // they are also used to update the column stats' min and max at the end of this method.
  val lower = newMin.get.toString.toDouble
  val upper = newMax.get.toString.toDouble
  val ranges = getOverlappedRanges(
    leftHistogram = leftHistogram,
    rightHistogram = rightHistogram,
    lowerBound = lower,
    upperBound = upper)

  // Walk the ranges in order, accumulating (estimated row count, estimated ndv).
  val (card, totalNdv) =
    ranges.indices.foldLeft((BigDecimal(0), 0.0)) { case ((rowsSoFar, ndvSoFar), idx) =>
      val cur = ranges(idx)
      // When cur.hi equals the previous range's hi, the current range holds a single value
      // that was already counted in the previous range, so its ndv must not be added again.
      val countsNewValues = idx == 0 || cur.hi != ranges(idx - 1).hi
      val ndvNow =
        if (countsNewValues) ndvSoFar + math.min(cur.leftNdv, cur.rightNdv) else ndvSoFar
      // Apply the formula in this overlapped range.
      val rowsNow =
        rowsSoFar + cur.leftNumRows * cur.rightNumRows / math.max(cur.leftNdv, cur.rightNdv)
      (rowsNow, ndvNow)
    }

  val lStat = leftStats.attributeStats(leftKey)
  val rStat = rightStats.attributeStats(rightKey)
  val joinedMaxLen = math.min(lStat.maxLen, rStat.maxLen)
  // Ideally avgLen would be the total length of matched keys / numRowsAfterJoin. For string
  // type the exact length of the matched keys is unknown (string histograms are not supported
  // yet), and for numeric types both sides should have the same avgLen, so the plain average
  // is a fair approximation.
  val joinedAvgLen = (lStat.avgLen + rStat.avgLen) / 2

  val joinedStat = ColumnStat(ceil(totalNdv), newMin, newMax, 0, joinedAvgLen, joinedMaxLen)
  (ceil(card), joinedStat)
}
|
||
/** | ||
* Propagate or update column stats for output attributes. | ||
*/ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shall we do this inside `computeByEquiHeightHistogram`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i.e. https://github.com/apache/spark/pull/19594/files#diff-6387e7aaeb7d8e0cb1457b9d0fe5cd00R272
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I put it here because `computeByEquiHeightHistogram` returns a single stats object, whereas here we keep the histogram for leftKey and rightKey respectively.