-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-15616][SQL] Hive table supports partition pruning in JoinSelection #25919
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
009944f
e744da5
12e1dc5
b334e99
17e0ba0
8d615f7
86a0d9c
ecfbe4d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,15 +21,16 @@ import java.io.IOException | |
| import java.util.Locale | ||
|
|
||
| import org.apache.hadoop.fs.{FileSystem, Path} | ||
| import org.apache.hadoop.hive.common.StatsSetupConst | ||
|
|
||
| import org.apache.spark.sql._ | ||
| import org.apache.spark.sql.catalyst.catalog._ | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.planning._ | ||
| import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoStatement, LogicalPlan, ScriptTransformation, Statistics} | ||
| import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoDir, InsertIntoStatement, LogicalPlan, Project, ScriptTransformation, Statistics} | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.execution._ | ||
| import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils} | ||
| import org.apache.spark.sql.execution.command.{CommandUtils, CreateTableCommand, DDLUtils} | ||
| import org.apache.spark.sql.execution.datasources.CreateTable | ||
| import org.apache.spark.sql.hive.execution._ | ||
| import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} | ||
|
|
@@ -231,6 +232,68 @@ case class RelationConversions( | |
| } | ||
| } | ||
|
|
||
/**
 * Prune the partitions of a partitioned Hive table early, at logical-plan time, so that
 * downstream planning (e.g. JoinSelection) sees partition-level statistics instead of the
 * whole-table size.
 *
 * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source.
 */
case class PruneHiveTablePartitions(
    session: SparkSession) extends Rule[LogicalPlan] with PredicateHelper {

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case op @ PhysicalOperation(projections, predicates, relation: HiveTableRelation)
        if predicates.nonEmpty && relation.isPartitioned && relation.prunedPartitions.isEmpty =>
      // Normalize attribute names so their capitalization matches the relation's output.
      val normalizedFilters = predicates.map { e =>
        e transform {
          case a: AttributeReference =>
            a.withName(relation.output.find(_.semanticEquals(a)).get.name)
        }
      }
      val partitionSet = AttributeSet(relation.partitionCols)
      // Only predicates that (a) are deterministic, (b) reference exclusively partition
      // columns, and (c) contain no subquery can be pushed down to the metastore.
      // SPARK-24085: subqueries cannot be evaluated at planning time, so any predicate
      // containing one (not just scalar subqueries) must be skipped for partition pruning.
      val pruningPredicates = normalizedFilters.filter { predicate =>
        predicate.deterministic &&
          predicate.references.nonEmpty &&
          predicate.references.subsetOf(partitionSet) &&
          !SubqueryExpression.hasSubquery(predicate)
      }
      val conf = session.sessionState.conf
      if (conf.metastorePartitionPruning && pruningPredicates.nonEmpty) {
        val prunedPartitions = session.sharedState.externalCatalog.listPartitionsByFilter(
          relation.tableMeta.database,
          relation.tableMeta.identifier.table,
          pruningPredicates,
          conf.sessionLocalTimeZone)
        // Estimate the relation size from partition-level stats recorded in the metastore.
        // NOTE: `spark.sql.statistics.fallBackToHdfs` is documented to apply only to
        // non-partitioned tables, so we deliberately do NOT fall back to scanning HDFS
        // per partition here; partitions without usable statistics contribute 0.
        val sizeOfPartitions = prunedPartitions.map { part =>
          val rawDataSize = part.parameters.get(StatsSetupConst.RAW_DATA_SIZE).map(_.toLong)
          val totalSize = part.parameters.get(StatsSetupConst.TOTAL_SIZE).map(_.toLong)
          if (rawDataSize.isDefined && rawDataSize.get > 0) {
            rawDataSize.get
          } else if (totalSize.isDefined && totalSize.get > 0L) {
            totalSize.get
          } else {
            // We cannot get any size statistics here; use 0 as the default size to sum up.
            0L
          }
        }.sum
        // If no partition reported a size, fall back to the session's default size.
        val sizeInBytes = if (sizeOfPartitions == 0L) conf.defaultSizeInBytes else sizeOfPartitions
        val withStats = relation.tableMeta.copy(
          stats = Some(CatalogStatistics(sizeInBytes = BigInt(sizeInBytes))))
        val prunedHiveTableRelation =
          relation.copy(tableMeta = withStats, prunedPartitions = Some(prunedPartitions))
        // Keep ALL original predicates on top of the pruned relation: the metastore filter
        // is partition-level only, and non-partition predicates must still be evaluated.
        val filter = Filter(predicates.reduceLeft(And), prunedHiveTableRelation)
        Project(projections, filter)
      } else {
        op
      }
  }
}
|
|
||
| private[hive] trait HiveStrategies { | ||
| // Possibly being too clever with types here... or not clever enough. | ||
| self: SparkPlanner => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -166,14 +166,14 @@ case class HiveTableScanExec( | |
| @transient lazy val rawPartitions = { | ||
| val prunedPartitions = | ||
| if (sparkSession.sessionState.conf.metastorePartitionPruning && | ||
| partitionPruningPred.size > 0) { | ||
| partitionPruningPred.nonEmpty) { | ||
| // Retrieve the original attributes based on expression ID so that capitalization matches. | ||
| val normalizedFilters = partitionPruningPred.map(_.transform { | ||
| case a: AttributeReference => originalAttributes(a) | ||
| }) | ||
| sparkSession.sessionState.catalog.listPartitionsByFilter( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan @maropu @advancedxy |
||
| relation.tableMeta.identifier, | ||
| normalizedFilters) | ||
| relation.prunedPartitions.getOrElse( | ||
| sparkSession.sessionState.catalog.listPartitionsByFilter( | ||
| relation.tableMeta.identifier, normalizedFilters)) | ||
| } else { | ||
| sparkSession.sessionState.catalog.listPartitions(relation.tableMeta.identifier) | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It skips all subqueries instead of scalar subqueries.